Fixed #30190 -- Added JSONL serializer.
This commit is contained in:
parent
ea3beb4f5a
commit
e29637681b
1
AUTHORS
1
AUTHORS
|
@ -52,6 +52,7 @@ answer newbie questions, and generally made Django that much better:
|
||||||
Alex Robbins <alexander.j.robbins@gmail.com>
|
Alex Robbins <alexander.j.robbins@gmail.com>
|
||||||
Alexey Boriskin <alex@boriskin.me>
|
Alexey Boriskin <alex@boriskin.me>
|
||||||
Alexey Tsivunin <most-208@yandex.ru>
|
Alexey Tsivunin <most-208@yandex.ru>
|
||||||
|
Ali Vakilzade <ali@vakilzade.com>
|
||||||
Aljosa Mohorovic <aljosa.mohorovic@gmail.com>
|
Aljosa Mohorovic <aljosa.mohorovic@gmail.com>
|
||||||
Amit Chakradeo <https://amit.chakradeo.net/>
|
Amit Chakradeo <https://amit.chakradeo.net/>
|
||||||
Amit Ramon <amit.ramon@gmail.com>
|
Amit Ramon <amit.ramon@gmail.com>
|
||||||
|
|
|
@ -28,6 +28,7 @@ BUILTIN_SERIALIZERS = {
|
||||||
"python": "django.core.serializers.python",
|
"python": "django.core.serializers.python",
|
||||||
"json": "django.core.serializers.json",
|
"json": "django.core.serializers.json",
|
||||||
"yaml": "django.core.serializers.pyyaml",
|
"yaml": "django.core.serializers.pyyaml",
|
||||||
|
"jsonl": "django.core.serializers.jsonl",
|
||||||
}
|
}
|
||||||
|
|
||||||
_serializers = {}
|
_serializers = {}
|
||||||
|
|
|
@ -0,0 +1,57 @@
|
||||||
|
"""
|
||||||
|
Serialize data to/from JSON Lines
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from django.core.serializers.base import DeserializationError
|
||||||
|
from django.core.serializers.json import DjangoJSONEncoder
|
||||||
|
from django.core.serializers.python import (
|
||||||
|
Deserializer as PythonDeserializer, Serializer as PythonSerializer,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class Serializer(PythonSerializer):
    """Serialize a queryset as JSON Lines: one JSON document per line."""
    internal_use_only = False

    def _init_options(self):
        """Build ``self.json_kwargs``, the keyword arguments for ``json.dump()``."""
        self._current = None
        # 'stream', 'fields' and 'indent' are never forwarded to json.dump();
        # dropping 'indent' keeps each record on a single line.
        excluded = ('stream', 'fields', 'indent')
        dump_kwargs = {
            key: value
            for key, value in self.options.items()
            if key not in excluded
        }
        # Separators are always forced, even if the caller supplied some.
        dump_kwargs['separators'] = (',', ': ')
        # Caller-supplied 'cls' / 'ensure_ascii' win over these defaults.
        dump_kwargs.setdefault('cls', DjangoJSONEncoder)
        dump_kwargs.setdefault('ensure_ascii', False)
        self.json_kwargs = dump_kwargs

    def start_serialization(self):
        """Prepare the JSON options before the first object is written."""
        self._init_options()

    def end_object(self, obj):
        """Write ``obj`` to the stream as one newline-terminated JSON document."""
        # self._current holds the field data collected for this object.
        record = self.get_dump_object(obj)
        json.dump(record, self.stream, **self.json_kwargs)
        self.stream.write("\n")
        self._current = None

    def getvalue(self):
        """Bypass ``PythonSerializer.getvalue()`` and use its parent's version."""
        # Grandparent super
        return super(PythonSerializer, self).getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def Deserializer(stream_or_string, **options):
    """
    Deserialize a stream or string of JSON Lines data.

    ``stream_or_string`` may be ``bytes`` (decoded first), a ``str`` (split on
    newlines), or any iterable that yields one line at a time. Lines that are
    empty or contain only whitespace are skipped. ``options`` are forwarded
    unchanged to ``PythonDeserializer``.

    Yield every object ``PythonDeserializer`` produces for each line. Raise
    ``DeserializationError`` if a line cannot be parsed or converted; the
    underlying exception is chained as its cause.
    """
    if isinstance(stream_or_string, bytes):
        stream_or_string = stream_or_string.decode()
    # bytes were decoded above, so only str needs splitting here.
    if isinstance(stream_or_string, str):
        stream_or_string = stream_or_string.split("\n")

    for line in stream_or_string:
        if not line.strip():
            continue
        try:
            # Delegate with ``yield from`` instead of indexing ``list(...)[0]``:
            # if the inner deserializer yields nothing for a record, indexing
            # raised IndexError, which was then reported as a
            # DeserializationError.
            # NOTE(review): presumably this happens when unknown models are
            # skipped via ``ignorenonexistent`` -- confirm against
            # PythonDeserializer.
            yield from PythonDeserializer([json.loads(line)], **options)
        except (GeneratorExit, DeserializationError):
            raise
        except Exception as exc:
            raise DeserializationError() from exc
|
|
@ -215,7 +215,10 @@ Security
|
||||||
Serialization
|
Serialization
|
||||||
~~~~~~~~~~~~~
|
~~~~~~~~~~~~~
|
||||||
|
|
||||||
* ...
|
* The new :ref:`JSONL <serialization-formats-jsonl>` serializer allows using
|
||||||
|
the JSON Lines format with :djadmin:`dumpdata` and :djadmin:`loaddata`. This
|
||||||
|
can be useful for populating large databases because data is loaded line by
|
||||||
|
line into memory, rather than being loaded all at once.
|
||||||
|
|
||||||
Signals
|
Signals
|
||||||
~~~~~~~
|
~~~~~~~
|
||||||
|
|
|
@ -160,11 +160,14 @@ Identifier Information
|
||||||
|
|
||||||
``json`` Serializes to and from JSON_.
|
``json`` Serializes to and from JSON_.
|
||||||
|
|
||||||
|
``jsonl`` Serializes to and from JSONL_.
|
||||||
|
|
||||||
``yaml`` Serializes to YAML (YAML Ain't a Markup Language). This
|
``yaml`` Serializes to YAML (YAML Ain't a Markup Language). This
|
||||||
serializer is only available if PyYAML_ is installed.
|
serializer is only available if PyYAML_ is installed.
|
||||||
========== ==============================================================
|
========== ==============================================================
|
||||||
|
|
||||||
.. _json: https://json.org/
|
.. _json: https://json.org/
|
||||||
|
.. _jsonl: https://jsonlines.org/
|
||||||
.. _PyYAML: https://pyyaml.org/
|
.. _PyYAML: https://pyyaml.org/
|
||||||
|
|
||||||
XML
|
XML
|
||||||
|
@ -307,6 +310,24 @@ The JSON serializer uses ``DjangoJSONEncoder`` for encoding. A subclass of
|
||||||
|
|
||||||
.. _ecma-262: https://www.ecma-international.org/ecma-262/5.1/#sec-15.9.1.15
|
.. _ecma-262: https://www.ecma-international.org/ecma-262/5.1/#sec-15.9.1.15
|
||||||
|
|
||||||
|
.. _serialization-formats-jsonl:
|
||||||
|
|
||||||
|
JSONL
|
||||||
|
-----
|
||||||
|
|
||||||
|
.. versionadded:: 3.2
|
||||||
|
|
||||||
|
*JSONL* stands for *JSON Lines*. With this format, objects are separated by new
|
||||||
|
lines, and each line contains a valid JSON object. JSONL serialized data look
|
||||||
|
like this::
|
||||||
|
|
||||||
|
{ "pk": "4b678b301dfd8a4e0dad910de3ae245b", "model": "sessions.session", "fields": { ... }}
|
||||||
|
{ "pk": "88bea72c02274f3c9bf1cb2bb8cee4fc", "model": "sessions.session", "fields": { ... }}
|
||||||
|
{ "pk": "9cf0e26691b64147a67e2a9f06ad7a53", "model": "sessions.session", "fields": { ... }}
|
||||||
|
|
||||||
|
JSONL can be useful for populating large databases, since the data can be
|
||||||
|
processed line by line, rather than being loaded into memory all at once.
|
||||||
|
|
||||||
YAML
|
YAML
|
||||||
----
|
----
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,312 @@
|
||||||
|
import decimal
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
|
from django.core import serializers
|
||||||
|
from django.core.serializers.base import DeserializationError
|
||||||
|
from django.db import models
|
||||||
|
from django.test import TestCase, TransactionTestCase
|
||||||
|
from django.test.utils import isolate_apps
|
||||||
|
|
||||||
|
from .models import Score
|
||||||
|
from .tests import SerializersTestBase, SerializersTransactionTestBase
|
||||||
|
|
||||||
|
|
||||||
|
class JsonlSerializerTestCase(SerializersTestBase, TestCase):
    """Tests for the ``jsonl`` serializer, built on ``SerializersTestBase``."""
    serializer_name = "jsonl"

    # Each fixture record is spelled as adjacent literals that concatenate to
    # a single line; records are then joined with newlines.
    pkless_str = "\n".join([
        '{'
        '"pk": null,'
        '"model": "serializers.category",'
        '"fields": {"name": "Reference"}'
        '}',
        '{'
        '"model": "serializers.category",'
        '"fields": {"name": "Non-fiction"}'
        '}',
    ])

    mapping_ordering_str = (
        '{'
        '"model": "serializers.article",'
        '"pk": %(article_pk)s,'
        '"fields": {'
        '"author": %(author_pk)s,'
        '"headline": "Poker has no place on ESPN",'
        '"pub_date": "2006-06-16T11:00:00",'
        '"categories": ['
        '%(first_category_pk)s,'
        '%(second_category_pk)s'
        '],'
        '"meta_data": []'
        '}'
        '}'
        '\n'
    )

    @staticmethod
    def _validate_output(serial_str):
        """Return True if every non-empty line of ``serial_str`` parses as JSON."""
        try:
            for line in filter(None, serial_str.split("\n")):
                json.loads(line)
        except Exception:
            return False
        return True

    @staticmethod
    def _get_pk_values(serial_str):
        """Return the ``pk`` of each record in ``serial_str``, in order."""
        records = list(map(json.loads, filter(None, serial_str.split("\n"))))
        return [record['pk'] for record in records]

    @staticmethod
    def _get_field_values(serial_str, field_name):
        """Return ``field_name``'s value from every record that has that field."""
        records = list(map(json.loads, filter(None, serial_str.split("\n"))))
        values = []
        for record in records:
            fields = record['fields']
            if field_name in fields:
                values.append(fields[field_name])
        return values

    def test_no_indentation(self):
        serializer = serializers.jsonl.Serializer()
        output = serializer.serialize([Score(score=5.0), Score(score=6.0)], indent=2)
        # No line may end with a comma (optionally followed by whitespace).
        trailing_comma = re.compile(r'.+,\s*$')
        for line in output.splitlines():
            self.assertIsNone(trailing_comma.search(line))

    @isolate_apps('serializers')
    def test_custom_encoder(self):
        class ScoreDecimal(models.Model):
            score = models.DecimalField()

        class CustomJSONEncoder(json.JSONEncoder):
            def default(self, o):
                if isinstance(o, decimal.Decimal):
                    return str(o)
                return super().default(o)

        serializer = serializers.jsonl.Serializer()
        instances = [ScoreDecimal(score=decimal.Decimal(1.0))]
        output = serializer.serialize(instances, cls=CustomJSONEncoder)
        self.assertIn('"fields": {"score": "1"}', output)

    def test_json_deserializer_exception(self):
        malformed = '[{"pk":1}'
        with self.assertRaises(DeserializationError):
            for _ in serializers.deserialize("jsonl", malformed):
                pass

    def test_helpful_error_message_invalid_pk(self):
        """
        An invalid primary key produces an error message that names the model
        it belongs to.
        """
        test_string = (
            '{'
            '"pk": "badpk",'
            '"model": "serializers.player",'
            '"fields": {'
            '"name": "Bob",'
            '"rank": 1,'
            '"team": "Team"'
            '}'
            '}'
        )
        with self.assertRaisesMessage(DeserializationError, "(serializers.player:pk=badpk)"):
            list(serializers.deserialize('jsonl', test_string))

    def test_helpful_error_message_invalid_field(self):
        """
        An invalid field value produces an error message that names the model
        it belongs to.
        """
        test_string = (
            '{'
            '"pk": "1",'
            '"model": "serializers.player",'
            '"fields": {'
            '"name": "Bob",'
            '"rank": "invalidint",'
            '"team": "Team"'
            '}'
            '}'
        )
        expected = "(serializers.player:pk=1) field_value was 'invalidint'"
        with self.assertRaisesMessage(DeserializationError, expected):
            list(serializers.deserialize('jsonl', test_string))

    def test_helpful_error_message_for_foreign_keys(self):
        """
        An invalid foreign key given as a natural key produces a helpful error
        message that includes the failing key.
        """
        test_string = (
            '{'
            '"pk": 1,'
            '"model": "serializers.category",'
            '"fields": {'
            '"name": "Unknown foreign key",'
            '"meta_data": ['
            '"doesnotexist",'
            '"metadata"'
            ']'
            '}'
            '}'
        )
        key = ["doesnotexist", "metadata"]
        expected = "(serializers.category:pk=1) field_value was '%r'" % key
        with self.assertRaisesMessage(DeserializationError, expected):
            list(serializers.deserialize('jsonl', test_string))

    def test_helpful_error_message_for_many2many_non_natural(self):
        """
        Invalid many-to-many keys produce a helpful error message.
        """
        test_string = "\n".join([
            '{'
            '"pk": 1,'
            '"model": "serializers.article",'
            '"fields": {'
            '"author": 1,'
            '"headline": "Unknown many to many",'
            '"pub_date": "2014-09-15T10:35:00",'
            '"categories": [1, "doesnotexist"]'
            '}'
            '}',
            '{'
            '"pk": 1,'
            '"model": "serializers.author",'
            '"fields": {'
            '"name": "Agnes"'
            '}'
            '}',
            '{'
            '"pk": 1,'
            '"model": "serializers.category",'
            '"fields": {'
            '"name": "Reference"'
            '}'
            '}',
        ])
        expected = "(serializers.article:pk=1) field_value was 'doesnotexist'"
        with self.assertRaisesMessage(DeserializationError, expected):
            list(serializers.deserialize('jsonl', test_string))

    def test_helpful_error_message_for_many2many_natural1(self):
        """
        Invalid many-to-many keys produce a helpful error message. This covers
        the code path where one entry in a list of natural keys is invalid.
        """
        test_string = "\n".join([
            '{'
            '"pk": 1,'
            '"model": "serializers.categorymetadata",'
            '"fields": {'
            '"kind": "author",'
            '"name": "meta1",'
            '"value": "Agnes"'
            '}'
            '}',
            '{'
            '"pk": 1,'
            '"model": "serializers.article",'
            '"fields": {'
            '"author": 1,'
            '"headline": "Unknown many to many",'
            '"pub_date": "2014-09-15T10:35:00",'
            '"meta_data": ['
            '["author", "meta1"],'
            '["doesnotexist", "meta1"],'
            '["author", "meta1"]'
            ']'
            '}'
            '}',
            '{'
            '"pk": 1,'
            '"model": "serializers.author",'
            '"fields": {'
            '"name": "Agnes"'
            '}'
            '}',
        ])
        key = ["doesnotexist", "meta1"]
        expected = "(serializers.article:pk=1) field_value was '%r'" % key
        with self.assertRaisesMessage(DeserializationError, expected):
            for deserialized in serializers.deserialize('jsonl', test_string):
                deserialized.save()

    def test_helpful_error_message_for_many2many_natural2(self):
        """
        Invalid many-to-many keys produce a helpful error message. This covers
        the code path where a natural many-to-many key has only a single
        value.
        """
        test_string = "\n".join([
            '{'
            '"pk": 1,'
            '"model": "serializers.article",'
            '"fields": {'
            '"author": 1,'
            '"headline": "Unknown many to many",'
            '"pub_date": "2014-09-15T10:35:00",'
            '"meta_data": [1, "doesnotexist"]'
            '}'
            '}',
            '{'
            '"pk": 1,'
            '"model": "serializers.categorymetadata",'
            '"fields": {'
            '"kind": "author",'
            '"name": "meta1",'
            '"value": "Agnes"'
            '}'
            '}',
            '{'
            '"pk": 1,'
            '"model": "serializers.author",'
            '"fields": {'
            '"name": "Agnes"'
            '}'
            '}',
        ])
        expected = "(serializers.article:pk=1) field_value was 'doesnotexist'"
        with self.assertRaisesMessage(DeserializationError, expected):
            for deserialized in serializers.deserialize('jsonl', test_string, ignore=False):
                deserialized.save()

    def test_helpful_error_message_for_many2many_not_iterable(self):
        """
        A many-to-many field value that is not iterable produces a helpful
        error message.
        """
        test_string = (
            '{'
            '"pk": 1,'
            '"model": "serializers.m2mdata",'
            '"fields": {"data": null}'
            '}'
        )

        expected = "(serializers.m2mdata:pk=1) field_value was 'None'"
        with self.assertRaisesMessage(DeserializationError, expected):
            next(serializers.deserialize('jsonl', test_string, ignore=False))
|
||||||
|
|
||||||
|
|
||||||
|
class JsonSerializerTransactionTestCase(SerializersTransactionTestBase, TransactionTestCase):
    """Transaction tests for the ``jsonl`` serializer."""
    serializer_name = "jsonl"

    # One record per line; the article on the first line refers to the
    # category and author that only appear on the following lines.
    fwd_ref_str = "\n".join([
        '{'
        '"pk": 1,'
        '"model": "serializers.article",'
        '"fields": {'
        '"headline": "Forward references pose no problem",'
        '"pub_date": "2006-06-16T15:00:00",'
        '"categories": [1],'
        '"author": 1'
        '}'
        '}',
        '{'
        '"pk": 1,'
        '"model": "serializers.category",'
        '"fields": {'
        '"name": "Reference"'
        '}'
        '}',
        '{'
        '"pk": 1,'
        '"model": "serializers.author",'
        '"fields": {'
        '"name": "Agnes"'
        '}'
        '}',
    ])
|
Loading…
Reference in New Issue