mirror of https://github.com/django/django.git
Fixed #30190 -- Added JSONL serializer.
This commit is contained in:
parent
ea3beb4f5a
commit
e29637681b
1
AUTHORS
1
AUTHORS
|
@ -52,6 +52,7 @@ answer newbie questions, and generally made Django that much better:
|
|||
Alex Robbins <alexander.j.robbins@gmail.com>
|
||||
Alexey Boriskin <alex@boriskin.me>
|
||||
Alexey Tsivunin <most-208@yandex.ru>
|
||||
Ali Vakilzade <ali@vakilzade.com>
|
||||
Aljosa Mohorovic <aljosa.mohorovic@gmail.com>
|
||||
Amit Chakradeo <https://amit.chakradeo.net/>
|
||||
Amit Ramon <amit.ramon@gmail.com>
|
||||
|
|
|
@ -28,6 +28,7 @@ BUILTIN_SERIALIZERS = {
|
|||
"python": "django.core.serializers.python",
|
||||
"json": "django.core.serializers.json",
|
||||
"yaml": "django.core.serializers.pyyaml",
|
||||
"jsonl": "django.core.serializers.jsonl",
|
||||
}
|
||||
|
||||
_serializers = {}
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
"""
|
||||
Serialize data to/from JSON Lines
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
from django.core.serializers.base import DeserializationError
|
||||
from django.core.serializers.json import DjangoJSONEncoder
|
||||
from django.core.serializers.python import (
|
||||
Deserializer as PythonDeserializer, Serializer as PythonSerializer,
|
||||
)
|
||||
|
||||
|
||||
class Serializer(PythonSerializer):
    """Write each serialized object as one JSON document on its own line."""

    internal_use_only = False

    def _init_options(self):
        """Build ``self.json_kwargs``, the keyword arguments for ``json.dump``.

        Starts from the serializer options minus ``stream``, ``fields`` and
        ``indent``, forces the separators, and fills in the encoder class and
        ``ensure_ascii`` only when the caller did not supply them.
        """
        self._current = None
        dump_kwargs = {
            name: value
            for name, value in self.options.items()
            if name not in ('stream', 'fields', 'indent')
        }
        dump_kwargs['separators'] = (',', ': ')
        dump_kwargs.setdefault('cls', DjangoJSONEncoder)
        dump_kwargs.setdefault('ensure_ascii', False)
        self.json_kwargs = dump_kwargs

    def start_serialization(self):
        """Prepare the ``json.dump`` options before any object is written."""
        self._init_options()

    def end_object(self, obj):
        """Dump ``obj`` to the stream as one JSON document followed by a newline."""
        out = self.stream
        # get_dump_object() is expected to read the field data collected in
        # self._current, which is reset below once the line has been written.
        json.dump(self.get_dump_object(obj), out, **self.json_kwargs)
        out.write("\n")
        self._current = None

    def getvalue(self):
        """Return the value produced by the class after PythonSerializer in the MRO."""
        # Deliberately bypass PythonSerializer.getvalue().
        return super(PythonSerializer, self).getvalue()
|
||||
|
||||
|
||||
def Deserializer(stream_or_string, **options):
    """
    Deserialize a stream or string of JSON Lines data.

    ``stream_or_string`` may be bytes, a string, or any iterable of lines.
    Bytes are decoded and strings are split on newlines; blank lines are
    skipped. Every remaining line is parsed as a single JSON document and
    handed, together with ``options``, to the Python deserializer, whose
    objects are yielded.

    Raise DeserializationError if a line cannot be parsed or deserialized.
    Any other exception raised while processing a line is wrapped in a
    DeserializationError, with the original exception chained as its cause.
    """
    if isinstance(stream_or_string, bytes):
        stream_or_string = stream_or_string.decode()
    if isinstance(stream_or_string, str):
        stream_or_string = stream_or_string.split("\n")

    for line in stream_or_string:
        if not line.strip():
            continue
        # Keep the try body limited to parsing and deserializing so that only
        # errors from this line are converted into DeserializationError.
        try:
            objects = list(PythonDeserializer([json.loads(line)], **options))
        except DeserializationError:
            raise
        except Exception as exc:
            raise DeserializationError() from exc
        # Yield whatever was produced rather than indexing element 0: the
        # Python deserializer is not guaranteed to produce an object for every
        # input (presumably it yields nothing when told to skip unknown
        # models -- verify against PythonDeserializer), and an empty result
        # must not be reported as a deserialization failure.
        yield from objects
|
|
@ -215,7 +215,10 @@ Security
|
|||
Serialization
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
* ...
|
||||
* The new :ref:`JSONL <serialization-formats-jsonl>` serializer allows using
|
||||
the JSON Lines format with :djadmin:`dumpdata` and :djadmin:`loaddata`. This
|
||||
can be useful for populating large databases because data is loaded line by
|
||||
line into memory, rather than being loaded all at once.
|
||||
|
||||
Signals
|
||||
~~~~~~~
|
||||
|
|
|
@ -160,11 +160,14 @@ Identifier Information
|
|||
|
||||
``json`` Serializes to and from JSON_.
|
||||
|
||||
``jsonl`` Serializes to and from JSONL_.
|
||||
|
||||
``yaml`` Serializes to YAML (YAML Ain't a Markup Language). This
|
||||
serializer is only available if PyYAML_ is installed.
|
||||
========== ==============================================================
|
||||
|
||||
.. _json: https://json.org/
|
||||
.. _jsonl: https://jsonlines.org/
|
||||
.. _PyYAML: https://pyyaml.org/
|
||||
|
||||
XML
|
||||
|
@ -307,6 +310,24 @@ The JSON serializer uses ``DjangoJSONEncoder`` for encoding. A subclass of
|
|||
|
||||
.. _ecma-262: https://www.ecma-international.org/ecma-262/5.1/#sec-15.9.1.15
|
||||
|
||||
.. _serialization-formats-jsonl:
|
||||
|
||||
JSONL
|
||||
-----
|
||||
|
||||
.. versionadded:: 3.2
|
||||
|
||||
*JSONL* stands for *JSON Lines*. With this format, objects are separated by new
|
||||
lines, and each line contains a valid JSON object. JSONL serialized data look
|
||||
like this::
|
||||
|
||||
{ "pk": "4b678b301dfd8a4e0dad910de3ae245b", "model": "sessions.session", "fields": { ... }}
|
||||
{ "pk": "88bea72c02274f3c9bf1cb2bb8cee4fc", "model": "sessions.session", "fields": { ... }}
|
||||
{ "pk": "9cf0e26691b64147a67e2a9f06ad7a53", "model": "sessions.session", "fields": { ... }}
|
||||
|
||||
JSONL can be useful for populating large databases, since the data can be
|
||||
processed line by line, rather than being loaded into memory all at once.
|
||||
|
||||
YAML
|
||||
----
|
||||
|
||||
|
|
|
@ -0,0 +1,312 @@
|
|||
import decimal
|
||||
import json
|
||||
import re
|
||||
|
||||
from django.core import serializers
|
||||
from django.core.serializers.base import DeserializationError
|
||||
from django.db import models
|
||||
from django.test import TestCase, TransactionTestCase
|
||||
from django.test.utils import isolate_apps
|
||||
|
||||
from .models import Score
|
||||
from .tests import SerializersTestBase, SerializersTransactionTestBase
|
||||
|
||||
|
||||
class JsonlSerializerTestCase(SerializersTestBase, TestCase):
    """Tests for the "jsonl" serializer: SerializersTestBase plus JSONL-specific cases."""

    serializer_name = "jsonl"
    # The JSON fixtures in this class are written flush-left on purpose: only
    # the newlines are stripped before use, so any indentation inside a
    # literal would remain in the resulting JSON line.
    pkless_str = [
        """{
"pk": null,
"model": "serializers.category",
"fields": {"name": "Reference"}
}""",
        """{
"model": "serializers.category",
"fields": {"name": "Non-fiction"}
}"""
    ]
    pkless_str = "\n".join([s.replace("\n", "") for s in pkless_str])

    mapping_ordering_str = """{
"model": "serializers.article",
"pk": %(article_pk)s,
"fields": {
"author": %(author_pk)s,
"headline": "Poker has no place on ESPN",
"pub_date": "2006-06-16T11:00:00",
"categories": [
%(first_category_pk)s,
%(second_category_pk)s
],
"meta_data": []
}
}""".replace("\n", "") + "\n"

    @staticmethod
    def _load_lines(serial_str):
        """Parse every non-empty line of ``serial_str`` as JSON and return the results."""
        return [json.loads(line) for line in serial_str.split("\n") if line]

    @staticmethod
    def _validate_output(serial_str):
        """Return True if every non-empty line of ``serial_str`` is valid JSON."""
        try:
            JsonlSerializerTestCase._load_lines(serial_str)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            return False
        return True

    @staticmethod
    def _get_pk_values(serial_str):
        """Return the "pk" value of each serialized object, in order."""
        return [obj_dict['pk'] for obj_dict in JsonlSerializerTestCase._load_lines(serial_str)]

    @staticmethod
    def _get_field_values(serial_str, field_name):
        """Return ``field_name``'s value from each serialized object that has it."""
        serial_list = JsonlSerializerTestCase._load_lines(serial_str)
        return [obj_dict['fields'][field_name] for obj_dict in serial_list if field_name in obj_dict['fields']]

    def test_no_indentation(self):
        """No output line ends with a comma, even when ``indent`` is passed."""
        s = serializers.jsonl.Serializer()
        json_data = s.serialize([Score(score=5.0), Score(score=6.0)], indent=2)
        for line in json_data.splitlines():
            self.assertIsNone(re.search(r'.+,\s*$', line))

    @isolate_apps('serializers')
    def test_custom_encoder(self):
        """A JSON encoder class passed as ``cls`` is used to encode field values."""
        class ScoreDecimal(models.Model):
            score = models.DecimalField()

        class CustomJSONEncoder(json.JSONEncoder):
            def default(self, o):
                if isinstance(o, decimal.Decimal):
                    return str(o)
                return super().default(o)

        s = serializers.jsonl.Serializer()
        json_data = s.serialize(
            [ScoreDecimal(score=decimal.Decimal(1.0))], cls=CustomJSONEncoder
        )
        self.assertIn('"fields": {"score": "1"}', json_data)

    def test_json_deserializer_exception(self):
        """Malformed input raises DeserializationError once it is consumed."""
        with self.assertRaises(DeserializationError):
            # Deserialization is lazy, so the result has to be consumed.
            list(serializers.deserialize("jsonl", """[{"pk":1}"""))

    def test_helpful_error_message_invalid_pk(self):
        """
        If there is an invalid primary key, the error message should contain
        the model associated with it.
        """
        test_string = """{
"pk": "badpk",
"model": "serializers.player",
"fields": {
"name": "Bob",
"rank": 1,
"team": "Team"
}
}""".replace("\n", "")
        with self.assertRaisesMessage(DeserializationError, "(serializers.player:pk=badpk)"):
            list(serializers.deserialize('jsonl', test_string))

    def test_helpful_error_message_invalid_field(self):
        """
        If there is an invalid field value, the error message should contain
        the model associated with it.
        """
        test_string = """{
"pk": "1",
"model": "serializers.player",
"fields": {
"name": "Bob",
"rank": "invalidint",
"team": "Team"
}
}""".replace("\n", "")
        expected = "(serializers.player:pk=1) field_value was 'invalidint'"
        with self.assertRaisesMessage(DeserializationError, expected):
            list(serializers.deserialize('jsonl', test_string))

    def test_helpful_error_message_for_foreign_keys(self):
        """
        Invalid foreign keys with a natural key should throw a helpful error
        message, such as what the failing key is.
        """
        test_string = """{
"pk": 1,
"model": "serializers.category",
"fields": {
"name": "Unknown foreign key",
"meta_data": [
"doesnotexist",
"metadata"
]
}
}""".replace("\n", "")
        key = ["doesnotexist", "metadata"]
        expected = "(serializers.category:pk=1) field_value was '%r'" % key
        with self.assertRaisesMessage(DeserializationError, expected):
            list(serializers.deserialize('jsonl', test_string))

    def test_helpful_error_message_for_many2many_non_natural(self):
        """
        Invalid many-to-many keys should throw a helpful error message.
        """
        test_strings = [
            """{
"pk": 1,
"model": "serializers.article",
"fields": {
"author": 1,
"headline": "Unknown many to many",
"pub_date": "2014-09-15T10:35:00",
"categories": [1, "doesnotexist"]
}
}""",
            """{
"pk": 1,
"model": "serializers.author",
"fields": {
"name": "Agnes"
}
}""",
            """{
"pk": 1,
"model": "serializers.category",
"fields": {
"name": "Reference"
}
}"""
        ]
        test_string = "\n".join([s.replace("\n", "") for s in test_strings])
        expected = "(serializers.article:pk=1) field_value was 'doesnotexist'"
        with self.assertRaisesMessage(DeserializationError, expected):
            list(serializers.deserialize('jsonl', test_string))

    def test_helpful_error_message_for_many2many_natural1(self):
        """
        Invalid many-to-many keys should throw a helpful error message.
        This tests the code path where one of a list of natural keys is invalid.
        """
        test_strings = [
            """{
"pk": 1,
"model": "serializers.categorymetadata",
"fields": {
"kind": "author",
"name": "meta1",
"value": "Agnes"
}
}""",
            """{
"pk": 1,
"model": "serializers.article",
"fields": {
"author": 1,
"headline": "Unknown many to many",
"pub_date": "2014-09-15T10:35:00",
"meta_data": [
["author", "meta1"],
["doesnotexist", "meta1"],
["author", "meta1"]
]
}
}""",
            """{
"pk": 1,
"model": "serializers.author",
"fields": {
"name": "Agnes"
}
}"""
        ]
        test_string = "\n".join([s.replace("\n", "") for s in test_strings])
        key = ["doesnotexist", "meta1"]
        expected = "(serializers.article:pk=1) field_value was '%r'" % key
        with self.assertRaisesMessage(DeserializationError, expected):
            for obj in serializers.deserialize('jsonl', test_string):
                obj.save()

    def test_helpful_error_message_for_many2many_natural2(self):
        """
        Invalid many-to-many keys should throw a helpful error message. This
        tests the code path where a natural many-to-many key has only a single
        value.
        """
        test_strings = [
            """{
"pk": 1,
"model": "serializers.article",
"fields": {
"author": 1,
"headline": "Unknown many to many",
"pub_date": "2014-09-15T10:35:00",
"meta_data": [1, "doesnotexist"]
}
}""",
            """{
"pk": 1,
"model": "serializers.categorymetadata",
"fields": {
"kind": "author",
"name": "meta1",
"value": "Agnes"
}
}""",
            """{
"pk": 1,
"model": "serializers.author",
"fields": {
"name": "Agnes"
}
}"""
        ]
        test_string = "\n".join([s.replace("\n", "") for s in test_strings])
        expected = "(serializers.article:pk=1) field_value was 'doesnotexist'"
        with self.assertRaisesMessage(DeserializationError, expected):
            for obj in serializers.deserialize('jsonl', test_string, ignore=False):
                obj.save()

    def test_helpful_error_message_for_many2many_not_iterable(self):
        """
        Not iterable many-to-many field value throws a helpful error message.
        """
        test_string = """{
"pk": 1,
"model": "serializers.m2mdata",
"fields": {"data": null}
}""".replace("\n", "")

        expected = "(serializers.m2mdata:pk=1) field_value was 'None'"
        with self.assertRaisesMessage(DeserializationError, expected):
            next(serializers.deserialize('jsonl', test_string, ignore=False))
|
||||
|
||||
|
||||
class JsonSerializerTransactionTestCase(SerializersTransactionTestBase, TransactionTestCase):
    """SerializersTransactionTestBase configured for the "jsonl" serializer."""

    serializer_name = "jsonl"
    # One JSON document per line; the article comes first even though it
    # refers to the category and author defined on the lines after it.
    # The literals stay flush-left because only newlines are stripped, so any
    # indentation inside them would remain in the resulting line.
    fwd_ref_str = "\n".join(
        document.replace("\n", "")
        for document in (
            """{
"pk": 1,
"model": "serializers.article",
"fields": {
"headline": "Forward references pose no problem",
"pub_date": "2006-06-16T15:00:00",
"categories": [1],
"author": 1
}
}""",
            """{
"pk": 1,
"model": "serializers.category",
"fields": {
"name": "Reference"
}
}""",
            """{
"pk": 1,
"model": "serializers.author",
"fields": {
"name": "Agnes"
}
}""",
        )
    )
|
Loading…
Reference in New Issue