Fixed #30190 -- Added JSONL serializer.

This commit is contained in:
Ali Vakilzade 2020-06-16 15:51:58 +01:00 committed by GitHub
parent ea3beb4f5a
commit e29637681b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 396 additions and 1 deletions

View File

@ -52,6 +52,7 @@ answer newbie questions, and generally made Django that much better:
Alex Robbins <alexander.j.robbins@gmail.com>
Alexey Boriskin <alex@boriskin.me>
Alexey Tsivunin <most-208@yandex.ru>
Ali Vakilzade <ali@vakilzade.com>
Aljosa Mohorovic <aljosa.mohorovic@gmail.com>
Amit Chakradeo <https://amit.chakradeo.net/>
Amit Ramon <amit.ramon@gmail.com>

View File

@ -28,6 +28,7 @@ BUILTIN_SERIALIZERS = {
"python": "django.core.serializers.python",
"json": "django.core.serializers.json",
"yaml": "django.core.serializers.pyyaml",
"jsonl": "django.core.serializers.jsonl",
}
_serializers = {}

View File

@ -0,0 +1,57 @@
"""
Serialize data to/from JSON Lines
"""
import json
from django.core.serializers.base import DeserializationError
from django.core.serializers.json import DjangoJSONEncoder
from django.core.serializers.python import (
Deserializer as PythonDeserializer, Serializer as PythonSerializer,
)
class Serializer(PythonSerializer):
    """Serialize a queryset as JSON Lines: one JSON document per line."""
    internal_use_only = False

    def _init_options(self):
        """Build the keyword arguments handed to ``json.dump()``."""
        self._current = None
        # 'stream', 'fields' and 'indent' are never forwarded to json.dump();
        # 'indent' in particular would spread one object over several lines.
        dump_kwargs = {
            name: value
            for name, value in self.options.items()
            if name not in ('stream', 'fields', 'indent')
        }
        # The separators are always forced; the encoder class and
        # ensure_ascii are only defaults that callers may override.
        dump_kwargs['separators'] = (',', ': ')
        dump_kwargs.setdefault('cls', DjangoJSONEncoder)
        dump_kwargs.setdefault('ensure_ascii', False)
        self.json_kwargs = dump_kwargs

    def start_serialization(self):
        """Prepare the JSON options before the first object is written."""
        self._init_options()

    def end_object(self, obj):
        """Write ``obj`` as a single JSON line to the output stream."""
        # At this point self._current holds the field data for ``obj``; it is
        # reset only after the line has been written.
        record = self.get_dump_object(obj)
        json.dump(record, self.stream, **self.json_kwargs)
        self.stream.write("\n")
        self._current = None

    def getvalue(self):
        """Return the serialized value using the grandparent implementation."""
        # Deliberately skip PythonSerializer.getvalue() and resolve the
        # method on the class that follows it in the MRO.
        return super(PythonSerializer, self).getvalue()
def Deserializer(stream_or_string, **options):
    """
    Deserialize a stream or string of JSON Lines data.

    ``stream_or_string`` may be ``bytes`` (decoded first), a ``str`` (split
    on newlines), or any iterable of lines such as an open file. Lines that
    are empty or contain only whitespace are skipped. Every other line is
    parsed as one JSON document and passed, together with ``options``, to the
    Python deserializer; whatever that yields is yielded from here.

    Raise DeserializationError if a line can't be parsed or deserialized.
    """
    if isinstance(stream_or_string, bytes):
        stream_or_string = stream_or_string.decode()
    if isinstance(stream_or_string, str):
        stream_or_string = stream_or_string.split("\n")

    for line in stream_or_string:
        if not line.strip():
            continue
        try:
            # Delegate instead of indexing the result with [0]: if the Python
            # deserializer yields no object for a line (for example when it
            # is asked to ignore nonexistent models), indexing an empty list
            # would turn a deliberate skip into a DeserializationError.
            yield from PythonDeserializer([json.loads(line)], **options)
        except (GeneratorExit, DeserializationError):
            raise
        except Exception as exc:
            raise DeserializationError() from exc

View File

@ -215,7 +215,10 @@ Security
Serialization
~~~~~~~~~~~~~
* ...
* The new :ref:`JSONL <serialization-formats-jsonl>` serializer allows using
the JSON Lines format with :djadmin:`dumpdata` and :djadmin:`loaddata`. This
can be useful for populating large databases because data is loaded line by
line into memory, rather than being loaded all at once.
Signals
~~~~~~~

View File

@ -160,11 +160,14 @@ Identifier Information
``json`` Serializes to and from JSON_.
``jsonl`` Serializes to and from JSONL_.
``yaml`` Serializes to YAML (YAML Ain't a Markup Language). This
serializer is only available if PyYAML_ is installed.
========== ==============================================================
.. _json: https://json.org/
.. _jsonl: https://jsonlines.org/
.. _PyYAML: https://pyyaml.org/
XML
@ -307,6 +310,24 @@ The JSON serializer uses ``DjangoJSONEncoder`` for encoding. A subclass of
.. _ecma-262: https://www.ecma-international.org/ecma-262/5.1/#sec-15.9.1.15
.. _serialization-formats-jsonl:
JSONL
-----
.. versionadded:: 3.2
*JSONL* stands for *JSON Lines*. With this format, objects are separated by new
lines, and each line contains a valid JSON object. JSONL serialized data look
like this::
{ "pk": "4b678b301dfd8a4e0dad910de3ae245b", "model": "sessions.session", "fields": { ... }}
{ "pk": "88bea72c02274f3c9bf1cb2bb8cee4fc", "model": "sessions.session", "fields": { ... }}
{ "pk": "9cf0e26691b64147a67e2a9f06ad7a53", "model": "sessions.session", "fields": { ... }}
JSONL can be useful for populating large databases, since the data can be
processed line by line, rather than being loaded into memory all at once.
YAML
----

View File

@ -0,0 +1,312 @@
import decimal
import json
import re
from django.core import serializers
from django.core.serializers.base import DeserializationError
from django.db import models
from django.test import TestCase, TransactionTestCase
from django.test.utils import isolate_apps
from .models import Score
from .tests import SerializersTestBase, SerializersTransactionTestBase
class JsonlSerializerTestCase(SerializersTestBase, TestCase):
    """Tests for the "jsonl" serializer, on top of SerializersTestBase."""
    serializer_name = "jsonl"
    # Two category documents, one with a null pk and one without any pk.
    # Each document is collapsed onto a single line and the lines are joined
    # with newlines to form valid JSONL.
    pkless_str = [
        """{
"pk": null,
"model": "serializers.category",
"fields": {"name": "Reference"}
}""",
        """{
"model": "serializers.category",
"fields": {"name": "Non-fiction"}
}"""
    ]
    pkless_str = "\n".join(doc.replace("\n", "") for doc in pkless_str)
    # Template for a single serialized article line, terminated by a newline.
    # NOTE(review): presumably compared verbatim against serializer output by
    # the base class, so whitespace inside the literal matters -- confirm.
    mapping_ordering_str = """{
"model": "serializers.article",
"pk": %(article_pk)s,
"fields": {
"author": %(author_pk)s,
"headline": "Poker has no place on ESPN",
"pub_date": "2006-06-16T11:00:00",
"categories": [
%(first_category_pk)s,
%(second_category_pk)s
],
"meta_data": []
}
}""".replace("\n", "") + "\n"

    @staticmethod
    def _validate_output(serial_str):
        """Return True if every non-empty line of ``serial_str`` is JSON."""
        try:
            for raw_line in filter(None, serial_str.split("\n")):
                json.loads(raw_line)
        except Exception:
            return False
        return True

    @staticmethod
    def _get_pk_values(serial_str):
        """Return the "pk" of every object serialized in ``serial_str``."""
        records = list(map(json.loads, filter(None, serial_str.split("\n"))))
        return [record['pk'] for record in records]

    @staticmethod
    def _get_field_values(serial_str, field_name):
        """Return ``field_name`` values from the objects that define it."""
        records = list(map(json.loads, filter(None, serial_str.split("\n"))))
        values = []
        for record in records:
            fields = record['fields']
            if field_name in fields:
                values.append(fields[field_name])
        return values

    def test_no_indentation(self):
        """No output line ends with a comma, even when ``indent`` is given."""
        serializer = serializers.jsonl.Serializer()
        output = serializer.serialize(
            [Score(score=5.0), Score(score=6.0)], indent=2,
        )
        trailing_comma = re.compile(r'.+,\s*$')
        for output_line in output.splitlines():
            self.assertIsNone(trailing_comma.search(output_line))

    @isolate_apps('serializers')
    def test_custom_encoder(self):
        """A JSON encoder passed as ``cls`` is used for encoding."""
        class ScoreDecimal(models.Model):
            score = models.DecimalField()

        class CustomJSONEncoder(json.JSONEncoder):
            def default(self, o):
                # Encode decimals as strings; defer everything else.
                if not isinstance(o, decimal.Decimal):
                    return super().default(o)
                return str(o)

        serializer = serializers.jsonl.Serializer()
        instances = [ScoreDecimal(score=decimal.Decimal(1.0))]
        output = serializer.serialize(instances, cls=CustomJSONEncoder)
        self.assertIn('"fields": {"score": "1"}', output)

    def test_json_deserializer_exception(self):
        """Malformed JSON on a line raises DeserializationError."""
        with self.assertRaises(DeserializationError):
            list(serializers.deserialize("jsonl", """[{"pk":1}"""))

    def test_helpful_error_message_invalid_pk(self):
        """
        An invalid primary key produces an error message that names the
        model it belongs to.
        """
        payload = """{
"pk": "badpk",
"model": "serializers.player",
"fields": {
"name": "Bob",
"rank": 1,
"team": "Team"
}
}""".replace("\n", "")
        message = "(serializers.player:pk=badpk)"
        with self.assertRaisesMessage(DeserializationError, message):
            list(serializers.deserialize('jsonl', payload))

    def test_helpful_error_message_invalid_field(self):
        """
        An invalid field value produces an error message that names the
        model it belongs to.
        """
        payload = """{
"pk": "1",
"model": "serializers.player",
"fields": {
"name": "Bob",
"rank": "invalidint",
"team": "Team"
}
}""".replace("\n", "")
        message = "(serializers.player:pk=1) field_value was 'invalidint'"
        with self.assertRaisesMessage(DeserializationError, message):
            list(serializers.deserialize('jsonl', payload))

    def test_helpful_error_message_for_foreign_keys(self):
        """
        An invalid foreign key given as a natural key produces a helpful
        error message that includes the failing key.
        """
        payload = """{
"pk": 1,
"model": "serializers.category",
"fields": {
"name": "Unknown foreign key",
"meta_data": [
"doesnotexist",
"metadata"
]
}
}""".replace("\n", "")
        bad_key = ["doesnotexist", "metadata"]
        message = "(serializers.category:pk=1) field_value was '%r'" % bad_key
        with self.assertRaisesMessage(DeserializationError, message):
            list(serializers.deserialize('jsonl', payload))

    def test_helpful_error_message_for_many2many_non_natural(self):
        """
        An invalid many-to-many key produces a helpful error message.
        """
        documents = [
            """{
"pk": 1,
"model": "serializers.article",
"fields": {
"author": 1,
"headline": "Unknown many to many",
"pub_date": "2014-09-15T10:35:00",
"categories": [1, "doesnotexist"]
}
}""",
            """{
"pk": 1,
"model": "serializers.author",
"fields": {
"name": "Agnes"
}
}""",
            """{
"pk": 1,
"model": "serializers.category",
"fields": {
"name": "Reference"
}
}"""
        ]
        payload = "\n".join(doc.replace("\n", "") for doc in documents)
        message = "(serializers.article:pk=1) field_value was 'doesnotexist'"
        with self.assertRaisesMessage(DeserializationError, message):
            list(serializers.deserialize('jsonl', payload))

    def test_helpful_error_message_for_many2many_natural1(self):
        """
        An invalid many-to-many key produces a helpful error message. This
        covers the code path where one entry in a list of natural keys is
        invalid.
        """
        documents = [
            """{
"pk": 1,
"model": "serializers.categorymetadata",
"fields": {
"kind": "author",
"name": "meta1",
"value": "Agnes"
}
}""",
            """{
"pk": 1,
"model": "serializers.article",
"fields": {
"author": 1,
"headline": "Unknown many to many",
"pub_date": "2014-09-15T10:35:00",
"meta_data": [
["author", "meta1"],
["doesnotexist", "meta1"],
["author", "meta1"]
]
}
}""",
            """{
"pk": 1,
"model": "serializers.author",
"fields": {
"name": "Agnes"
}
}"""
        ]
        payload = "\n".join(doc.replace("\n", "") for doc in documents)
        bad_key = ["doesnotexist", "meta1"]
        message = "(serializers.article:pk=1) field_value was '%r'" % bad_key
        with self.assertRaisesMessage(DeserializationError, message):
            # Objects are saved as they are produced, so earlier lines are
            # stored before the failing line is reached.
            for deserialized in serializers.deserialize('jsonl', payload):
                deserialized.save()

    def test_helpful_error_message_for_many2many_natural2(self):
        """
        An invalid many-to-many key produces a helpful error message. This
        covers the code path where a natural many-to-many key has only a
        single value.
        """
        documents = [
            """{
"pk": 1,
"model": "serializers.article",
"fields": {
"author": 1,
"headline": "Unknown many to many",
"pub_date": "2014-09-15T10:35:00",
"meta_data": [1, "doesnotexist"]
}
}""",
            """{
"pk": 1,
"model": "serializers.categorymetadata",
"fields": {
"kind": "author",
"name": "meta1",
"value": "Agnes"
}
}""",
            """{
"pk": 1,
"model": "serializers.author",
"fields": {
"name": "Agnes"
}
}"""
        ]
        payload = "\n".join(doc.replace("\n", "") for doc in documents)
        message = "(serializers.article:pk=1) field_value was 'doesnotexist'"
        with self.assertRaisesMessage(DeserializationError, message):
            for deserialized in serializers.deserialize('jsonl', payload, ignore=False):
                deserialized.save()

    def test_helpful_error_message_for_many2many_not_iterable(self):
        """
        A many-to-many field value that isn't iterable produces a helpful
        error message.
        """
        payload = """{
"pk": 1,
"model": "serializers.m2mdata",
"fields": {"data": null}
}""".replace("\n", "")
        message = "(serializers.m2mdata:pk=1) field_value was 'None'"
        with self.assertRaisesMessage(DeserializationError, message):
            next(serializers.deserialize('jsonl', payload, ignore=False))
class JsonSerializerTransactionTestCase(SerializersTransactionTestBase, TransactionTestCase):
    """Transactional serializer tests configured for the "jsonl" format."""
    serializer_name = "jsonl"
    # The article line refers to a category and an author that only appear
    # on later lines. Each document is collapsed onto one line and the lines
    # are joined with newlines to form valid JSONL.
    fwd_ref_str = [
        """{
"pk": 1,
"model": "serializers.article",
"fields": {
"headline": "Forward references pose no problem",
"pub_date": "2006-06-16T15:00:00",
"categories": [1],
"author": 1
}
}""",
        """{
"pk": 1,
"model": "serializers.category",
"fields": {
"name": "Reference"
}
}""",
        """{
"pk": 1,
"model": "serializers.author",
"fields": {
"name": "Agnes"
}
}"""
    ]
    fwd_ref_str = "\n".join(doc.replace("\n", "") for doc in fwd_ref_str)