diff --git a/AUTHORS b/AUTHORS index 3cb92e21da..7ae9bb185a 100644 --- a/AUTHORS +++ b/AUTHORS @@ -52,6 +52,7 @@ answer newbie questions, and generally made Django that much better: Alex Robbins Alexey Boriskin Alexey Tsivunin + Ali Vakilzade Aljosa Mohorovic Amit Chakradeo Amit Ramon diff --git a/django/core/serializers/__init__.py b/django/core/serializers/__init__.py index 5e16a7560f..793f6dc2bd 100644 --- a/django/core/serializers/__init__.py +++ b/django/core/serializers/__init__.py @@ -28,6 +28,7 @@ BUILTIN_SERIALIZERS = { "python": "django.core.serializers.python", "json": "django.core.serializers.json", "yaml": "django.core.serializers.pyyaml", + "jsonl": "django.core.serializers.jsonl", } _serializers = {} diff --git a/django/core/serializers/jsonl.py b/django/core/serializers/jsonl.py new file mode 100644 index 0000000000..ff0d9eb605 --- /dev/null +++ b/django/core/serializers/jsonl.py @@ -0,0 +1,57 @@ +""" +Serialize data to/from JSON Lines +""" + +import json + +from django.core.serializers.base import DeserializationError +from django.core.serializers.json import DjangoJSONEncoder +from django.core.serializers.python import ( + Deserializer as PythonDeserializer, Serializer as PythonSerializer, +) + + +class Serializer(PythonSerializer): + """Convert a queryset to JSON Lines (one JSON document per line).""" + internal_use_only = False + + def _init_options(self): + self._current = None + self.json_kwargs = self.options.copy() + self.json_kwargs.pop('stream', None) + self.json_kwargs.pop('fields', None) + self.json_kwargs.pop('indent', None) + self.json_kwargs['separators'] = (',', ': ') + self.json_kwargs.setdefault('cls', DjangoJSONEncoder) + self.json_kwargs.setdefault('ensure_ascii', False) + + def start_serialization(self): + self._init_options() + + def end_object(self, obj): + # self._current has the field data + json.dump(self.get_dump_object(obj), self.stream, **self.json_kwargs) + self.stream.write("\n") + self._current = None + + def getvalue(self): + # Grandparent super: bypass PythonSerializer.getvalue()
+ return super(PythonSerializer, self).getvalue() + + +def Deserializer(stream_or_string, **options): + """Deserialize a stream or string of JSON Lines data.""" + if isinstance(stream_or_string, bytes): + stream_or_string = stream_or_string.decode() + if isinstance(stream_or_string, (bytes, str)): + stream_or_string = stream_or_string.split("\n") + + for line in stream_or_string: + if not line.strip(): + continue + try: + yield from PythonDeserializer([json.loads(line)], **options) + except (GeneratorExit, DeserializationError): + raise + except Exception as exc: + raise DeserializationError() from exc diff --git a/docs/releases/3.2.txt b/docs/releases/3.2.txt index 6b2fb41144..1bb4dbed1e 100644 --- a/docs/releases/3.2.txt +++ b/docs/releases/3.2.txt @@ -215,7 +215,10 @@ Security Serialization ~~~~~~~~~~~~~ -* ... +* The new :ref:`JSONL <serialization-formats-jsonl>` serializer allows using + the JSON Lines format with :djadmin:`dumpdata` and :djadmin:`loaddata`. This + can be useful for populating large databases because data is loaded line by + line into memory, rather than being loaded all at once. Signals ~~~~~~~ diff --git a/docs/topics/serialization.txt b/docs/topics/serialization.txt index fc27a76138..7cc4905da3 100644 --- a/docs/topics/serialization.txt +++ b/docs/topics/serialization.txt @@ -160,11 +160,14 @@ Identifier Information ``json`` Serializes to and from JSON_. +``jsonl`` Serializes to and from JSONL_. + ``yaml`` Serializes to YAML (YAML Ain't a Markup Language). This serializer is only available if PyYAML_ is installed. ========== ============================================================== .. _json: https://json.org/ +.. _jsonl: http://jsonlines.org/ .. _PyYAML: https://pyyaml.org/ XML @@ -307,6 +310,24 @@ The JSON serializer uses ``DjangoJSONEncoder`` for encoding. A subclass of .. _ecma-262: https://www.ecma-international.org/ecma-262/5.1/#sec-15.9.1.15 +.. _serialization-formats-jsonl: + +JSONL +----- + +.. versionadded:: 3.2 + +*JSONL* stands for *JSON Lines*. 
With this format, objects are separated by new +lines, and each line contains a valid JSON object. JSONL serialized data look +like this:: + + { "pk": "4b678b301dfd8a4e0dad910de3ae245b", "model": "sessions.session", "fields": { ... }} + { "pk": "88bea72c02274f3c9bf1cb2bb8cee4fc", "model": "sessions.session", "fields": { ... }} + { "pk": "9cf0e26691b64147a67e2a9f06ad7a53", "model": "sessions.session", "fields": { ... }} + +JSONL can be useful for populating large databases, since the data can be +processed line by line, rather than being loaded into memory all at once. + YAML ---- diff --git a/tests/serializers/test_jsonl.py b/tests/serializers/test_jsonl.py new file mode 100644 index 0000000000..19557e6c29 --- /dev/null +++ b/tests/serializers/test_jsonl.py @@ -0,0 +1,312 @@ +import decimal +import json +import re + +from django.core import serializers +from django.core.serializers.base import DeserializationError +from django.db import models +from django.test import TestCase, TransactionTestCase +from django.test.utils import isolate_apps + +from .models import Score +from .tests import SerializersTestBase, SerializersTransactionTestBase + + +class JsonlSerializerTestCase(SerializersTestBase, TestCase): + serializer_name = "jsonl" + pkless_str = [ + """{ + "pk": null, + "model": "serializers.category", + "fields": {"name": "Reference"} + }""", + """{ + "model": "serializers.category", + "fields": {"name": "Non-fiction"} + }""" + ] + pkless_str = "\n".join([s.replace("\n", "") for s in pkless_str]) + + mapping_ordering_str = """{ +"model": "serializers.article", +"pk": %(article_pk)s, +"fields": { +"author": %(author_pk)s, +"headline": "Poker has no place on ESPN", +"pub_date": "2006-06-16T11:00:00", +"categories": [ +%(first_category_pk)s, +%(second_category_pk)s +], +"meta_data": [] +} +}""".replace("\n", "") + "\n" + + @staticmethod + def _validate_output(serial_str): + try: + for line in serial_str.split("\n"): + if line: + json.loads(line) + except Exception: 
+ return False + else: + return True + + @staticmethod + def _get_pk_values(serial_str): + serial_list = [json.loads(line) for line in serial_str.split("\n") if line] + return [obj_dict['pk'] for obj_dict in serial_list] + + @staticmethod + def _get_field_values(serial_str, field_name): + serial_list = [json.loads(line) for line in serial_str.split("\n") if line] + return [obj_dict['fields'][field_name] for obj_dict in serial_list if field_name in obj_dict['fields']] + + def test_no_indentation(self): + s = serializers.jsonl.Serializer() + json_data = s.serialize([Score(score=5.0), Score(score=6.0)], indent=2) + for line in json_data.splitlines(): + self.assertIsNone(re.search(r'.+,\s*$', line)) + + @isolate_apps('serializers') + def test_custom_encoder(self): + class ScoreDecimal(models.Model): + score = models.DecimalField() + + class CustomJSONEncoder(json.JSONEncoder): + def default(self, o): + if isinstance(o, decimal.Decimal): + return str(o) + return super().default(o) + + s = serializers.jsonl.Serializer() + json_data = s.serialize( + [ScoreDecimal(score=decimal.Decimal(1.0))], cls=CustomJSONEncoder + ) + self.assertIn('"fields": {"score": "1"}', json_data) + + def test_json_deserializer_exception(self): + with self.assertRaises(DeserializationError): + for obj in serializers.deserialize("jsonl", """[{"pk":1}"""): + pass + + def test_helpful_error_message_invalid_pk(self): + """ + If there is an invalid primary key, the error message should contain + the model associated with it. + """ + test_string = """{ + "pk": "badpk", + "model": "serializers.player", + "fields": { + "name": "Bob", + "rank": 1, + "team": "Team" + } + }""".replace("\n", "") + with self.assertRaisesMessage(DeserializationError, "(serializers.player:pk=badpk)"): + list(serializers.deserialize('jsonl', test_string)) + + def test_helpful_error_message_invalid_field(self): + """ + If there is an invalid field value, the error message should contain + the model associated with it. 
+ """ + test_string = """{ + "pk": "1", + "model": "serializers.player", + "fields": { + "name": "Bob", + "rank": "invalidint", + "team": "Team" + } + }""".replace("\n", "") + expected = "(serializers.player:pk=1) field_value was 'invalidint'" + with self.assertRaisesMessage(DeserializationError, expected): + list(serializers.deserialize('jsonl', test_string)) + + def test_helpful_error_message_for_foreign_keys(self): + """ + Invalid foreign keys with a natural key should throw a helpful error + message, such as what the failing key is. + """ + test_string = """{ + "pk": 1, + "model": "serializers.category", + "fields": { + "name": "Unknown foreign key", + "meta_data": [ + "doesnotexist", + "metadata" + ] + } + }""".replace("\n", "") + key = ["doesnotexist", "metadata"] + expected = "(serializers.category:pk=1) field_value was '%r'" % key + with self.assertRaisesMessage(DeserializationError, expected): + list(serializers.deserialize('jsonl', test_string)) + + def test_helpful_error_message_for_many2many_non_natural(self): + """ + Invalid many-to-many keys should throw a helpful error message. + """ + test_strings = [ + """{ + "pk": 1, + "model": "serializers.article", + "fields": { + "author": 1, + "headline": "Unknown many to many", + "pub_date": "2014-09-15T10:35:00", + "categories": [1, "doesnotexist"] + } + }""", + """{ + "pk": 1, + "model": "serializers.author", + "fields": { + "name": "Agnes" + } + }""", + """{ + "pk": 1, + "model": "serializers.category", + "fields": { + "name": "Reference" + } + }""" + ] + test_string = "\n".join([s.replace("\n", "") for s in test_strings]) + expected = "(serializers.article:pk=1) field_value was 'doesnotexist'" + with self.assertRaisesMessage(DeserializationError, expected): + list(serializers.deserialize('jsonl', test_string)) + + def test_helpful_error_message_for_many2many_natural1(self): + """ + Invalid many-to-many keys should throw a helpful error message. 
+ This tests the code path where one of a list of natural keys is invalid. + """ + test_strings = [ + """{ + "pk": 1, + "model": "serializers.categorymetadata", + "fields": { + "kind": "author", + "name": "meta1", + "value": "Agnes" + } + }""", + """{ + "pk": 1, + "model": "serializers.article", + "fields": { + "author": 1, + "headline": "Unknown many to many", + "pub_date": "2014-09-15T10:35:00", + "meta_data": [ + ["author", "meta1"], + ["doesnotexist", "meta1"], + ["author", "meta1"] + ] + } + }""", + """{ + "pk": 1, + "model": "serializers.author", + "fields": { + "name": "Agnes" + } + }""" + ] + test_string = "\n".join([s.replace("\n", "") for s in test_strings]) + key = ["doesnotexist", "meta1"] + expected = "(serializers.article:pk=1) field_value was '%r'" % key + with self.assertRaisesMessage(DeserializationError, expected): + for obj in serializers.deserialize('jsonl', test_string): + obj.save() + + def test_helpful_error_message_for_many2many_natural2(self): + """ + Invalid many-to-many keys should throw a helpful error message. This + tests the code path where a natural many-to-many key has only a single + value. 
+ """ + test_strings = [ + """{ + "pk": 1, + "model": "serializers.article", + "fields": { + "author": 1, + "headline": "Unknown many to many", + "pub_date": "2014-09-15T10:35:00", + "meta_data": [1, "doesnotexist"] + } + }""", + """{ + "pk": 1, + "model": "serializers.categorymetadata", + "fields": { + "kind": "author", + "name": "meta1", + "value": "Agnes" + } + }""", + """{ + "pk": 1, + "model": "serializers.author", + "fields": { + "name": "Agnes" + } + }""" + ] + test_string = "\n".join([s.replace("\n", "") for s in test_strings]) + expected = "(serializers.article:pk=1) field_value was 'doesnotexist'" + with self.assertRaisesMessage(DeserializationError, expected): + for obj in serializers.deserialize('jsonl', test_string, ignore=False): + obj.save() + + def test_helpful_error_message_for_many2many_not_iterable(self): + """ + Not iterable many-to-many field value throws a helpful error message. + """ + test_string = """{ + "pk": 1, + "model": "serializers.m2mdata", + "fields": {"data": null} + }""".replace("\n", "") + + expected = "(serializers.m2mdata:pk=1) field_value was 'None'" + with self.assertRaisesMessage(DeserializationError, expected): + next(serializers.deserialize('jsonl', test_string, ignore=False)) + + +class JsonSerializerTransactionTestCase(SerializersTransactionTestBase, TransactionTestCase): + serializer_name = "jsonl" + fwd_ref_str = [ + """{ + "pk": 1, + "model": "serializers.article", + "fields": { + "headline": "Forward references pose no problem", + "pub_date": "2006-06-16T15:00:00", + "categories": [1], + "author": 1 + } + }""", + """{ + "pk": 1, + "model": "serializers.category", + "fields": { + "name": "Reference" + } + }""", + """{ + "pk": 1, + "model": "serializers.author", + "fields": { + "name": "Agnes" + } + }""" + ] + fwd_ref_str = "\n".join([s.replace("\n", "") for s in fwd_ref_str])