From 4ea7a11659b8a0ab07b0d2e847975f7324664f10 Mon Sep 17 00:00:00 2001 From: Jacob Kaplan-Moss Date: Wed, 28 Jun 2006 16:00:37 +0000 Subject: [PATCH] Added initial cut at serialization framework, along with some basic tests and a stab at some docs. This is all a bit rough right now, so expect some bumps. git-svn-id: http://code.djangoproject.com/svn/django/trunk@3225 bcc190cf-cafb-0310-a4f2-bffc1f526a37 --- django/core/serializers/__init__.py | 76 ++++++++ django/core/serializers/base.py | 159 ++++++++++++++++ django/core/serializers/xml_serializer.py | 218 ++++++++++++++++++++++ docs/serialization.txt | 85 +++++++++ tests/modeltests/serializers/__init__.py | 0 tests/modeltests/serializers/models.py | 94 ++++++++++ 6 files changed, 632 insertions(+) create mode 100644 django/core/serializers/__init__.py create mode 100644 django/core/serializers/base.py create mode 100644 django/core/serializers/xml_serializer.py create mode 100644 docs/serialization.txt create mode 100644 tests/modeltests/serializers/__init__.py create mode 100644 tests/modeltests/serializers/models.py diff --git a/django/core/serializers/__init__.py b/django/core/serializers/__init__.py new file mode 100644 index 0000000000..72c4407b59 --- /dev/null +++ b/django/core/serializers/__init__.py @@ -0,0 +1,76 @@ +""" +Interfaces for serializing Django objects. 
+ +Usage:: + + >>> from django.core import serializers + >>> json = serializers.serialize("json", some_query_set) + >>> objects = list(serializers.deserialize("json", json)) + +To add your own serializers, use the SERIALIZATION_MODULES setting:: + + SERIALIZATION_MODULES = { + "csv" : "path.to.csv.serializer", + "txt" : "path.to.txt.serializer", + } + +""" + +from django.conf import settings + +# Built-in serializers +BUILTIN_SERIALIZERS = { + "xml" : "django.core.serializers.xml_serializer", +} + +_serializers = {} + +def register_serializer(format, serializer_module): + """Register a new serializer by passing in a module name.""" + module = __import__(serializer_module, '', '', ['']) + _serializers[format] = module + +def unregister_serializer(format): + """Unregister a given serializer""" + del _serializers[format] + +def get_serializer(format): + if not _serializers: + _load_serializers() + return _serializers[format].Serializer + +def get_deserializer(format): + if not _serializers: + _load_serializers() + return _serializers[format].Deserializer + +def serialize(format, queryset, **options): + """ + Serialize a queryset (or any iterator that returns database objects) using + a certain serializer. + """ + s = get_serializer(format)() + s.serialize(queryset, **options) + return s.getvalue() + +def deserialize(format, stream_or_string): + """ + Deserialize a stream or a string. Returns an iterator that yields ``(obj, + m2m_relation_dict)``, where ``obj`` is a instantiated -- but *unsaved* -- + object, and ``m2m_relation_dict`` is a dictionary of ``{m2m_field_name : + list_of_related_objects}``. + """ + d = get_deserializer(format) + return d(stream_or_string) + +def _load_serializers(): + """ + Register built-in and settings-defined serializers. This is done lazily so + that user code has a chance to (e.g.) set up custom settings without + needing to be careful of import order. 
+ """ + for format in BUILTIN_SERIALIZERS: + register_serializer(format, BUILTIN_SERIALIZERS[format]) + if hasattr(settings, "SERIALIZATION_MODULES"): + for format in settings.SERIALIZATION_MODULES: + register_serializer(format, settings.SERIALIZATION_MODULES[format]) \ No newline at end of file diff --git a/django/core/serializers/base.py b/django/core/serializers/base.py new file mode 100644 index 0000000000..5c84861326 --- /dev/null +++ b/django/core/serializers/base.py @@ -0,0 +1,159 @@ +""" +Module for abstract serializer/unserializer base classes. +""" + +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO +from django.db import models + +class SerializationError(Exception): + """Something bad happened during serialization.""" + pass + +class DeserializationError(Exception): + """Something bad happened during deserialization.""" + pass + +class Serializer(object): + """ + Abstract serializer base class. + """ + + def serialize(self, queryset, **options): + """ + Serialize a queryset. + """ + self.options = options + + self.stream = options.get("stream", StringIO()) + + self.start_serialization() + for obj in queryset: + self.start_object(obj) + for field in obj._meta.fields: + if field.rel is None: + self.handle_field(obj, field) + else: + self.handle_fk_field(obj, field) + for field in obj._meta.many_to_many: + self.handle_m2m_field(obj, field) + self.end_object(obj) + self.end_serialization() + return self.getvalue() + + def get_string_value(self, obj, field): + """ + Convert a field's value to a string. + """ + if isinstance(field, models.DateTimeField): + value = getattr(obj, field.name).strftime("%Y-%m-%d %H:%M:%S") + elif isinstance(field, models.FileField): + value = getattr(obj, "get_%s_url" % field.name, lambda: None)() + else: + value = field.flatten_data(follow=None, obj=obj).get(field.name, "") + return str(value) + + def start_serialization(self): + """ + Called when serializing of the queryset starts. 
+ """ + raise NotImplementedError + + def end_serialization(self): + """ + Called when serializing of the queryset ends. + """ + pass + + def start_object(self, obj): + """ + Called when serializing of an object starts. + """ + raise NotImplementedError + + def end_object(self, obj): + """ + Called when serializing of an object ends. + """ + pass + + def handle_field(self, obj, field): + """ + Called to handle each individual (non-relational) field on an object. + """ + raise NotImplementedError + + def handle_fk_field(self, obj, field): + """ + Called to handle a ForeignKey field. + """ + raise NotImplementedError + + def handle_m2m_field(self, obj, field): + """ + Called to handle a ManyToManyField. + """ + raise NotImplementedError + + def getvalue(self): + """ + Return the fully serialized queryset. + """ + return self.stream.getvalue() + +class Deserializer(object): + """ + Abstract base deserializer class. + """ + + def __init__(self, stream_or_string, **options): + """ + Init this serializer given a stream or a string + """ + self.options = options + if isinstance(stream_or_string, basestring): + self.stream = StringIO(stream_or_string) + else: + self.stream = stream_or_string + # hack to make sure that the models have all been loaded before + # deserialization starts (otherwise subclass calls to get_model() + # and friends might fail...) + models.get_apps() + + def __iter__(self): + return self + + def next(self): + """Iteration interface -- return the next item in the stream""" + raise NotImplementedError + +class DeserializedObject(object): + """ + A deserialized model. + + Basically a container for holding the pre-saved deserialized data along + with the many-to-many data saved with the object. + + Call ``save()`` to save the object (with the many-to-many data) to the + database; call ``save(save_m2m=False)`` to save just the object fields + (and not touch the many-to-many stuff.) 
+ """ + + def __init__(self, obj, m2m_data=None): + self.object = obj + self.m2m_data = m2m_data + + def __repr__(self): + return "<DeserializedObject: %s>" % str(self.object) + + def save(self, save_m2m=True): + self.object.save() + if self.m2m_data and save_m2m: + for accessor_name, object_list in self.m2m_data.items(): + setattr(self.object, accessor_name, object_list) + + # prevent a second (possibly accidental) call to save() from saving + # the m2m data twice. + self.m2m_data = None diff --git a/django/core/serializers/xml_serializer.py b/django/core/serializers/xml_serializer.py new file mode 100644 index 0000000000..ab8769f237 --- /dev/null +++ b/django/core/serializers/xml_serializer.py @@ -0,0 +1,218 @@ +""" +XML serializer. +""" + +from xml.dom import pulldom +from django.utils.xmlutils import SimplerXMLGenerator +from django.core.serializers import base +from django.db import models + +class Serializer(base.Serializer): + """ + Serializes a QuerySet to XML. + """ + + def start_serialization(self): + """ + Start serialization -- open the XML document and the root element. + """ + self.xml = SimplerXMLGenerator(self.stream, self.options.get("encoding", "utf-8")) + self.xml.startDocument() + self.xml.startElement("django-objects", {"version" : "1.0"}) + + def end_serialization(self): + """ + End serialization -- end the document. + """ + self.xml.endElement("django-objects") + self.xml.endDocument() + + def start_object(self, obj): + """ + Called as each object is handled. + """ + if not hasattr(obj, "_meta"): + raise base.SerializationError("Non-model object (%s) encountered during serialization" % type(obj)) + + self.xml.startElement("object", { + "pk" : str(obj._get_pk_val()), + "model" : str(obj._meta), + }) + + def end_object(self, obj): + """ + Called after handling all fields for an object. 
+ """ + self.xml.endElement("object") + + def handle_field(self, obj, field): + """ + Called to handle each field on an object (except for ForeignKeys and + ManyToManyFields) + """ + self.xml.startElement("field", { + "name" : field.name, + "type" : field.get_internal_type() + }) + + # Get a "string version" of the object's data (this is handled by the + # serializer base class). None is handled specially. + value = self.get_string_value(obj, field) + if value is None: + self.xml.addQuickElement("None") + else: + self.xml.characters(str(value)) + + self.xml.endElement("field") + + def handle_fk_field(self, obj, field): + """ + Called to handle a ForeignKey (we need to treat them slightly + differently from regular fields). + """ + self._start_relational_field(field) + related = getattr(obj, field.name) + if related is not None: + self.xml.characters(str(related._get_pk_val())) + else: + self.xml.addQuickElement("None") + self.xml.endElement("field") + + def handle_m2m_field(self, obj, field): + """ + Called to handle a ManyToManyField. Related objects are only + serialized as references to the object's PK (i.e. the related *data* + is not dumped, just the relation). + """ + self._start_relational_field(field) + for relobj in getattr(obj, field.name).iterator(): + self.xml.addQuickElement("object", attrs={"pk" : str(relobj._get_pk_val())}) + self.xml.endElement("field") + + def _start_relational_field(self, field): + """ + Helper to output the <field> element for relational fields + """ + self.xml.startElement("field", { + "name" : field.name, + "rel" : field.rel.__class__.__name__, + "to" : str(field.rel.to._meta), + }) + +class Deserializer(base.Deserializer): + """ + Deserialize XML. 
+ """ + + def __init__(self, stream_or_string, **options): + super(Deserializer, self).__init__(stream_or_string, **options) + self.encoding = self.options.get("encoding", "utf-8") + self.event_stream = pulldom.parse(self.stream) + + def next(self): + for event, node in self.event_stream: + if event == "START_ELEMENT" and node.nodeName == "object": + self.event_stream.expandNode(node) + return self._handle_object(node) + raise StopIteration + + def _handle_object(self, node): + """ + Convert an <object> node to a DeserializedObject. + """ + # Look up the model using the model loading mechanism. If this fails, bail. + Model = self._get_model_from_node(node, "model") + + # Start building a data dictionary from the object. If the node is + # missing the pk attribute, bail. + pk = node.getAttribute("pk") + if not pk: + raise base.DeserializationError("<object> node is missing the 'pk' attribute") + data = {Model._meta.pk.name : pk} + + # Also start building a dict of m2m data (this is saved as + # {m2m_accessor_attribute : [list_of_related_objects]}) + m2m_data = {} + + # Deserialize each field. + for field_node in node.getElementsByTagName("field"): + # If the field is missing the name attribute, bail (are you + # sensing a pattern here?) + field_name = field_node.getAttribute("name") + if not field_name: + raise base.DeserializationError("<field> node is missing the 'name' attribute") + + # Get the field from the Model. This will raise a + # FieldDoesNotExist if, well, the field doesn't exist, which will + # be propagated correctly. + field = Model._meta.get_field(field_name) + + # As is usually the case, relation fields get the special treatment. 
+ if field.rel and isinstance(field.rel, models.ManyToManyRel): + m2m_data[field.name] = self._handle_m2m_field_node(field_node) + elif field.rel and isinstance(field.rel, models.ManyToOneRel): + data[field.name] = self._handle_fk_field_node(field_node) + else: + value = field.to_python(getInnerText(field_node).strip().encode(self.encoding)) + data[field.name] = value + + # Return a DeserializedObject so that the m2m data has a place to live. + return base.DeserializedObject(Model(**data), m2m_data) + + def _handle_fk_field_node(self, node): + """ + Handle a <field> node for a ForeignKey + """ + # Try to set the foreign key by looking up the foreign related object. + # If it doesn't exist, set the field to None (which might trigger + # validation error, but that's expected). + RelatedModel = self._get_model_from_node(node, "to") + return RelatedModel.objects.get(pk=getInnerText(node).strip().encode(self.encoding)) + + def _handle_m2m_field_node(self, node): + """ + Handle a <field> node for a ManyToManyField + """ + # Load the related model + RelatedModel = self._get_model_from_node(node, "to") + + # Look up all the related objects. Using the in_bulk() lookup ensures + # that missing related objects don't cause an exception + related_ids = [c.getAttribute("pk").encode(self.encoding) for c in node.getElementsByTagName("object")] + return RelatedModel._default_manager.in_bulk(related_ids).values() + + def _get_model_from_node(self, node, attr): + """ + Helper to look up a model from an <object> or a <field> node. 
+ """ + model_identifier = node.getAttribute(attr) + if not model_identifier: + raise base.DeserializationError( + "<%s> node is missing the required '%s' attribute" \ + % (node.nodeName, attr)) + try: + Model = models.get_model(*model_identifier.split(".")) + except TypeError: + Model = None + if Model is None: + raise base.DeserializationError( + "<%s> node has invalid model identifier: '%s'" % \ + (node.nodeName, model_identifier)) + return Model + + +def getInnerText(node): + """ + Get all the inner text of a DOM node (recursively). + """ + # inspired by http://mail.python.org/pipermail/xml-sig/2005-March/011022.html + inner_text = [] + for child in node.childNodes: + if child.nodeType == child.TEXT_NODE or child.nodeType == child.CDATA_SECTION_NODE: + inner_text.append(child.data) + elif child.nodeType == child.ELEMENT_NODE: + inner_text.extend(getInnerText(child)) + else: + pass + return "".join(inner_text) \ No newline at end of file diff --git a/docs/serialization.txt b/docs/serialization.txt new file mode 100644 index 0000000000..41954b7a0d --- /dev/null +++ b/docs/serialization.txt @@ -0,0 +1,85 @@ +========================== +Serializing Django objects +========================== + +.. note:: + + This API is currently under heavy development and may change -- + perhaps drastically -- in the future. + + You have been warned. + +Django's serialization framework provides a mechanism for "translating" Django +objects into other formats. Usually these other formats will be text-based and +used for sending Django objects over a wire, but it's possible for a +serializer to handle any format (text-based or not). 
+ +Serializing data +---------------- + +At the highest level, serializing data is a very simple operation:: + + from django.core import serializers + data = serializers.serialize("xml", SomeModel.objects.all()) + +The arguments to the ``serialize`` function are the format to serialize the +data to (see `Serialization formats`_) and a QuerySet_ to serialize. +(Actually, the second argument can be any iterator that yields Django objects, +but it'll almost always be a QuerySet). + +.. _QuerySet: ../db_api/#retrieving-objects + +You can also use a serializer object directly:: + + xml_serializer = serializers.get_serializer("xml")() + xml_serializer.serialize(queryset) + data = xml_serializer.getvalue() + +This is useful if you want to serialize data directly to a file-like object +(which includes an HTTPResponse_):: + + out = open("file.xml", "w") + xml_serializer.serialize(SomeModel.objects.all(), stream=out) + +.. _HTTPResponse: ../request_response/#httpresponse-objects + +Deserializing data +------------------ + +Deserializing data is also a fairly simple operation:: + + for obj in serializers.deserialize("xml", data): + do_something_with(obj) + +As you can see, the ``deserialize`` function takes the same format argument as +``serialize``, a string or stream of data, and returns an iterator. + +However, here it gets slightly complicated. The objects returned by the +``deserialize`` iterator *aren't* simple Django objects. Instead, they are +special ``DeserializedObject`` instances that wrap a created -- but unsaved -- +object and any associated relationship data. + +Calling ``DeserializedObject.save()`` saves the object to the database. + +This ensures that deserializing is a non-destructive operation even if the +data in your serialized representation doesn't match what's currently in the +database. 
Usually, working with these ``DeserializedObject`` instances looks +something like:: + + for deserialized_object in serializers.deserialize("xml", data): + if object_should_be_saved(deserialized_object): + deserialized_object.save() + +In other words, the usual use is to examine the deserialized objects to make +sure that they are "appropriate" for saving before doing so. Of course, if you trust your data source you could just save the object and move on. + +The Django object itself can be inspected as ``deserialized_object.object``. + +Serialization formats +--------------------- + +Django "ships" with a few included serializers, and there's a simple API for creating and registering your own... + +.. note:: + + ... which will be documented once the API is stable :) diff --git a/tests/modeltests/serializers/__init__.py b/tests/modeltests/serializers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/modeltests/serializers/models.py b/tests/modeltests/serializers/models.py new file mode 100644 index 0000000000..8c9483beba --- /dev/null +++ b/tests/modeltests/serializers/models.py @@ -0,0 +1,94 @@ +""" +XXX. Serialization + +``django.core.serializers`` provides interfaces for converting Django querysets +to and from "flat" data (i.e. strings). 
+""" + +from django.db import models + +class Category(models.Model): + name = models.CharField(maxlength=20) + + class Meta: + ordering = ('name',) + + def __str__(self): + return self.name + +class Author(models.Model): + name = models.CharField(maxlength=20) + + class Meta: + ordering = ('name',) + + def __str__(self): + return self.name + +class Article(models.Model): + author = models.ForeignKey(Author) + headline = models.CharField(maxlength=50) + pub_date = models.DateTimeField() + categories = models.ManyToManyField(Category) + + class Meta: + ordering = ('pub_date',) + + def __str__(self): + return self.headline + +API_TESTS = """ +# Create some data: +>>> from datetime import datetime +>>> sports = Category(name="Sports") +>>> music = Category(name="Music") +>>> op_ed = Category(name="Op-Ed") +>>> sports.save(); music.save(); op_ed.save() + +>>> joe = Author(name="Joe") +>>> jane = Author(name="Jane") +>>> joe.save(); jane.save() + +>>> a1 = Article( +... author = jane, +... headline = "Poker has no place on ESPN", +... pub_date = datetime(2006, 6, 16, 11, 00)) +>>> a2 = Article( +... author = joe, +... headline = "Time to reform copyright", +... pub_date = datetime(2006, 6, 16, 13, 00)) +>>> a1.save(); a2.save() +>>> a1.categories = [sports, op_ed] +>>> a2.categories = [music, op_ed] + +# Serialize a queryset to XML +>>> from django.core import serializers +>>> xml = serializers.serialize("xml", Article.objects.all()) + +# The output is valid XML +>>> from xml.dom import minidom +>>> dom = minidom.parseString(xml) + +# Deserializing has a similar interface, except that special DeserializedObject +# instances are returned. This is because data might have changed in the +# database since the data was serialized (we'll simulate that below). +>>> for obj in serializers.deserialize("xml", xml): +... 
print obj +<DeserializedObject: Poker has no place on ESPN> +<DeserializedObject: Time to reform copyright> + +# Deserializing data with different field values doesn't change anything in the +# database until we call save(): +>>> xml = xml.replace("Poker has no place on ESPN", "Poker has no place on television") +>>> objs = list(serializers.deserialize("xml", xml)) + +# Even though I deserialized, the database hasn't been touched +>>> Article.objects.all() +[<Article: Poker has no place on ESPN>, <Article: Time to reform copyright>] + +# But when I save, the data changes as you might expect. +>>> objs[0].save() +>>> Article.objects.all() +[<Article: Poker has no place on television>, <Article: Time to reform copyright>] + +"""