diff --git a/django/db/backends/__init__.py b/django/db/backends/__init__.py
index 2da1f2be0f..3075bdb3e4 100644
--- a/django/db/backends/__init__.py
+++ b/django/db/backends/__init__.py
@@ -475,6 +475,14 @@ class BaseDatabaseOperations(object):
         """
         return None

+    def bulk_batch_size(self, fields, objs):
+        """
+        Returns the maximum allowed batch size for the backend. The fields
+        are the fields that will be inserted in the batch, and objs contains
+        all the objects to be inserted.
+        """
+        return len(objs)
+
     def cache_key_culling_sql(self):
         """
         Returns a SQL query that retrieves the first cache key greater than the
@@ -522,6 +530,17 @@ class BaseDatabaseOperations(object):
         """
         return ''

+    def distinct_sql(self, fields):
+        """
+        Returns an SQL DISTINCT clause which removes duplicate rows from the
+        result set. If any fields are given, only the given fields are being
+        checked for duplicates.
+        """
+        if fields:
+            raise NotImplementedError('DISTINCT ON fields is not supported by this database backend')
+        else:
+            return 'DISTINCT'
+
     def drop_foreignkey_sql(self):
         """
         Returns the SQL command that drops a foreign key.
@@ -577,17 +596,6 @@ class BaseDatabaseOperations(object):
         """
         raise NotImplementedError('Full-text search is not implemented for this database backend')

-    def distinct_sql(self, fields):
-        """
-        Returns an SQL DISTINCT clause which removes duplicate rows from the
-        result set. If any fields are given, only the given fields are being
-        checked for duplicates.
-        """
-        if fields:
-            raise NotImplementedError('DISTINCT ON fields is not supported by this database backend')
-        else:
-            return 'DISTINCT'
-
     def last_executed_query(self, cursor, sql, params):
         """
         Returns a string of the query last executed by the given cursor, with
diff --git a/django/db/backends/sqlite3/base.py b/django/db/backends/sqlite3/base.py
index d7f51605d5..60723124c1 100644
--- a/django/db/backends/sqlite3/base.py
+++ b/django/db/backends/sqlite3/base.py
@@ -85,7 +85,7 @@ class DatabaseFeatures(BaseDatabaseFeatures):
     supports_1000_query_parameters = False
     supports_mixed_date_datetime_comparisons = False
     has_bulk_insert = True
-    can_combine_inserts_with_and_without_auto_increment_pk = True
+    can_combine_inserts_with_and_without_auto_increment_pk = False

     @cached_property
     def supports_stddev(self):
@@ -107,6 +107,13 @@ class DatabaseFeatures(BaseDatabaseFeatures):
         return has_support

 class DatabaseOperations(BaseDatabaseOperations):
+    def bulk_batch_size(self, fields, objs):
+        """
+        SQLite has a compile-time default (SQLITE_LIMIT_VARIABLE_NUMBER) of
+        999 variables per query.
+        """
+        return (999 // len(fields)) if len(fields) > 0 else len(objs)
+
     def date_extract_sql(self, lookup_type, field_name):
         # sqlite doesn't support extract, so we fake it with the user-defined
         # function django_extract that's registered in connect(). Note that
diff --git a/django/db/models/query.py b/django/db/models/query.py
index 0f1d87c642..ebe61a1226 100644
--- a/django/db/models/query.py
+++ b/django/db/models/query.py
@@ -388,7 +388,7 @@ class QuerySet(object):
             obj.save(force_insert=True, using=self.db)
         return obj

-    def bulk_create(self, objs):
+    def bulk_create(self, objs, batch_size=None):
         """
         Inserts each of the instances into the database. This does *not* call
         save() on each of the instances, does not send any pre/post save
@@ -401,8 +401,10 @@
         # this could be implemented if you didn't have an autoincrement pk,
         # and 2) you could do it by doing O(n) normal inserts into the parent
         # tables to get the primary keys back, and then doing a single bulk
-        # insert into the childmost table. We're punting on these for now
-        # because they are relatively rare cases.
+        # insert into the childmost table. Some databases might allow doing
+        # this by using a RETURNING clause for the insert query. We're
+        # punting on these for now because they are relatively rare cases.
+        assert batch_size is None or batch_size > 0
         if self.model._meta.parents:
             raise ValueError("Can't bulk create an inherited model")
         if not objs:
@@ -418,13 +420,14 @@
         try:
             if (connection.features.can_combine_inserts_with_and_without_auto_increment_pk
                 and self.model._meta.has_auto_field):
-                self.model._base_manager._insert(objs, fields=fields, using=self.db)
+                self._batched_insert(objs, fields, batch_size)
             else:
                 objs_with_pk, objs_without_pk = partition(lambda o: o.pk is None, objs)
                 if objs_with_pk:
-                    self.model._base_manager._insert(objs_with_pk, fields=fields, using=self.db)
+                    self._batched_insert(objs_with_pk, fields, batch_size)
                 if objs_without_pk:
-                    self.model._base_manager._insert(objs_without_pk, fields=[f for f in fields if not isinstance(f, AutoField)], using=self.db)
+                    fields = [f for f in fields if not isinstance(f, AutoField)]
+                    self._batched_insert(objs_without_pk, fields, batch_size)
             if forced_managed:
                 transaction.commit(using=self.db)
             else:
@@ -860,6 +863,20 @@
     ###################
     # PRIVATE METHODS #
     ###################
+    def _batched_insert(self, objs, fields, batch_size):
+        """
+        A helper method for bulk_create() to insert objs one batch at a
+        time. Splits objs into batches of at most batch_size objects and
+        inserts each batch with a single _insert() call.
+        """
+        if not objs:
+            return
+        ops = connections[self.db].ops
+        batch_size = (batch_size or max(ops.bulk_batch_size(fields, objs), 1))
+        for batch in [objs[i:i+batch_size]
+                      for i in range(0, len(objs), batch_size)]:
+            self.model._base_manager._insert(batch, fields=fields,
+                                             using=self.db)

     def _clone(self, klass=None, setup=False, **kwargs):
         if klass is None:
diff --git a/docs/ref/models/querysets.txt b/docs/ref/models/querysets.txt
index 0a9005ad26..8c188c67c3 100644
--- a/docs/ref/models/querysets.txt
+++ b/docs/ref/models/querysets.txt
@@ -1350,7 +1350,7 @@ has a side effect on your data. For more, see `Safe methods`_ in the HTTP spec.
 bulk_create
 ~~~~~~~~~~~

-.. method:: bulk_create(objs)
+.. method:: bulk_create(objs, batch_size=None)

 .. versionadded:: 1.4

@@ -1372,20 +1372,12 @@ This has a number of caveats though:
 * If the model's primary key is an :class:`~django.db.models.AutoField` it
   does not retrieve and set the primary key attribute, as ``save()`` does.

-.. admonition:: Limits of SQLite
+The ``batch_size`` parameter controls how many objects are created in a
+single query. The default is to create all objects in one batch, except for
+SQLite, where the default is such that at most 999 variables per query are used.

-    SQLite sets a limit on the number of parameters per SQL statement. The
-    maximum is defined by the SQLITE_MAX_VARIABLE_NUMBER_ compilation option,
-    which defaults to 999. For instance, if your model has 8 fields (including
-    the primary key), you cannot create more than 999 // 8 = 124 instances at
-    a time. If you exceed this limit, you'll get an exception::
-
-        django.db.utils.DatabaseError: too many SQL variables
-
-    If your application's performance requirements exceed SQLite's limits, you
-    should switch to another database engine, such as PostgreSQL.
-
-.. _SQLITE_MAX_VARIABLE_NUMBER: http://sqlite.org/limits.html#max_variable_number
+.. versionadded:: 1.5
+    The ``batch_size`` parameter was added in version 1.5.

 count
 ~~~~~
diff --git a/docs/releases/1.5.txt b/docs/releases/1.5.txt
index b863d102ec..fd9ae4f038 100644
--- a/docs/releases/1.5.txt
+++ b/docs/releases/1.5.txt
@@ -106,6 +106,11 @@ Django 1.5 also includes several smaller improvements worth noting:
 * The :ref:`receiver ` decorator is now able to connect to more than one
   signal by supplying a list of signals.

+* :meth:`QuerySet.bulk_create()
+  <django.db.models.query.QuerySet.bulk_create>` now has a ``batch_size``
+  argument. By default the batch size is unlimited, except for SQLite, where
+  a single batch is limited so that 999 parameters per query are not exceeded.
+
 Backwards incompatible changes in 1.5
 =====================================
diff --git a/tests/regressiontests/bulk_create/models.py b/tests/regressiontests/bulk_create/models.py
index a4c611d537..bc685bbbe4 100644
--- a/tests/regressiontests/bulk_create/models.py
+++ b/tests/regressiontests/bulk_create/models.py
@@ -18,4 +18,8 @@ class Pizzeria(Restaurant):
     pass

 class State(models.Model):
-    two_letter_code = models.CharField(max_length=2, primary_key=True)
\ No newline at end of file
+    two_letter_code = models.CharField(max_length=2, primary_key=True)
+
+class TwoFields(models.Model):
+    f1 = models.IntegerField(unique=True)
+    f2 = models.IntegerField(unique=True)
diff --git a/tests/regressiontests/bulk_create/tests.py b/tests/regressiontests/bulk_create/tests.py
index 0b55f637a4..33108ea9b0 100644
--- a/tests/regressiontests/bulk_create/tests.py
+++ b/tests/regressiontests/bulk_create/tests.py
@@ -2,9 +2,11 @@ from __future__ import absolute_import

 from operator import attrgetter

-from django.test import TestCase, skipIfDBFeature, skipUnlessDBFeature
+from django.db import connection
+from django.test import TestCase, skipIfDBFeature
+from django.test.utils import override_settings

-from .models import Country, Restaurant, Pizzeria, State
+from .models import Country, Restaurant, Pizzeria, State, TwoFields


 class BulkCreateTests(TestCase):
@@ -27,7 +29,6 @@ class BulkCreateTests(TestCase):
         self.assertEqual(created, [])
         self.assertEqual(Country.objects.count(), 4)

-    @skipUnlessDBFeature("has_bulk_insert")
     def test_efficiency(self):
         with self.assertNumQueries(1):
             Country.objects.bulk_create(self.data)
@@ -69,3 +70,42 @@ class BulkCreateTests(TestCase):
         invalid_country = Country(id=0, name='Poland', iso_two_letter='PL')
         with self.assertRaises(ValueError):
             Country.objects.bulk_create([valid_country, invalid_country])
+
+    def test_large_batch(self):
+        with override_settings(DEBUG=True):
+            connection.queries = []
+            TwoFields.objects.bulk_create([
+                TwoFields(f1=i, f2=i+1) for i in range(0, 1001)
+            ])
+        self.assertTrue(len(connection.queries) < 10)
+        self.assertEqual(TwoFields.objects.count(), 1001)
+        self.assertEqual(
+            TwoFields.objects.filter(f1__gte=450, f1__lte=550).count(),
+            101)
+        self.assertEqual(TwoFields.objects.filter(f2__gte=901).count(), 101)
+
+    def test_large_batch_mixed(self):
+        """
+        Test inserting a large batch with objects having primary key set
+        mixed together with objects without PK set.
+        """
+        with override_settings(DEBUG=True):
+            connection.queries = []
+            TwoFields.objects.bulk_create([
+                TwoFields(id=i if i % 2 == 0 else None, f1=i, f2=i+1)
+                for i in range(100000, 101000)])
+        self.assertTrue(len(connection.queries) < 10)
+        self.assertEqual(TwoFields.objects.count(), 1000)
+        # We can't assume much about the IDs created, except that the above
+        # created IDs must exist.
+        id_range = range(100000, 101000, 2)
+        self.assertEqual(TwoFields.objects.filter(id__in=id_range).count(), 500)
+        self.assertEqual(TwoFields.objects.exclude(id__in=id_range).count(), 500)
+
+    def test_explicit_batch_size(self):
+        objs = [TwoFields(f1=i, f2=i) for i in range(0, 100)]
+        with self.assertNumQueries(2):
+            TwoFields.objects.bulk_create(objs, 50)
+        TwoFields.objects.all().delete()
+        with self.assertNumQueries(1):
+            TwoFields.objects.bulk_create(objs, len(objs))
diff --git a/tests/regressiontests/queries/tests.py b/tests/regressiontests/queries/tests.py
index 9bb3a29430..1582993dfc 100644
--- a/tests/regressiontests/queries/tests.py
+++ b/tests/regressiontests/queries/tests.py
@@ -1879,8 +1879,7 @@ class ConditionalTests(BaseQuerysetTest):
         # Test that the "in" lookup works with lists of 1000 items or more.
         Number.objects.all().delete()
         numbers = range(2500)
-        for num in numbers:
-            _ = Number.objects.create(num=num)
+        Number.objects.bulk_create(Number(num=num) for num in numbers)
         self.assertEqual(
             Number.objects.filter(num__in=numbers[:1000]).count(),
             1000
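
A minimal usage sketch of the new ``batch_size`` argument (not part of the patch), reusing the ``TwoFields`` model that this patch adds to the bulk_create regression tests. The import path and the exact query counts are assumptions: they hold for the test-suite layout above and for a backend with ``has_bulk_insert`` support::

    # Illustrative only. TwoFields is the test model added above: two unique
    # IntegerFields plus the automatic id column.
    from regressiontests.bulk_create.models import TwoFields

    objs = [TwoFields(f1=i, f2=i + 1000) for i in range(300)]

    # batch_size omitted: the backend's bulk_batch_size() picks the batch size.
    # Most backends insert all 300 rows in a single query; SQLite splits the
    # batch so that no more than 999 parameters are used per INSERT.
    TwoFields.objects.bulk_create(objs)

    TwoFields.objects.all().delete()

    # Explicit batch_size: at most 100 rows per INSERT, i.e. three queries
    # here on a backend that supports bulk inserts.
    TwoFields.objects.bulk_create(objs, batch_size=100)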