Fixed #24366 -- Optimized traversal of large migration dependency graphs.

Switched from an adjancency list and uncached, iterative depth-first
search to a Node-based design with direct parent/child links and a
cached, recursive depth-first search. With this change, calculating
a migration plan for a large graph takes several seconds instead of
several hours.

Marked test `migrations.test_graph.GraphTests.test_dfs` as an expected
failure due to reaching the maximum recursion depth.
This commit is contained in:
Marten Kenbeek 2015-02-20 05:43:45 +01:00 committed by Markus Holtermann
parent 7fa7dd48c4
commit 78d43a5e10
3 changed files with 112 additions and 33 deletions

View File

@ -47,7 +47,7 @@ class MigrationExecutor(object):
# child(ren) in the same app, and no further.
next_in_app = sorted(
n for n in
self.loader.graph.dependents.get(target, set())
self.loader.graph.node_map[target].children
if n[0] == target[0]
)
for node in next_in_app:

View File

@ -5,6 +5,68 @@ from collections import deque
from django.db.migrations.state import ProjectState
from django.utils.datastructures import OrderedSet
from django.utils.encoding import python_2_unicode_compatible
from django.utils.functional import total_ordering
@python_2_unicode_compatible
@total_ordering
class Node(object):
"""
A single node in the migration graph. Contains direct links to adjacent
nodes in either direction.
"""
def __init__(self, key):
self.key = key
self.children = set()
self.parents = set()
def __eq__(self, other):
return self.key == other
def __lt__(self, other):
return self.key < other
def __hash__(self):
return hash(self.key)
def __getitem__(self, item):
return self.key[item]
def __str__(self):
return str(self.key)
def __repr__(self):
return '<Node: (%r, %r)>' % self.key
def add_child(self, child):
self.children.add(child)
def add_parent(self, parent):
self.parents.add(parent)
# Use manual caching, @cached_property effectively doubles the
# recursion depth for each recursion.
def ancestors(self):
# Use self.key instead of self to speed up the frequent hashing
# when constructing an OrderedSet.
if '_ancestors' not in self.__dict__:
ancestors = deque([self.key])
for parent in sorted(self.parents):
ancestors.extendleft(reversed(parent.ancestors()))
self.__dict__['_ancestors'] = list(OrderedSet(ancestors))
return self.__dict__['_ancestors']
# Use manual caching, @cached_property effectively doubles the
# recursion depth for each recursion.
def descendants(self):
# Use self.key instead of self to speed up the frequent hashing
# when constructing an OrderedSet.
if '_descendants' not in self.__dict__:
descendants = deque([self.key])
for child in sorted(self.children):
descendants.extendleft(reversed(child.descendants()))
self.__dict__['_descendants'] = list(OrderedSet(descendants))
return self.__dict__['_descendants']
@python_2_unicode_compatible
@ -32,12 +94,15 @@ class MigrationGraph(object):
"""
def __init__(self):
self.node_map = {}
self.nodes = {}
self.dependencies = {}
self.dependents = {}
self.cached = False
def add_node(self, node, implementation):
self.nodes[node] = implementation
def add_node(self, key, implementation):
node = Node(key)
self.node_map[key] = node
self.nodes[key] = implementation
self.clear_cache()
def add_dependency(self, migration, child, parent):
if child not in self.nodes:
@ -50,8 +115,16 @@ class MigrationGraph(object):
"Migration %s dependencies reference nonexistent parent node %r" % (migration, parent),
parent
)
self.dependencies.setdefault(child, set()).add(parent)
self.dependents.setdefault(parent, set()).add(child)
self.node_map[child].add_parent(self.node_map[parent])
self.node_map[parent].add_child(self.node_map[child])
self.clear_cache()
def clear_cache(self):
if self.cached:
for node in self.nodes:
self.node_map[node].__dict__.pop('_ancestors', None)
self.node_map[node].__dict__.pop('_descendants', None)
self.cached = False
def forwards_plan(self, node):
"""
@ -62,7 +135,10 @@ class MigrationGraph(object):
"""
if node not in self.nodes:
raise NodeNotFoundError("Node %r not a valid node" % (node, ), node)
return self.dfs(node, lambda x: self.dependencies.get(x, set()))
# Use parent.key instead of parent to speed up the frequent hashing in ensure_not_cyclic
self.ensure_not_cyclic(node, lambda x: (parent.key for parent in self.node_map[x].parents))
self.cached = True
return self.node_map[node].ancestors()
def backwards_plan(self, node):
"""
@ -73,7 +149,10 @@ class MigrationGraph(object):
"""
if node not in self.nodes:
raise NodeNotFoundError("Node %r not a valid node" % (node, ), node)
return self.dfs(node, lambda x: self.dependents.get(x, set()))
# Use child.key instead of child to speed up the frequent hashing in ensure_not_cyclic
self.ensure_not_cyclic(node, lambda x: (child.key for child in self.node_map[x].children))
self.cached = True
return self.node_map[node].descendants()
def root_nodes(self, app=None):
"""
@ -82,7 +161,7 @@ class MigrationGraph(object):
"""
roots = set()
for node in self.nodes:
if (not any(key[0] == node[0] for key in self.dependencies.get(node, set()))
if (not any(key[0] == node[0] for key in self.node_map[node].parents)
and (not app or app == node[0])):
roots.add(node)
return sorted(roots)
@ -97,7 +176,7 @@ class MigrationGraph(object):
"""
leaves = set()
for node in self.nodes:
if (not any(key[0] == node[0] for key in self.dependents.get(node, set()))
if (not any(key[0] == node[0] for key in self.node_map[node].children)
and (not app or app == node[0])):
leaves.add(node)
return sorted(leaves)
@ -105,7 +184,7 @@ class MigrationGraph(object):
def ensure_not_cyclic(self, start, get_children):
# Algo from GvR:
# http://neopythonic.blogspot.co.uk/2009/01/detecting-cycles-in-directed-graph.html
todo = set(self.nodes.keys())
todo = set(self.nodes)
while todo:
node = todo.pop()
stack = [node]
@ -122,28 +201,10 @@ class MigrationGraph(object):
else:
node = stack.pop()
def dfs(self, start, get_children):
"""
Iterative depth first search, for finding dependencies.
"""
self.ensure_not_cyclic(start, get_children)
visited = deque()
visited.append(start)
stack = deque(sorted(get_children(start)))
while stack:
node = stack.popleft()
visited.appendleft(node)
children = sorted(get_children(node), reverse=True)
# reverse sorting is needed because prepending using deque.extendleft
# also effectively reverses values
stack.extendleft(children)
return list(OrderedSet(visited))
def __str__(self):
return "Graph: %s nodes, %s edges" % (
len(self.nodes),
sum(len(x) for x in self.dependencies.values()),
sum(len(node.parents) for node in self.node_map.values()),
)
def make_state(self, nodes=None, at_end=True, real_apps=None):

View File

@ -1,3 +1,5 @@
from unittest import expectedFailure
from django.db.migrations.graph import (
CircularDependencyError, MigrationGraph, NodeNotFoundError,
)
@ -151,7 +153,23 @@ class GraphTests(TestCase):
graph.forwards_plan, ('C', '0001')
)
def test_dfs(self):
def test_deep_graph(self):
graph = MigrationGraph()
root = ("app_a", "1")
graph.add_node(root, None)
expected = [root]
for i in range(2, 750):
parent = ("app_a", str(i - 1))
child = ("app_a", str(i))
graph.add_node(child, None)
graph.add_dependency(str(i), child, parent)
expected.append(child)
actual = graph.node_map[root].descendants()
self.assertEqual(expected[::-1], actual)
@expectedFailure
def test_recursion_depth(self):
graph = MigrationGraph()
root = ("app_a", "1")
graph.add_node(root, None)
@ -163,7 +181,7 @@ class GraphTests(TestCase):
graph.add_dependency(str(i), child, parent)
expected.append(child)
actual = graph.dfs(root, lambda x: graph.dependents.get(x, set()))
actual = graph.node_map[root].descendants()
self.assertEqual(expected[::-1], actual)
def test_plan_invalid_node(self):