From badde8a7e5090347feea0b39221dbdea428582b8 Mon Sep 17 00:00:00 2001 From: Malcolm Tredinnick Date: Sat, 26 Jul 2008 05:07:16 +0000 Subject: [PATCH] Fixed #7793 -- Handle sitemaps with more than 50,000 URLs in them (by using pagination). Patch from Julian Bez. The docs patch here could probably do with some rewording. git-svn-id: http://code.djangoproject.com/svn/django/trunk@8088 bcc190cf-cafb-0310-a4f2-bffc1f526a37 --- AUTHORS | 3 ++- django/contrib/sitemaps/__init__.py | 16 +++++++++++++--- django/contrib/sitemaps/views.py | 24 +++++++++++++++++++----- docs/sitemaps.txt | 4 ++++ 4 files changed, 38 insertions(+), 9 deletions(-) diff --git a/AUTHORS b/AUTHORS index 8e05c927d8..967c9f77cd 100644 --- a/AUTHORS +++ b/AUTHORS @@ -71,7 +71,7 @@ answer newbie questions, and generally made Django that much better: Esdras Beleza Chris Bennett James Bennett - Ben Godfrey + Julian Bez Arvis Bickovskis Paul Bissex Simon Blanchard @@ -166,6 +166,7 @@ answer newbie questions, and generally made Django that much better: glin@seznam.cz martin.glueck@gmail.com Artyom Gnilov + Ben Godfrey GomoX Guilherme Mesquita Gondim Mario Gonzalez diff --git a/django/contrib/sitemaps/__init__.py b/django/contrib/sitemaps/__init__.py index 30949837e4..13e667e142 100644 --- a/django/contrib/sitemaps/__init__.py +++ b/django/contrib/sitemaps/__init__.py @@ -1,4 +1,4 @@ -from django.core import urlresolvers +from django.core import urlresolvers, paginator import urllib PING_URL = "http://www.google.com/webmasters/tools/ping" @@ -34,6 +34,10 @@ def ping_google(sitemap_url=None, ping_url=PING_URL): urllib.urlopen("%s?%s" % (ping_url, params)) class Sitemap: + # This limit is defined by Google. See the index documentation at + # http://sitemaps.org/protocol.php#index. 
+ limit = 50000 + def __get(self, name, obj, default=None): try: attr = getattr(self, name) @@ -49,11 +53,17 @@ class Sitemap: def location(self, obj): return obj.get_absolute_url() - def get_urls(self): + def _get_paginator(self): + if not hasattr(self, "paginator"): + self.paginator = paginator.Paginator(self.items(), self.limit) + return self.paginator + paginator = property(_get_paginator) + + def get_urls(self, page=1): from django.contrib.sites.models import Site current_site = Site.objects.get_current() urls = [] - for item in self.items(): + for item in self.paginator.page(page).object_list: loc = "http://%s%s" % (current_site.domain, self.__get('location', item)) url_info = { 'location': loc, diff --git a/django/contrib/sitemaps/views.py b/django/contrib/sitemaps/views.py index 86ef1e3526..7a5fe38a08 100644 --- a/django/contrib/sitemaps/views.py +++ b/django/contrib/sitemaps/views.py @@ -3,14 +3,22 @@ from django.template import loader from django.contrib.sites.models import Site from django.core import urlresolvers from django.utils.encoding import smart_str +from django.core.paginator import EmptyPage, PageNotAnInteger def index(request, sitemaps): current_site = Site.objects.get_current() sites = [] protocol = request.is_secure() and 'https' or 'http' - for section in sitemaps.keys(): + for section, site in sitemaps.items(): + if callable(site): + pages = site().paginator.num_pages + else: + pages = site.paginator.num_pages sitemap_url = urlresolvers.reverse('django.contrib.sitemaps.views.sitemap', kwargs={'section': section}) sites.append('%s://%s%s' % (protocol, current_site.domain, sitemap_url)) + if pages > 1: + for page in range(2, pages+1): + sites.append('%s://%s%s?p=%s' % (protocol, current_site.domain, sitemap_url, page)) xml = loader.render_to_string('sitemap_index.xml', {'sitemaps': sites}) return HttpResponse(xml, mimetype='application/xml') @@ -22,10 +30,16 @@ def sitemap(request, sitemaps, section=None): maps.append(sitemaps[section]) 
else: maps = sitemaps.values() + page = request.GET.get("p", 1) for site in maps: - if callable(site): - urls.extend(site().get_urls()) - else: - urls.extend(site.get_urls()) + try: + if callable(site): + urls.extend(site().get_urls(page)) + else: + urls.extend(site.get_urls(page)) + except EmptyPage: + raise Http404("Page %s empty" % page) + except PageNotAnInteger: + raise Http404("No page '%s'" % page) xml = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urls})) return HttpResponse(xml, mimetype='application/xml') diff --git a/docs/sitemaps.txt b/docs/sitemaps.txt index 6a16e61879..3e7411c168 100644 --- a/docs/sitemaps.txt +++ b/docs/sitemaps.txt @@ -282,6 +282,10 @@ This will automatically generate a ``sitemap.xml`` file that references both ``sitemap-flatpages.xml`` and ``sitemap-blog.xml``. The ``Sitemap`` classes and the ``sitemaps`` dict don't change at all. +If one of your sitemaps has more than 50,000 URLs, you should create an +index file. The sitemap is then paginated automatically, and the index +lists every page of it. + Pinging Google ==============