Fixed #7793 -- Handle sitemaps with more than 50,000 URLs in them (by using
pagination). Patch from Julian Bez.

The docs patch here could probably do with some rewording.


git-svn-id: http://code.djangoproject.com/svn/django/trunk@8088 bcc190cf-cafb-0310-a4f2-bffc1f526a37
This commit is contained in:
Malcolm Tredinnick 2008-07-26 05:07:16 +00:00
parent a26ba33111
commit badde8a7e5
4 changed files with 38 additions and 9 deletions

View File

@ -71,7 +71,7 @@ answer newbie questions, and generally made Django that much better:
Esdras Beleza <linux@esdrasbeleza.com> Esdras Beleza <linux@esdrasbeleza.com>
Chris Bennett <chrisrbennett@yahoo.com> Chris Bennett <chrisrbennett@yahoo.com>
James Bennett James Bennett
Ben Godfrey <http://aftnn.org> Julian Bez
Arvis Bickovskis <viestards.lists@gmail.com> Arvis Bickovskis <viestards.lists@gmail.com>
Paul Bissex <http://e-scribe.com/> Paul Bissex <http://e-scribe.com/>
Simon Blanchard Simon Blanchard
@ -166,6 +166,7 @@ answer newbie questions, and generally made Django that much better:
glin@seznam.cz glin@seznam.cz
martin.glueck@gmail.com martin.glueck@gmail.com
Artyom Gnilov <boobsd@gmail.com> Artyom Gnilov <boobsd@gmail.com>
Ben Godfrey <http://aftnn.org>
GomoX <gomo@datafull.com> GomoX <gomo@datafull.com>
Guilherme Mesquita Gondim <semente@taurinus.org> Guilherme Mesquita Gondim <semente@taurinus.org>
Mario Gonzalez <gonzalemario@gmail.com> Mario Gonzalez <gonzalemario@gmail.com>

View File

@ -1,4 +1,4 @@
from django.core import urlresolvers from django.core import urlresolvers, paginator
import urllib import urllib
PING_URL = "http://www.google.com/webmasters/tools/ping" PING_URL = "http://www.google.com/webmasters/tools/ping"
@ -34,6 +34,10 @@ def ping_google(sitemap_url=None, ping_url=PING_URL):
urllib.urlopen("%s?%s" % (ping_url, params)) urllib.urlopen("%s?%s" % (ping_url, params))
class Sitemap: class Sitemap:
# This limit is defined by Google. See the index documentation at
# http://sitemaps.org/protocol.php#index.
limit = 50000
def __get(self, name, obj, default=None): def __get(self, name, obj, default=None):
try: try:
attr = getattr(self, name) attr = getattr(self, name)
@ -49,11 +53,17 @@ class Sitemap:
def location(self, obj): def location(self, obj):
return obj.get_absolute_url() return obj.get_absolute_url()
def get_urls(self): def _get_paginator(self):
if not hasattr(self, "paginator"):
self.paginator = paginator.Paginator(self.items(), self.limit)
return self.paginator
paginator = property(_get_paginator)
def get_urls(self, page=1):
from django.contrib.sites.models import Site from django.contrib.sites.models import Site
current_site = Site.objects.get_current() current_site = Site.objects.get_current()
urls = [] urls = []
for item in self.items(): for item in self.paginator.page(page).object_list:
loc = "http://%s%s" % (current_site.domain, self.__get('location', item)) loc = "http://%s%s" % (current_site.domain, self.__get('location', item))
url_info = { url_info = {
'location': loc, 'location': loc,

View File

@ -3,14 +3,22 @@ from django.template import loader
from django.contrib.sites.models import Site from django.contrib.sites.models import Site
from django.core import urlresolvers from django.core import urlresolvers
from django.utils.encoding import smart_str from django.utils.encoding import smart_str
from django.core.paginator import EmptyPage, PageNotAnInteger
def index(request, sitemaps): def index(request, sitemaps):
current_site = Site.objects.get_current() current_site = Site.objects.get_current()
sites = [] sites = []
protocol = request.is_secure() and 'https' or 'http' protocol = request.is_secure() and 'https' or 'http'
for section in sitemaps.keys(): for section, site in sitemaps.items():
if callable(site):
pages = site().paginator.num_pages
else:
pages = site.paginator.num_pages
sitemap_url = urlresolvers.reverse('django.contrib.sitemaps.views.sitemap', kwargs={'section': section}) sitemap_url = urlresolvers.reverse('django.contrib.sitemaps.views.sitemap', kwargs={'section': section})
sites.append('%s://%s%s' % (protocol, current_site.domain, sitemap_url)) sites.append('%s://%s%s' % (protocol, current_site.domain, sitemap_url))
if pages > 1:
for page in range(2, pages+1):
sites.append('%s://%s%s?p=%s' % (protocol, current_site.domain, sitemap_url, page))
xml = loader.render_to_string('sitemap_index.xml', {'sitemaps': sites}) xml = loader.render_to_string('sitemap_index.xml', {'sitemaps': sites})
return HttpResponse(xml, mimetype='application/xml') return HttpResponse(xml, mimetype='application/xml')
@ -22,10 +30,16 @@ def sitemap(request, sitemaps, section=None):
maps.append(sitemaps[section]) maps.append(sitemaps[section])
else: else:
maps = sitemaps.values() maps = sitemaps.values()
page = request.GET.get("p", 1)
for site in maps: for site in maps:
try:
if callable(site): if callable(site):
urls.extend(site().get_urls()) urls.extend(site().get_urls(page))
else: else:
urls.extend(site.get_urls()) urls.extend(site.get_urls(page))
except EmptyPage:
raise Http404("Page %s empty" % page)
except PageNotAnInteger:
raise Http404("No page '%s'" % page)
xml = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urls})) xml = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urls}))
return HttpResponse(xml, mimetype='application/xml') return HttpResponse(xml, mimetype='application/xml')

View File

@ -282,6 +282,10 @@ This will automatically generate a ``sitemap.xml`` file that references
both ``sitemap-flatpages.xml`` and ``sitemap-blog.xml``. The ``Sitemap`` both ``sitemap-flatpages.xml`` and ``sitemap-blog.xml``. The ``Sitemap``
classes and the ``sitemaps`` dict don't change at all. classes and the ``sitemaps`` dict don't change at all.
If one of your sitemaps is going to have more than 50,000 URLs, you should
create an index file. Any such sitemap will be paginated automatically, and
the index will list each of its pages.
Pinging Google Pinging Google
============== ==============