Fixed #7793 -- Handle sitemaps with more than 50,000 URLs in them (by using pagination). Patch from Julian Bez.

The docs patch here could probably do with some rewording.


git-svn-id: http://code.djangoproject.com/svn/django/trunk@8088 bcc190cf-cafb-0310-a4f2-bffc1f526a37
This commit is contained in:
Malcolm Tredinnick 2008-07-26 05:07:16 +00:00
parent a26ba33111
commit badde8a7e5
4 changed files with 38 additions and 9 deletions

View File

@ -71,7 +71,7 @@ answer newbie questions, and generally made Django that much better:
Esdras Beleza <linux@esdrasbeleza.com>
Chris Bennett <chrisrbennett@yahoo.com>
James Bennett
Ben Godfrey <http://aftnn.org>
Julian Bez
Arvis Bickovskis <viestards.lists@gmail.com>
Paul Bissex <http://e-scribe.com/>
Simon Blanchard
@ -166,6 +166,7 @@ answer newbie questions, and generally made Django that much better:
glin@seznam.cz
martin.glueck@gmail.com
Artyom Gnilov <boobsd@gmail.com>
Ben Godfrey <http://aftnn.org>
GomoX <gomo@datafull.com>
Guilherme Mesquita Gondim <semente@taurinus.org>
Mario Gonzalez <gonzalemario@gmail.com>

View File

@ -1,4 +1,4 @@
from django.core import urlresolvers
from django.core import urlresolvers, paginator
import urllib
PING_URL = "http://www.google.com/webmasters/tools/ping"
@ -34,6 +34,10 @@ def ping_google(sitemap_url=None, ping_url=PING_URL):
urllib.urlopen("%s?%s" % (ping_url, params))
class Sitemap:
# This limit is defined by Google. See the index documentation at
# http://sitemaps.org/protocol.php#index.
limit = 50000
def __get(self, name, obj, default=None):
try:
attr = getattr(self, name)
@ -49,11 +53,17 @@ class Sitemap:
def location(self, obj):
return obj.get_absolute_url()
def get_urls(self):
def _get_paginator(self):
if not hasattr(self, "paginator"):
self.paginator = paginator.Paginator(self.items(), self.limit)
return self.paginator
paginator = property(_get_paginator)
def get_urls(self, page=1):
from django.contrib.sites.models import Site
current_site = Site.objects.get_current()
urls = []
for item in self.items():
for item in self.paginator.page(page).object_list:
loc = "http://%s%s" % (current_site.domain, self.__get('location', item))
url_info = {
'location': loc,

View File

@ -3,14 +3,22 @@ from django.template import loader
from django.contrib.sites.models import Site
from django.core import urlresolvers
from django.utils.encoding import smart_str
from django.core.paginator import EmptyPage, PageNotAnInteger
def index(request, sitemaps):
current_site = Site.objects.get_current()
sites = []
protocol = request.is_secure() and 'https' or 'http'
for section in sitemaps.keys():
for section, site in sitemaps.items():
if callable(site):
pages = site().paginator.num_pages
else:
pages = site.paginator.num_pages
sitemap_url = urlresolvers.reverse('django.contrib.sitemaps.views.sitemap', kwargs={'section': section})
sites.append('%s://%s%s' % (protocol, current_site.domain, sitemap_url))
if pages > 1:
for page in range(2, pages+1):
sites.append('%s://%s%s?p=%s' % (protocol, current_site.domain, sitemap_url, page))
xml = loader.render_to_string('sitemap_index.xml', {'sitemaps': sites})
return HttpResponse(xml, mimetype='application/xml')
@ -22,10 +30,16 @@ def sitemap(request, sitemaps, section=None):
maps.append(sitemaps[section])
else:
maps = sitemaps.values()
page = request.GET.get("p", 1)
for site in maps:
try:
if callable(site):
urls.extend(site().get_urls())
urls.extend(site().get_urls(page))
else:
urls.extend(site.get_urls())
urls.extend(site.get_urls(page))
except EmptyPage:
raise Http404("Page %s empty" % page)
except PageNotAnInteger:
raise Http404("No page '%s'" % page)
xml = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urls}))
return HttpResponse(xml, mimetype='application/xml')

View File

@ -282,6 +282,10 @@ This will automatically generate a ``sitemap.xml`` file that references
both ``sitemap-flatpages.xml`` and ``sitemap-blog.xml``. The ``Sitemap``
classes and the ``sitemaps`` dict don't change at all.
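If one of your sitemaps has more than 50,000 URLs, you should create an index
file. That sitemap is then paginated automatically into pages of at most
50,000 URLs each, and the index lists every page (the second and later pages
appear with a ``?p=N`` query string, for example ``sitemap-blog.xml?p=2``).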
Pinging Google
==============