2005-07-13 09:25:57 +08:00
|
|
|
# Performance note: I benchmarked this code using a set instead of
# a list for the stopwords and was surprised to find that the list
# performed /better/ than the set - maybe because it's only a small
# list.
|
|
|
|
|
|
|
|
# Lowercase words that strip_stopwords() drops from a sentence.
# Kept as a list, one word per line, split on whitespace at import time.
stopwords = '''
i
a
an
are
as
at
be
by
for
from
how
in
is
it
of
on
or
that
the
this
to
was
what
when
where
'''.split()
|
|
|
|
|
|
|
|
def strip_stopwords(sentence):
    """Remove stopwords from ``sentence`` and normalize its whitespace.

    The sentence is split on runs of whitespace; every word whose
    lowercased form appears in the module-level ``stopwords`` list is
    dropped, and the remaining words are re-joined with single spaces.
    The original casing of the kept words is preserved.

    Returns the filtered sentence as a string (empty if every word was a
    stopword or the input contained no words).
    """
    # Compare case-insensitively, but keep each surviving word as written.
    kept_words = [
        word for word in sentence.split()
        if word.lower() not in stopwords
    ]
    return u' '.join(kept_words)
|
2005-07-13 09:25:57 +08:00
|
|
|
|