Fixed #8149 -- Made File.__iter__() support universal newlines.

The following are recognized as ending a line: the Unix end-of-line
convention '\n', the Windows convention '\r\n', and the old
Macintosh convention '\r'.

http://www.python.org/dev/peps/pep-0278

Thanks tchaumeny for review.
This commit is contained in:
Jon Dufresne 2014-09-29 18:24:33 -07:00 committed by Tim Graham
parent eab3dc195e
commit eb4f6de980
5 changed files with 108 additions and 12 deletions

View File

@ -102,16 +102,22 @@ class File(FileProxyMixin):
# Iterate over this file-like object by newlines # Iterate over this file-like object by newlines
buffer_ = None buffer_ = None
for chunk in self.chunks(): for chunk in self.chunks():
chunk_buffer = BytesIO(chunk) for line in chunk.splitlines(True):
for line in chunk_buffer:
if buffer_: if buffer_:
line = buffer_ + line if endswith_cr(buffer_) and not equals_lf(line):
# Line split after a \r newline; yield buffer_.
yield buffer_
# Continue with line.
else:
# Line either split without a newline (line
# continues after buffer_) or with \r\n
# newline (line == b'\n').
line = buffer_ + line
# buffer_ handled, clear it.
buffer_ = None buffer_ = None
# If this is the end of a line, yield # If this is the end of a \n or \r\n line, yield.
# otherwise, wait for the next round if endswith_lf(line):
if line[-1:] in (b'\n', b'\r'):
yield line yield line
else: else:
buffer_ = line buffer_ = line
@ -165,3 +171,24 @@ class ContentFile(File):
def close(self): def close(self):
pass pass
def endswith_cr(line):
    r"""
    Return True if line (a text or byte string) ends with '\r'.

    The suffix that is looked for is built in the same string type as
    ``line``: a text suffix for text input, a byte suffix otherwise.
    """
    # The docstring is raw so that the escape stays readable instead of
    # turning into a literal carriage return inside __doc__.
    if isinstance(line, six.text_type):
        suffix = '\r'
    else:
        suffix = b'\r'
    return line.endswith(suffix)
def endswith_lf(line):
    r"""
    Return True if line (a text or byte string) ends with '\n'.

    The suffix that is looked for is built in the same string type as
    ``line``: a text suffix for text input, a byte suffix otherwise.
    """
    # The docstring is raw so that the escape stays readable instead of
    # turning into a literal line break inside __doc__.
    if isinstance(line, six.text_type):
        suffix = '\n'
    else:
        suffix = b'\n'
    return line.endswith(suffix)
def equals_lf(line):
    r"""
    Return True if line (a text or byte string) equals '\n'.

    The value compared against is built in the same string type as
    ``line``: a text newline for text input, a byte newline otherwise.
    """
    # The docstring is raw so that the escape stays readable instead of
    # turning into a literal line break inside __doc__.
    if isinstance(line, six.text_type):
        newline = '\n'
    else:
        newline = b'\n'
    return line == newline

View File

@ -53,6 +53,15 @@ The ``File`` Class
Iterate over the file yielding one line at a time. Iterate over the file yielding one line at a time.
.. versionchanged:: 1.8
``File`` now uses `universal newlines`_. The following are
recognized as ending a line: the Unix end-of-line convention
``'\n'``, the Windows convention ``'\r\n'``, and the old Macintosh
convention ``'\r'``.
.. _universal newlines: http://www.python.org/dev/peps/pep-0278
.. method:: chunks([chunk_size=None]) .. method:: chunks([chunk_size=None])
Iterate over the file yielding "chunks" of a given size. ``chunk_size`` Iterate over the file yielding "chunks" of a given size. ``chunk_size``

View File

@ -82,10 +82,15 @@ Here are some useful attributes of ``UploadedFile``:
for line in uploadedfile: for line in uploadedfile:
do_something_with(line) do_something_with(line)
However, *unlike* standard Python files, :class:`UploadedFile` only Lines are split using `universal newlines`_. The following are recognized
understands ``\n`` (also known as "Unix-style") line endings. If you know as ending a line: the Unix end-of-line convention ``'\n'``, the Windows
that you need to handle uploaded files with different line endings, you'll convention ``'\r\n'``, and the old Macintosh convention ``'\r'``.
need to do so in your view.
.. _universal newlines: http://www.python.org/dev/peps/pep-0278
.. versionchanged:: 1.8
Previously lines were only split on the Unix end-of-line ``'\n'``.
Subclasses of ``UploadedFile`` include: Subclasses of ``UploadedFile`` include:

View File

@ -659,6 +659,13 @@ Miscellaneous
* By default, :ref:`call_command <call-command>` now always skips the check * By default, :ref:`call_command <call-command>` now always skips the check
framework (unless you pass it ``skip_checks=False``). framework (unless you pass it ``skip_checks=False``).
* When iterating over lines, :class:`~django.core.files.File` now uses
`universal newlines`_. The following are recognized as ending a line: the
Unix end-of-line convention ``'\n'``, the Windows convention ``'\r\n'``, and
the old Macintosh convention ``'\r'``.
.. _universal newlines: http://www.python.org/dev/peps/pep-0278
.. _deprecated-features-1.8: .. _deprecated-features-1.8:
Features deprecated in 1.8 Features deprecated in 1.8

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import unicode_literals from __future__ import unicode_literals
from io import BytesIO from io import BytesIO, StringIO
import os import os
import gzip import gzip
import tempfile import tempfile
@ -72,6 +72,54 @@ class FileTests(unittest.TestCase):
file = File(BytesIO(b'one\ntwo\nthree')) file = File(BytesIO(b'one\ntwo\nthree'))
self.assertEqual(list(file), [b'one\n', b'two\n', b'three']) self.assertEqual(list(file), [b'one\n', b'two\n', b'three'])
def test_file_iteration_windows_newlines(self):
    r"""
    #8149 - Iterating over a File whose content uses \r\n line endings
    should yield the lines, terminators included.
    """
    wrapped = File(BytesIO(b'one\r\ntwo\r\nthree'))
    lines = list(wrapped)
    self.assertEqual(lines, [b'one\r\n', b'two\r\n', b'three'])
def test_file_iteration_mac_newlines(self):
    r"""
    #8149 - Iterating over a File whose content uses \r line endings
    should yield the lines, terminators included.
    """
    wrapped = File(BytesIO(b'one\rtwo\rthree'))
    lines = list(wrapped)
    self.assertEqual(lines, [b'one\r', b'two\r', b'three'])
def test_file_iteration_mixed_newlines(self):
    r"""
    A single File mixing \r, \n and \r\n terminators is split on each of
    them, and every yielded line keeps its own terminator.
    """
    wrapped = File(BytesIO(b'one\rtwo\nthree\r\nfour'))
    lines = list(wrapped)
    self.assertEqual(lines, [b'one\r', b'two\n', b'three\r\n', b'four'])
def test_file_iteration_with_unix_newline_at_chunk_boundary(self):
    r"""
    Lines are still split correctly when a chunk ends right after a \n.
    """
    wrapped = File(BytesIO(b'one\ntwo\nthree'))
    # With a chunk size of 4 the first chunk is b'one\n', so the chunk
    # boundary falls immediately after the newline.
    wrapped.DEFAULT_CHUNK_SIZE = 4
    lines = list(wrapped)
    self.assertEqual(lines, [b'one\n', b'two\n', b'three'])
def test_file_iteration_with_windows_newline_at_chunk_boundary(self):
    r"""
    A \r\n terminator cut in half by a chunk boundary is still reported
    as one line ending.
    """
    wrapped = File(BytesIO(b'one\r\ntwo\r\nthree'))
    # With a chunk size of 4 the first chunk is b'one\r', so the chunk
    # boundary falls between the \r and the \n.
    wrapped.DEFAULT_CHUNK_SIZE = 4
    lines = list(wrapped)
    self.assertEqual(lines, [b'one\r\n', b'two\r\n', b'three'])
def test_file_iteration_with_mac_newline_at_chunk_boundary(self):
    r"""
    Lines are still split correctly when a chunk ends right after a \r.
    """
    wrapped = File(BytesIO(b'one\rtwo\rthree'))
    # With a chunk size of 4 the first chunk is b'one\r', so the chunk
    # boundary falls immediately after the carriage return.
    wrapped.DEFAULT_CHUNK_SIZE = 4
    lines = list(wrapped)
    self.assertEqual(lines, [b'one\r', b'two\r', b'three'])
def test_file_iteration_with_text(self):
    """
    Iterating over a File that wraps a text stream yields text lines.
    """
    wrapped = File(StringIO('one\ntwo\nthree'))
    lines = list(wrapped)
    self.assertEqual(lines, ['one\n', 'two\n', 'three'])
class NoNameFileTestCase(unittest.TestCase): class NoNameFileTestCase(unittest.TestCase):
""" """