From 0cf79b29cdb8aa488d30910778c951e88c95b580 Mon Sep 17 00:00:00 2001
From: Benjamin Peterson
Date: Fri, 8 Mar 2013 10:44:41 -0500
Subject: [PATCH] in the default Python 2 case, manually check the source is ASCII (fixes #269)

---
 CHANGELOG                     |  3 +++
 _pytest/assertion/rewrite.py  | 27 +++++++++++++++++++++++++++
 testing/test_assertrewrite.py |  8 ++++++++
 3 files changed, 38 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index 08a9c1a65..574179b82 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,8 @@
 Changes between 2.3.4 and 2.3.5dev
 -----------------------------------
+- issue 259 - when assertion rewriting, be consistent with the default
+  source encoding of ASCII on Python 2
+
 - issue 251 - report a skip instead of ignoring classes with init
 
 - issue250 unicode/str mixes in parametrization names and values now works
diff --git a/_pytest/assertion/rewrite.py b/_pytest/assertion/rewrite.py
index 9a0ebda3d..e964ea9f8 100644
--- a/_pytest/assertion/rewrite.py
+++ b/_pytest/assertion/rewrite.py
@@ -6,6 +6,7 @@ import itertools
 import imp
 import marshal
 import os
+import re
 import struct
 import sys
 import types
@@ -38,6 +39,7 @@ PYC_EXT = ".py" + (__debug__ and "c" or "o")
 PYC_TAIL = "." + PYTEST_TAG + PYC_EXT
 
 REWRITE_NEWLINES = sys.version_info[:2] != (2, 7) and sys.version_info < (3, 2)
+ASCII_IS_DEFAULT_ENCODING = sys.version_info[0] < 3
 
 class AssertionRewritingHook(object):
     """PEP302 Import hook which rewrites asserts."""
@@ -187,12 +189,37 @@ def _write_pyc(co, source_path, pyc):
 RN = "\r\n".encode("utf-8")
 N = "\n".encode("utf-8")
 
+cookie_re = re.compile("coding[:=]\s*[-\w.]+")
+BOM_UTF8 = '\xef\xbb\xbf'
+
 def _rewrite_test(state, fn):
     """Try to read and rewrite *fn* and return the code object."""
     try:
         source = fn.read("rb")
     except EnvironmentError:
         return None
+    if ASCII_IS_DEFAULT_ENCODING:
+        # ASCII is the default encoding in Python 2. Without a coding
+        # declaration, Python 2 will complain about any bytes in the file
+        # outside the ASCII range. Sadly, this behavior does not extend to
+        # compile() or ast.parse(), which prefer to interpret the bytes as
+        # latin-1. (At least they properly handle explicit coding cookies.) To
+        # preserve this error behavior, we could force ast.parse() to use ASCII
+        # as the encoding by inserting a coding cookie. Unfortunately, that
+        # messes up line numbers. Thus, we have to check ourselves if anything
+        # is outside the ASCII range in the case no encoding is explicitly
+        # declared. For more context, see issue #269. Yay for Python 3 which
+        # gets this right.
+        end1 = source.find("\n")
+        end2 = source.find("\n", end1 + 1)
+        if (not source.startswith(BOM_UTF8) and
+            (not cookie_re.match(source[0:end1]) or
+             not cookie_re.match(source[end1:end2]))):
+            try:
+                source.decode("ascii")
+            except UnicodeDecodeError:
+                # Let it fail in real import.
+                return None
     # On Python versions which are not 2.7 and less than or equal to 3.1, the
     # parser expects *nix newlines.
     if REWRITE_NEWLINES:
diff --git a/testing/test_assertrewrite.py b/testing/test_assertrewrite.py
index 9b73c337a..4841ff47c 100644
--- a/testing/test_assertrewrite.py
+++ b/testing/test_assertrewrite.py
@@ -394,3 +394,11 @@ def test_rewritten():
         b = content.encode("utf-8")
         testdir.tmpdir.join("test_newlines.py").write(b, "wb")
         assert testdir.runpytest().ret == 0
+
+    @pytest.mark.skipif("sys.version_info[0] >= 3")
+    def test_assume_ascii(self, testdir):
+        content = "u'\xe2\x99\xa5'"
+        testdir.tmpdir.join("test_encoding.py").write(content, "wb")
+        res = testdir.runpytest()
+        assert res.ret != 0
+        assert "SyntaxError: Non-ASCII character" in res.stdout.str()
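
Editor's note (not part of the patch): the hunk in _rewrite_test() only runs the
manual ASCII check when the file has no UTF-8 BOM and no PEP 263 coding cookie on
its first two lines. The sketch below was written for this note rather than taken
from pytest; the helper name must_be_ascii is made up, and its cookie pattern is
deliberately searched anywhere in the line, because the patch's own cookie_re is
anchored at the start of the line via re.match and so would not recognize a
conventional "# -*- coding: utf-8 -*-" comment (a non-ASCII file with such a
cookie would simply fall back to the unrewritten import). Treat it as an
illustration of the intent, not a copy of the shipped code.

    import re

    # Hypothetical names for illustration only; pytest's real code lives in
    # _pytest/assertion/rewrite.py and differs in detail.
    cookie_re = re.compile(r"#.*coding[:=]\s*[-\w.]+")
    BOM_UTF8 = b"\xef\xbb\xbf"

    def must_be_ascii(source):
        # True when Python 2 would treat *source* (bytes) as ASCII: no UTF-8
        # BOM and no coding cookie on either of the first two lines.
        if source.startswith(BOM_UTF8):
            return False
        first_two = source.split(b"\n", 2)[:2]
        return not any(cookie_re.search(line.decode("latin-1"))
                       for line in first_two)

    # Mirrors test_assume_ascii above: a bare UTF-8 heart with no declaration
    # must go through the ASCII check (and fail it in the real import), while
    # a file that declares utf-8 is left alone.
    assert must_be_ascii(b"u'\xe2\x99\xa5'\n")
    assert not must_be_ascii(b"# -*- coding: utf-8 -*-\nu'\xe2\x99\xa5'\n")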