Issue #10335: Add tokenize.open(), detect the file encoding using

tokenize.detect_encoding() and open it in read only mode.
This commit is contained in:
Victor Stinner 2010-11-09 01:08:59 +00:00
parent ae4836df6d
commit 58c0752a33
8 changed files with 54 additions and 22 deletions

View File

@ -101,14 +101,16 @@ function it uses to do this is available:
If no encoding is specified, then the default of ``'utf-8'`` will be If no encoding is specified, then the default of ``'utf-8'`` will be
returned. returned.
:func:`detect_encoding` is useful for robustly reading Python source files. Use :func:`open` to open Python source files: it uses
A common pattern for this follows:: :func:`detect_encoding` to detect the file encoding.
def read_python_source(file_name):
with open(file_name, "rb") as fp: .. function:: open(filename)
encoding = tokenize.detect_encoding(fp.readline)[0]
with open(file_name, "r", encoding=encoding) as fp: Open a file in read only mode using the encoding detected by
return fp.read() :func:`detect_encoding`.
.. versionadded:: 3.2
Example of a script rewriter that transforms float literals into Decimal Example of a script rewriter that transforms float literals into Decimal
@ -153,4 +155,3 @@ objects::
result.append((toknum, tokval)) result.append((toknum, tokval))
return untokenize(result).decode('utf-8') return untokenize(result).decode('utf-8')

View File

@ -123,9 +123,7 @@ def updatecache(filename, module_globals=None):
else: else:
return [] return []
try: try:
with open(fullname, 'rb') as fp: with tokenize.open(fullname) as fp:
coding, line = tokenize.detect_encoding(fp.readline)
with open(fullname, 'r', encoding=coding) as fp:
lines = fp.readlines() lines = fp.readlines()
except IOError: except IOError:
return [] return []

View File

@ -104,9 +104,7 @@ def compile(file, cfile=None, dfile=None, doraise=False):
byte-compile all installed files (or all files in selected byte-compile all installed files (or all files in selected
directories). directories).
""" """
with open(file, "rb") as f: with tokenize.open(file) as f:
encoding = tokenize.detect_encoding(f.readline)[0]
with open(file, encoding=encoding) as f:
try: try:
timestamp = int(os.fstat(f.fileno()).st_mtime) timestamp = int(os.fstat(f.fileno()).st_mtime)
except AttributeError: except AttributeError:

View File

@ -93,11 +93,8 @@ def check(file):
check(fullname) check(fullname)
return return
with open(file, 'rb') as f:
encoding, lines = tokenize.detect_encoding(f.readline)
try: try:
f = open(file, encoding=encoding) f = tokenize.open(file)
except IOError as msg: except IOError as msg:
errprint("%r: I/O Error: %s" % (file, msg)) errprint("%r: I/O Error: %s" % (file, msg))
return return

View File

@ -564,7 +564,8 @@ Non-ascii identifiers
from test import support from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
STRING, ENDMARKER, tok_name, detect_encoding) STRING, ENDMARKER, tok_name, detect_encoding,
open as tokenize_open)
from io import BytesIO from io import BytesIO
from unittest import TestCase from unittest import TestCase
import os, sys, glob import os, sys, glob
@ -857,6 +858,26 @@ class TestDetectEncoding(TestCase):
readline = self.get_readline((b'# coding: bad\n',)) readline = self.get_readline((b'# coding: bad\n',))
self.assertRaises(SyntaxError, detect_encoding, readline) self.assertRaises(SyntaxError, detect_encoding, readline)
def test_open(self):
    """Check that tokenize.open() detects the encoding and opens in mode 'r'.

    The file is written once per encoding with an explicit coding cookie,
    and once with a UTF-8 BOM and no cookie; in each case the object
    returned by tokenize.open() must report the expected encoding.
    """
    filename = support.TESTFN + '.py'
    self.addCleanup(support.unlink, filename)

    # test coding cookie
    for encoding in ('iso-8859-15', 'utf-8'):
        with open(filename, 'w', encoding=encoding) as fp:
            print("# coding: %s" % encoding, file=fp)
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            # Use unittest assertions rather than bare ``assert``: they are
            # not stripped under -O and report both values on failure.
            self.assertEqual(fp.encoding, encoding)
            self.assertEqual(fp.mode, 'r')

    # test BOM (no coding cookie)
    with open(filename, 'w', encoding='utf-8-sig') as fp:
        print("print('euro:\u20ac')", file=fp)
    with tokenize_open(filename) as fp:
        self.assertEqual(fp.encoding, 'utf-8-sig')
        self.assertEqual(fp.mode, 'r')
class TestTokenize(TestCase): class TestTokenize(TestCase):
def test_tokenize(self): def test_tokenize(self):

View File

@ -29,6 +29,7 @@ import sys
from token import * from token import *
from codecs import lookup, BOM_UTF8 from codecs import lookup, BOM_UTF8
import collections import collections
from io import TextIOWrapper
cookie_re = re.compile("coding[:=]\s*([-\w.]+)") cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
import token import token
@ -335,6 +336,20 @@ def detect_encoding(readline):
return default, [first, second] return default, [first, second]
# Keep a reference to the builtin: the module-level open() defined below
# shadows it.
_builtin_open = open


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().

    The file is first opened in binary mode so that detect_encoding() can
    read from it, then rewound and wrapped in a TextIOWrapper that uses the
    detected encoding.  The returned text file has its ``mode`` attribute
    set to 'r'.

    Any exception raised while detecting the encoding or building the
    wrapper (e.g. SyntaxError from detect_encoding()) is re-raised after the
    underlying binary file has been closed, so no file handle is leaked.
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, _ = detect_encoding(buffer.readline)
        # detect_encoding() consumed up to two lines; rewind so the text
        # wrapper starts decoding from the beginning of the file.
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except BaseException:
        # Nobody else holds a reference to the buffer yet: close it here.
        buffer.close()
        raise
def tokenize(readline): def tokenize(readline):
""" """
The tokenize() generator requires one argument, readline, which The tokenize() generator requires one argument, readline, which

View File

@ -432,10 +432,9 @@ def find_strings(filename, encoding=None):
def find_executable_linenos(filename): def find_executable_linenos(filename):
"""Return dict where keys are line numbers in the line number table.""" """Return dict where keys are line numbers in the line number table."""
try: try:
with io.FileIO(filename, 'r') as file: with tokenize.open(filename) as f:
encoding, lines = tokenize.detect_encoding(file.readline)
with open(filename, "r", encoding=encoding) as f:
prog = f.read() prog = f.read()
encoding = f.encoding
except IOError as err: except IOError as err:
print(("Not printing coverage data for %r: %s" print(("Not printing coverage data for %r: %s"
% (filename, err)), file=sys.stderr) % (filename, err)), file=sys.stderr)

View File

@ -60,6 +60,9 @@ Core and Builtins
Library Library
------- -------
- Issue #10335: Add tokenize.open(), which detects the file encoding using
tokenize.detect_encoding() and opens the file in read-only mode.
- Issue #10321: Added support for binary data to smtplib.SMTP.sendmail, - Issue #10321: Added support for binary data to smtplib.SMTP.sendmail,
and a new method send_message to send an email.message.Message object. and a new method send_message to send an email.message.Message object.