bpo-41158: IDLE: rewrite the code for handling file encoding (GH-21215)

(cherry picked from commit 694d31e714)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Miss Islington (bot) 2020-07-01 11:22:45 -07:00 committed by GitHub
parent 42f05e6292
commit c3fa7534c7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 41 additions and 152 deletions

View File

@ -1,10 +1,8 @@
import codecs
from codecs import BOM_UTF8
import os
import re
import shlex
import sys
import tempfile
import tokenize
import tkinter.filedialog as tkFileDialog
import tkinter.messagebox as tkMessageBox
@ -20,49 +18,6 @@ else:
errors = 'surrogateescape'
coding_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
def coding_spec(data):
"""Return the encoding declaration according to PEP 263.
When checking encoded data, only the first two lines should be passed
in to avoid a UnicodeDecodeError if the rest of the data is not unicode.
The first two lines would contain the encoding specification.
Raise a LookupError if the encoding is declared but unknown.
"""
if isinstance(data, bytes):
# This encoding might be wrong. However, the coding
# spec must be ASCII-only, so any non-ASCII characters
# around here will be ignored. Decoding to Latin-1 should
# never fail (except for memory outage)
lines = data.decode('iso-8859-1')
else:
lines = data
# consider only the first two lines
if '\n' in lines:
lst = lines.split('\n', 2)[:2]
elif '\r' in lines:
lst = lines.split('\r', 2)[:2]
else:
lst = [lines]
for line in lst:
match = coding_re.match(line)
if match is not None:
break
if not blank_re.match(line):
return None
else:
return None
name = match.group(1)
try:
codecs.lookup(name)
except LookupError:
# The standard encoding error does not indicate the encoding
raise LookupError("Unknown encoding: "+name)
return name
class IOBinding:
# One instance per editor Window so methods know which to save, close.
@ -78,7 +33,7 @@ class IOBinding:
self.save_as)
self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>",
self.save_a_copy)
self.fileencoding = None
self.fileencoding = 'utf-8'
self.__id_print = self.text.bind("<<print-window>>", self.print_window)
def close(self):
@ -165,34 +120,44 @@ class IOBinding:
self.text.focus_set()
return "break"
eol = r"(\r\n)|\n|\r" # \r\n (Windows), \n (UNIX), or \r (Mac)
eol_re = re.compile(eol)
eol_convention = os.linesep # default
def loadfile(self, filename):
try:
# open the file in binary mode so that we can handle
# end-of-line convention ourselves.
with open(filename, 'rb') as f:
two_lines = f.readline() + f.readline()
f.seek(0)
bytes = f.read()
except OSError as msg:
tkMessageBox.showerror("I/O Error", str(msg), parent=self.text)
try:
with tokenize.open(filename) as f:
chars = f.read()
fileencoding = f.encoding
eol_convention = f.newlines
converted = False
except (UnicodeDecodeError, SyntaxError):
# Wait for the editor window to appear
self.editwin.text.update()
enc = askstring(
"Specify file encoding",
"The file's encoding is invalid for Python 3.x.\n"
"IDLE will convert it to UTF-8.\n"
"What is the current encoding of the file?",
initialvalue='utf-8',
parent=self.editwin.text)
with open(filename, encoding=enc) as f:
chars = f.read()
fileencoding = f.encoding
eol_convention = f.newlines
converted = True
except OSError as err:
tkMessageBox.showerror("I/O Error", str(err), parent=self.text)
return False
chars, converted = self._decode(two_lines, bytes)
if chars is None:
except UnicodeDecodeError:
tkMessageBox.showerror("Decoding Error",
"File %s\nFailed to Decode" % filename,
parent=self.text)
return False
# We now convert all end-of-lines to '\n's
firsteol = self.eol_re.search(chars)
if firsteol:
self.eol_convention = firsteol.group(0)
chars = self.eol_re.sub(r"\n", chars)
self.text.delete("1.0", "end")
self.set_filename(None)
self.fileencoding = fileencoding
self.eol_convention = eol_convention
self.text.insert("1.0", chars)
self.reset_undo()
self.set_filename(filename)
@ -205,74 +170,6 @@ class IOBinding:
self.updaterecentfileslist(filename)
return True
def _decode(self, two_lines, bytes):
"Create a Unicode string."
chars = None
# Check presence of a UTF-8 signature first
if bytes.startswith(BOM_UTF8):
try:
chars = bytes[3:].decode("utf-8")
except UnicodeDecodeError:
# has UTF-8 signature, but fails to decode...
return None, False
else:
# Indicates that this file originally had a BOM
self.fileencoding = 'BOM'
return chars, False
# Next look for coding specification
try:
enc = coding_spec(two_lines)
except LookupError as name:
tkMessageBox.showerror(
title="Error loading the file",
message="The encoding '%s' is not known to this Python "\
"installation. The file may not display correctly" % name,
parent = self.text)
enc = None
except UnicodeDecodeError:
return None, False
if enc:
try:
chars = str(bytes, enc)
self.fileencoding = enc
return chars, False
except UnicodeDecodeError:
pass
# Try ascii:
try:
chars = str(bytes, 'ascii')
self.fileencoding = None
return chars, False
except UnicodeDecodeError:
pass
# Try utf-8:
try:
chars = str(bytes, 'utf-8')
self.fileencoding = 'utf-8'
return chars, False
except UnicodeDecodeError:
pass
# Finally, try the locale's encoding. This is deprecated;
# the user should declare a non-ASCII encoding
try:
# Wait for the editor window to appear
self.editwin.text.update()
enc = askstring(
"Specify file encoding",
"The file's encoding is invalid for Python 3.x.\n"
"IDLE will convert it to UTF-8.\n"
"What is the current encoding of the file?",
initialvalue = encoding,
parent = self.editwin.text)
if enc:
chars = str(bytes, enc)
self.fileencoding = None
return chars, True
except (UnicodeDecodeError, LookupError):
pass
return None, False # None on failure
def maybesave(self):
if self.get_saved():
return "yes"
@ -360,38 +257,30 @@ class IOBinding:
# text to us. Don't try to guess further.
return chars
# Preserve a BOM that might have been present on opening
if self.fileencoding == 'BOM':
return BOM_UTF8 + chars.encode("utf-8")
if self.fileencoding == 'utf-8-sig':
return chars.encode('utf-8-sig')
# See whether there is anything non-ASCII in it.
# If not, no need to figure out the encoding.
try:
return chars.encode('ascii')
except UnicodeError:
except UnicodeEncodeError:
pass
# Check if there is an encoding declared
try:
# a string, let coding_spec slice it to the first two lines
enc = coding_spec(chars)
failed = None
except LookupError as msg:
failed = msg
enc = None
else:
if not enc:
# PEP 3120: default source encoding is UTF-8
enc = 'utf-8'
if enc:
try:
return chars.encode(enc)
except UnicodeError:
failed = "Invalid encoding '%s'" % enc
encoded = chars.encode('ascii', 'replace')
enc, _ = tokenize.detect_encoding(io.BytesIO(encoded).readline)
return chars.encode(enc)
except SyntaxError as err:
failed = str(err)
except UnicodeEncodeError:
failed = "Invalid encoding '%s'" % enc
tkMessageBox.showerror(
"I/O Error",
"%s.\nSaving as UTF-8" % failed,
parent = self.text)
parent=self.text)
# Fallback: save as UTF-8, with BOM - ignoring the incorrect
# declared encoding
return BOM_UTF8 + chars.encode("utf-8")
return chars.encode('utf-8-sig')
def print_window(self, event):
confirm = tkMessageBox.askokcancel(