Make IDLE's file decode more robust.
1. coding_spec() only looks at the first two lines of bytes, to avoid a UnicodeDecodeError if the rest of the file is, e.g., latin-1.
2. coding_spec() handles \n or \r.
3. Clarify that locale_encoding is used by calling it that. However, it's still called IOBinding.encoding in other parts of IDLE, and that usage needs to be checked to verify that it's still what is desired.
4. Return None from _decode() if decoding fails.
5. Name the variables representing bytes and strings or chars appropriately.
This commit is contained in:
parent
504d885417
commit
44fa8f650f
|
@ -22,15 +22,15 @@ except (ImportError, locale.Error):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Encoding for file names
|
# Encoding for file names
|
||||||
filesystemencoding = sys.getfilesystemencoding()
|
filesystemencoding = sys.getfilesystemencoding() ### currently unused
|
||||||
|
|
||||||
encoding = "ascii"
|
locale_encoding = 'ascii'
|
||||||
if sys.platform == 'win32':
|
if sys.platform == 'win32':
|
||||||
# On Windows, we could use "mbcs". However, to give the user
|
# On Windows, we could use "mbcs". However, to give the user
|
||||||
# a portable encoding name, we need to find the code page
|
# a portable encoding name, we need to find the code page
|
||||||
try:
|
try:
|
||||||
encoding = locale.getdefaultlocale()[1]
|
locale_encoding = locale.getdefaultlocale()[1]
|
||||||
codecs.lookup(encoding)
|
codecs.lookup(locale_encoding)
|
||||||
except LookupError:
|
except LookupError:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
@ -39,25 +39,28 @@ else:
|
||||||
# loaded, it may not offer nl_langinfo, or CODESET, or the
|
# loaded, it may not offer nl_langinfo, or CODESET, or the
|
||||||
# resulting codeset may be unknown to Python. We ignore all
|
# resulting codeset may be unknown to Python. We ignore all
|
||||||
# these problems, falling back to ASCII
|
# these problems, falling back to ASCII
|
||||||
encoding = locale.nl_langinfo(locale.CODESET)
|
locale_encoding = locale.nl_langinfo(locale.CODESET)
|
||||||
if encoding is None or encoding is '':
|
if locale_encoding is None or locale_encoding is '':
|
||||||
# situation occurs on Mac OS X
|
# situation occurs on Mac OS X
|
||||||
encoding = 'ascii'
|
locale_encoding = 'ascii'
|
||||||
codecs.lookup(encoding)
|
codecs.lookup(locale_encoding)
|
||||||
except (NameError, AttributeError, LookupError):
|
except (NameError, AttributeError, LookupError):
|
||||||
# Try getdefaultlocale well: it parses environment variables,
|
# Try getdefaultlocale: it parses environment variables,
|
||||||
# which may give a clue. Unfortunately, getdefaultlocale has
|
# which may give a clue. Unfortunately, getdefaultlocale has
|
||||||
# bugs that can cause ValueError.
|
# bugs that can cause ValueError.
|
||||||
try:
|
try:
|
||||||
encoding = locale.getdefaultlocale()[1]
|
locale_encoding = locale.getdefaultlocale()[1]
|
||||||
if encoding is None or encoding is '':
|
if locale_encoding is None or locale_encoding is '':
|
||||||
# situation occurs on Mac OS X
|
# situation occurs on Mac OS X
|
||||||
encoding = 'ascii'
|
locale_encoding = 'ascii'
|
||||||
codecs.lookup(encoding)
|
codecs.lookup(locale_encoding)
|
||||||
except (ValueError, LookupError):
|
except (ValueError, LookupError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
encoding = encoding.lower()
|
locale_encoding = locale_encoding.lower()
|
||||||
|
|
||||||
|
encoding = locale_encoding ### KBK 07Sep07 This is used all over IDLE, check!
|
||||||
|
### 'encoding' is used below in encode(), check!
|
||||||
|
|
||||||
coding_re = re.compile("coding[:=]\s*([-\w_.]+)")
|
coding_re = re.compile("coding[:=]\s*([-\w_.]+)")
|
||||||
|
|
||||||
|
@ -110,26 +113,36 @@ class EncodingMessage(SimpleDialog):
|
||||||
def coding_spec(data):
|
def coding_spec(data):
|
||||||
"""Return the encoding declaration according to PEP 263.
|
"""Return the encoding declaration according to PEP 263.
|
||||||
|
|
||||||
Raise LookupError if the encoding is declared but unknown.
|
When checking encoded data, only the first two lines should be passed
|
||||||
|
in to avoid a UnicodeDecodeError if the rest of the data is not unicode.
|
||||||
|
The first two lines would contain the encoding specification.
|
||||||
|
|
||||||
|
Raise a LookupError if the encoding is declared but unknown.
|
||||||
"""
|
"""
|
||||||
if isinstance(data, bytes):
|
if isinstance(data, bytes):
|
||||||
str = data.decode('utf-8')
|
try:
|
||||||
|
lines = data.decode('utf-8')
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
return None
|
||||||
else:
|
else:
|
||||||
str = data
|
lines = data
|
||||||
# Only consider the first two lines
|
# consider only the first two lines
|
||||||
str = str.split("\n")[:2]
|
if '\n' in lines:
|
||||||
str = "\n".join(str)
|
lst = lines.split('\n')[:2]
|
||||||
|
elif '\r' in lines:
|
||||||
|
lst = lines.split('\r')[:2]
|
||||||
|
else:
|
||||||
|
lst = list(lines)
|
||||||
|
str = '\n'.join(lst)
|
||||||
match = coding_re.search(str)
|
match = coding_re.search(str)
|
||||||
if not match:
|
if not match:
|
||||||
return None
|
return None
|
||||||
name = match.group(1)
|
name = match.group(1)
|
||||||
# Check whether the encoding is known
|
|
||||||
import codecs
|
|
||||||
try:
|
try:
|
||||||
codecs.lookup(name)
|
codecs.lookup(name)
|
||||||
except LookupError:
|
except LookupError:
|
||||||
# The standard encoding error does not indicate the encoding
|
# The standard encoding error does not indicate the encoding
|
||||||
raise LookupError("Unknown encoding "+name)
|
raise LookupError("Unknown encoding: "+name)
|
||||||
return name
|
return name
|
||||||
|
|
||||||
|
|
||||||
|
@ -236,12 +249,19 @@ class IOBinding:
|
||||||
# open the file in binary mode so that we can handle
|
# open the file in binary mode so that we can handle
|
||||||
# end-of-line convention ourselves.
|
# end-of-line convention ourselves.
|
||||||
f = open(filename,'rb')
|
f = open(filename,'rb')
|
||||||
|
two_lines = f.readline() + f.readline()
|
||||||
|
f.seek(0)
|
||||||
bytes = f.read()
|
bytes = f.read()
|
||||||
f.close()
|
f.close()
|
||||||
except IOError as msg:
|
except IOError as msg:
|
||||||
tkMessageBox.showerror("I/O Error", str(msg), master=self.text)
|
tkMessageBox.showerror("I/O Error", str(msg), master=self.text)
|
||||||
return False
|
return False
|
||||||
chars = self.decode(bytes)
|
chars = self._decode(two_lines, bytes)
|
||||||
|
if chars is None:
|
||||||
|
tkMessageBox.showerror("Decoding Error",
|
||||||
|
"File %s\nFailed to Decode" % filename,
|
||||||
|
parent=self.text)
|
||||||
|
return False
|
||||||
# We now convert all end-of-lines to '\n's
|
# We now convert all end-of-lines to '\n's
|
||||||
firsteol = self.eol_re.search(chars)
|
firsteol = self.eol_re.search(chars)
|
||||||
if firsteol:
|
if firsteol:
|
||||||
|
@ -257,25 +277,23 @@ class IOBinding:
|
||||||
self.updaterecentfileslist(filename)
|
self.updaterecentfileslist(filename)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def decode(self, chars):
|
def _decode(self, two_lines, bytes):
|
||||||
"""Create a Unicode string
|
"Create a Unicode string."
|
||||||
|
chars = None
|
||||||
If that fails, let Tcl try its best
|
|
||||||
"""
|
|
||||||
# Check presence of a UTF-8 signature first
|
# Check presence of a UTF-8 signature first
|
||||||
if chars.startswith(BOM_UTF8):
|
if bytes.startswith(BOM_UTF8):
|
||||||
try:
|
try:
|
||||||
chars = chars[3:].decode("utf-8")
|
chars = bytes[3:].decode("utf-8")
|
||||||
except UnicodeError:
|
except UnicodeDecodeError:
|
||||||
# has UTF-8 signature, but fails to decode...
|
# has UTF-8 signature, but fails to decode...
|
||||||
return chars
|
return None
|
||||||
else:
|
else:
|
||||||
# Indicates that this file originally had a BOM
|
# Indicates that this file originally had a BOM
|
||||||
self.fileencoding = 'BOM'
|
self.fileencoding = 'BOM'
|
||||||
return chars
|
return chars
|
||||||
# Next look for coding specification
|
# Next look for coding specification
|
||||||
try:
|
try:
|
||||||
enc = coding_spec(chars)
|
enc = coding_spec(two_lines)
|
||||||
except LookupError as name:
|
except LookupError as name:
|
||||||
tkMessageBox.showerror(
|
tkMessageBox.showerror(
|
||||||
title="Error loading the file",
|
title="Error loading the file",
|
||||||
|
@ -283,24 +301,37 @@ class IOBinding:
|
||||||
"installation. The file may not display correctly" % name,
|
"installation. The file may not display correctly" % name,
|
||||||
master = self.text)
|
master = self.text)
|
||||||
enc = None
|
enc = None
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
return None
|
||||||
if enc:
|
if enc:
|
||||||
try:
|
try:
|
||||||
return str(chars, enc)
|
chars = str(bytes, enc)
|
||||||
except UnicodeError:
|
self.fileencoding = enc
|
||||||
|
return chars
|
||||||
|
except UnicodeDecodeError:
|
||||||
pass
|
pass
|
||||||
# If it is ASCII, we need not to record anything
|
# Try ascii:
|
||||||
try:
|
try:
|
||||||
return str(chars, 'ascii')
|
chars = str(bytes, 'ascii')
|
||||||
except UnicodeError:
|
self.fileencoding = None
|
||||||
|
return chars
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
pass
|
||||||
|
# Try utf-8:
|
||||||
|
try:
|
||||||
|
chars = str(bytes, 'utf-8')
|
||||||
|
self.fileencoding = 'utf-8'
|
||||||
|
return chars
|
||||||
|
except UnicodeDecodeError:
|
||||||
pass
|
pass
|
||||||
# Finally, try the locale's encoding. This is deprecated;
|
# Finally, try the locale's encoding. This is deprecated;
|
||||||
# the user should declare a non-ASCII encoding
|
# the user should declare a non-ASCII encoding
|
||||||
try:
|
try:
|
||||||
chars = str(chars, encoding)
|
chars = str(bytes, locale_encoding)
|
||||||
self.fileencoding = encoding
|
self.fileencoding = locale_encoding
|
||||||
except UnicodeError:
|
except UnicodeDecodeError:
|
||||||
pass
|
pass
|
||||||
return chars
|
return chars # None on failure
|
||||||
|
|
||||||
def maybesave(self):
|
def maybesave(self):
|
||||||
if self.get_saved():
|
if self.get_saved():
|
||||||
|
@ -383,8 +414,9 @@ class IOBinding:
|
||||||
return chars.encode('ascii')
|
return chars.encode('ascii')
|
||||||
except UnicodeError:
|
except UnicodeError:
|
||||||
pass
|
pass
|
||||||
# If there is an encoding declared, try this first.
|
# Check if there is an encoding declared
|
||||||
try:
|
try:
|
||||||
|
# a string, let coding_spec slice it to the first two lines
|
||||||
enc = coding_spec(chars)
|
enc = coding_spec(chars)
|
||||||
failed = None
|
failed = None
|
||||||
except LookupError as msg:
|
except LookupError as msg:
|
||||||
|
@ -509,7 +541,6 @@ class IOBinding:
|
||||||
self.opendialog = tkFileDialog.Open(master=self.text,
|
self.opendialog = tkFileDialog.Open(master=self.text,
|
||||||
filetypes=self.filetypes)
|
filetypes=self.filetypes)
|
||||||
filename = self.opendialog.show(initialdir=dir, initialfile=base)
|
filename = self.opendialog.show(initialdir=dir, initialfile=base)
|
||||||
assert isinstance(filename, str)
|
|
||||||
return filename
|
return filename
|
||||||
|
|
||||||
def defaultfilename(self, mode="open"):
|
def defaultfilename(self, mode="open"):
|
||||||
|
|
Loading…
Reference in New Issue