From 4d840e428ab1a2712f219c5e4008658cbe15892e Mon Sep 17 00:00:00 2001 From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com> Date: Fri, 25 Dec 2020 14:35:46 -0800 Subject: [PATCH] [3.8] bpo-42318: Fix support of non-BMP characters in Tkinter on macOS (GH-23281). (GH-23784) (GH-23787) (cherry picked from commit a26215db11cfcf7b5f55cab9e91396761a0e0bcf) (cherry picked from commit 28bf6ab61f77c69b732a211c398ac882bf3f65f4) --- Lib/test/test_tcl.py | 46 +++++++++++++--- .../2020-11-14-13-46-27.bpo-42318.wYAcBD.rst | 1 + Modules/_tkinter.c | 54 ++++++++++++++++++- 3 files changed, 94 insertions(+), 7 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst diff --git a/Lib/test/test_tcl.py b/Lib/test/test_tcl.py index 3183ea850f7..6005349eaac 100644 --- a/Lib/test/test_tcl.py +++ b/Lib/test/test_tcl.py @@ -1,4 +1,5 @@ import unittest +import locale import re import subprocess import sys @@ -58,6 +59,10 @@ class TclTest(unittest.TestCase): tcl = self.interp self.assertEqual(tcl.eval('set a "a\\0b"'), 'a\x00b') + def test_eval_surrogates_in_result(self): + tcl = self.interp + self.assertIn(tcl.eval(r'set a "<\ud83d\udcbb>"'), '<\U0001f4bb>') + def testEvalException(self): tcl = self.interp self.assertRaises(TclError,tcl.eval,'set a') @@ -190,29 +195,48 @@ class TclTest(unittest.TestCase): def testEvalFile(self): tcl = self.interp - with open(support.TESTFN, 'w') as f: - self.addCleanup(support.unlink, support.TESTFN) + filename = support.TESTFN + self.addCleanup(support.unlink, filename) + with open(filename, 'w') as f: f.write("""set a 1 set b 2 set c [ expr $a + $b ] """) - tcl.evalfile(support.TESTFN) + tcl.evalfile(filename) self.assertEqual(tcl.eval('set a'),'1') self.assertEqual(tcl.eval('set b'),'2') self.assertEqual(tcl.eval('set c'),'3') def test_evalfile_null_in_result(self): tcl = self.interp - with open(support.TESTFN, 'w') as f: - self.addCleanup(support.unlink, support.TESTFN) + filename = support.TESTFN + self.addCleanup(support.unlink, filename) + with open(filename, 'w') as f: f.write(""" set a "a\0b" set b "a\\0b" """) - tcl.evalfile(support.TESTFN) + tcl.evalfile(filename) self.assertEqual(tcl.eval('set a'), 'a\x00b') self.assertEqual(tcl.eval('set b'), 'a\x00b') + def test_evalfile_surrogates_in_result(self): + tcl = self.interp + encoding = tcl.call('encoding', 'system') + self.addCleanup(tcl.call, 'encoding', 'system', encoding) + tcl.call('encoding', 'system', 'utf-8') + + filename = support.TESTFN + self.addCleanup(support.unlink, filename) + with open(filename, 'wb') as f: + f.write(b""" + set a "<\xed\xa0\xbd\xed\xb2\xbb>" + set b "<\\ud83d\\udcbb>" + """) + tcl.evalfile(filename) + self.assertEqual(tcl.eval('set a'), '<\U0001f4bb>') + self.assertEqual(tcl.eval('set b'), '<\U0001f4bb>') + def testEvalFileException(self): tcl = self.interp filename = "doesnotexists" @@ -435,6 +459,11 @@ class TclTest(unittest.TestCase): self.assertEqual(passValue('str\x00ing\u20ac'), 'str\x00ing\u20ac') self.assertEqual(passValue('str\x00ing\U0001f4bb'), 'str\x00ing\U0001f4bb') + if sys.platform != 'win32': + self.assertEqual(passValue('<\udce2\udc82\udcac>'), + '<\u20ac>') + self.assertEqual(passValue('<\udced\udca0\udcbd\udced\udcb2\udcbb>'), + '<\U0001f4bb>') self.assertEqual(passValue(b'str\x00ing'), b'str\x00ing' if self.wantobjects else 'str\x00ing') self.assertEqual(passValue(b'str\xc0\x80ing'), @@ -494,6 +523,9 @@ class TclTest(unittest.TestCase): check('string\xbd') check('string\u20ac') check('string\U0001f4bb') + if sys.platform != 'win32': + check('<\udce2\udc82\udcac>', '<\u20ac>') + check('<\udced\udca0\udcbd\udced\udcb2\udcbb>', '<\U0001f4bb>') check('') check(b'string', 'string') check(b'string\xe2\x82\xac', 'string\xe2\x82\xac') @@ -537,6 +569,8 @@ class TclTest(unittest.TestCase): ('a \u20ac', ('a', '\u20ac')), ('a \U0001f4bb', ('a', '\U0001f4bb')), (b'a \xe2\x82\xac', ('a', '\u20ac')), + (b'a \xf0\x9f\x92\xbb', ('a', '\U0001f4bb')), + (b'a \xed\xa0\xbd\xed\xb2\xbb', ('a', '\U0001f4bb')), (b'a\xc0\x80b c\xc0\x80d', ('a\x00b', 'c\x00d')), ('a {b c}', ('a', 'b c')), (r'a b\ c', ('a', 'b c')), diff --git a/Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst b/Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst new file mode 100644 index 00000000000..e72daebb2f1 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst @@ -0,0 +1 @@ +Fixed support of non-BMP characters in :mod:`tkinter` on macOS. diff --git a/Modules/_tkinter.c b/Modules/_tkinter.c index a1071e5a581..c1eb6e10c7f 100644 --- a/Modules/_tkinter.c +++ b/Modules/_tkinter.c @@ -397,7 +397,8 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size) char *buf = NULL; PyErr_Clear(); - /* Tcl encodes null character as \xc0\x80 */ + /* Tcl encodes null character as \xc0\x80. + https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8 */ if (memchr(s, '\xc0', size)) { char *q; const char *e = s + size; @@ -421,6 +422,57 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size) if (buf != NULL) { PyMem_Free(buf); } + if (r == NULL || PyUnicode_KIND(r) == PyUnicode_1BYTE_KIND) { + return r; + } + + /* In CESU-8 non-BMP characters are represented as a surrogate pair, + like in UTF-16, and then each surrogate code point is encoded in UTF-8. + https://en.wikipedia.org/wiki/CESU-8 */ + Py_ssize_t len = PyUnicode_GET_LENGTH(r); + Py_ssize_t i, j; + /* All encoded surrogate characters start with \xED. */ + i = PyUnicode_FindChar(r, 0xdcED, 0, len, 1); + if (i == -2) { + Py_DECREF(r); + return NULL; + } + if (i == -1) { + return r; + } + Py_UCS4 *u = PyUnicode_AsUCS4Copy(r); + Py_DECREF(r); + if (u == NULL) { + return NULL; + } + Py_UCS4 ch; + for (j = i; i < len; i++, u[j++] = ch) { + Py_UCS4 ch1, ch2, ch3, high, low; + /* Low surrogates U+D800 - U+DBFF are encoded as + \xED\xA0\x80 - \xED\xAF\xBF. */ + ch1 = ch = u[i]; + if (ch1 != 0xdcED) continue; + ch2 = u[i + 1]; + if (!(0xdcA0 <= ch2 && ch2 <= 0xdcAF)) continue; + ch3 = u[i + 2]; + if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue; + high = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F); + assert(Py_UNICODE_IS_HIGH_SURROGATE(high)); + /* High surrogates U+DC00 - U+DFFF are encoded as + \xED\xB0\x80 - \xED\xBF\xBF. */ + ch1 = u[i + 3]; + if (ch1 != 0xdcED) continue; + ch2 = u[i + 4]; + if (!(0xdcB0 <= ch2 && ch2 <= 0xdcBF)) continue; + ch3 = u[i + 5]; + if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue; + low = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F); + assert(Py_UNICODE_IS_HIGH_SURROGATE(high)); + ch = Py_UNICODE_JOIN_SURROGATES(high, low); + i += 5; + } + r = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, u, j); + PyMem_Free(u); return r; }