From a26215db11cfcf7b5f55cab9e91396761a0e0bcf Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 15 Nov 2020 18:16:59 +0200 Subject: [PATCH] bpo-42318: Fix support of non-BMP characters in Tkinter on macOS (GH-23281) --- Lib/test/test_tcl.py | 46 +++++++++++++--- .../2020-11-14-13-46-27.bpo-42318.wYAcBD.rst | 1 + Modules/_tkinter.c | 54 ++++++++++++++++++- 3 files changed, 94 insertions(+), 7 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst diff --git a/Lib/test/test_tcl.py b/Lib/test/test_tcl.py index cd2a30e533a..d104eb89104 100644 --- a/Lib/test/test_tcl.py +++ b/Lib/test/test_tcl.py @@ -1,4 +1,5 @@ import unittest +import locale import re import subprocess import sys @@ -61,6 +62,10 @@ class TclTest(unittest.TestCase): tcl = self.interp self.assertEqual(tcl.eval('set a "a\\0b"'), 'a\x00b') + def test_eval_surrogates_in_result(self): + tcl = self.interp + self.assertIn(tcl.eval(r'set a "<\ud83d\udcbb>"'), '<\U0001f4bb>') + def testEvalException(self): tcl = self.interp self.assertRaises(TclError,tcl.eval,'set a') @@ -193,29 +198,48 @@ class TclTest(unittest.TestCase): def testEvalFile(self): tcl = self.interp - with open(os_helper.TESTFN, 'w') as f: - self.addCleanup(os_helper.unlink, os_helper.TESTFN) + filename = os_helper.TESTFN_ASCII + self.addCleanup(os_helper.unlink, filename) + with open(filename, 'w') as f: f.write("""set a 1 set b 2 set c [ expr $a + $b ] """) - tcl.evalfile(os_helper.TESTFN) + tcl.evalfile(filename) self.assertEqual(tcl.eval('set a'),'1') self.assertEqual(tcl.eval('set b'),'2') self.assertEqual(tcl.eval('set c'),'3') def test_evalfile_null_in_result(self): tcl = self.interp - with open(os_helper.TESTFN, 'w') as f: - self.addCleanup(os_helper.unlink, os_helper.TESTFN) + filename = os_helper.TESTFN_ASCII + self.addCleanup(os_helper.unlink, filename) + with open(filename, 'w') as f: f.write(""" set a "a\0b" set b "a\\0b" """) - tcl.evalfile(os_helper.TESTFN) + tcl.evalfile(filename) self.assertEqual(tcl.eval('set a'), 'a\x00b') self.assertEqual(tcl.eval('set b'), 'a\x00b') + def test_evalfile_surrogates_in_result(self): + tcl = self.interp + encoding = tcl.call('encoding', 'system') + self.addCleanup(tcl.call, 'encoding', 'system', encoding) + tcl.call('encoding', 'system', 'utf-8') + + filename = os_helper.TESTFN_ASCII + self.addCleanup(os_helper.unlink, filename) + with open(filename, 'wb') as f: + f.write(b""" + set a "<\xed\xa0\xbd\xed\xb2\xbb>" + set b "<\\ud83d\\udcbb>" + """) + tcl.evalfile(filename) + self.assertEqual(tcl.eval('set a'), '<\U0001f4bb>') + self.assertEqual(tcl.eval('set b'), '<\U0001f4bb>') + def testEvalFileException(self): tcl = self.interp filename = "doesnotexists" @@ -438,6 +462,11 @@ class TclTest(unittest.TestCase): self.assertEqual(passValue('str\x00ing\u20ac'), 'str\x00ing\u20ac') self.assertEqual(passValue('str\x00ing\U0001f4bb'), 'str\x00ing\U0001f4bb') + if sys.platform != 'win32': + self.assertEqual(passValue('<\udce2\udc82\udcac>'), + '<\u20ac>') + self.assertEqual(passValue('<\udced\udca0\udcbd\udced\udcb2\udcbb>'), + '<\U0001f4bb>') self.assertEqual(passValue(b'str\x00ing'), b'str\x00ing' if self.wantobjects else 'str\x00ing') self.assertEqual(passValue(b'str\xc0\x80ing'), @@ -497,6 +526,9 @@ class TclTest(unittest.TestCase): check('string\xbd') check('string\u20ac') check('string\U0001f4bb') + if sys.platform != 'win32': + check('<\udce2\udc82\udcac>', '<\u20ac>') + check('<\udced\udca0\udcbd\udced\udcb2\udcbb>', '<\U0001f4bb>') check('') check(b'string', 'string') check(b'string\xe2\x82\xac', 'string\xe2\x82\xac') @@ -540,6 +572,8 @@ class TclTest(unittest.TestCase): ('a \u20ac', ('a', '\u20ac')), ('a \U0001f4bb', ('a', '\U0001f4bb')), (b'a \xe2\x82\xac', ('a', '\u20ac')), + (b'a \xf0\x9f\x92\xbb', ('a', '\U0001f4bb')), + (b'a \xed\xa0\xbd\xed\xb2\xbb', ('a', '\U0001f4bb')), (b'a\xc0\x80b c\xc0\x80d', ('a\x00b', 'c\x00d')), ('a {b c}', ('a', 'b c')), (r'a b\ c', ('a', 'b c')), diff --git a/Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst b/Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst new file mode 100644 index 00000000000..e72daebb2f1 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst @@ -0,0 +1 @@ +Fixed support of non-BMP characters in :mod:`tkinter` on macOS. diff --git a/Modules/_tkinter.c b/Modules/_tkinter.c index 793c5e71548..b30141d4497 100644 --- a/Modules/_tkinter.c +++ b/Modules/_tkinter.c @@ -395,7 +395,8 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size) char *buf = NULL; PyErr_Clear(); - /* Tcl encodes null character as \xc0\x80 */ + /* Tcl encodes null character as \xc0\x80. + https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8 */ if (memchr(s, '\xc0', size)) { char *q; const char *e = s + size; @@ -419,6 +420,57 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size) if (buf != NULL) { PyMem_Free(buf); } + if (r == NULL || PyUnicode_KIND(r) == PyUnicode_1BYTE_KIND) { + return r; + } + + /* In CESU-8 non-BMP characters are represented as a surrogate pair, + like in UTF-16, and then each surrogate code point is encoded in UTF-8. + https://en.wikipedia.org/wiki/CESU-8 */ + Py_ssize_t len = PyUnicode_GET_LENGTH(r); + Py_ssize_t i, j; + /* All encoded surrogate characters start with \xED. */ + i = PyUnicode_FindChar(r, 0xdcED, 0, len, 1); + if (i == -2) { + Py_DECREF(r); + return NULL; + } + if (i == -1) { + return r; + } + Py_UCS4 *u = PyUnicode_AsUCS4Copy(r); + Py_DECREF(r); + if (u == NULL) { + return NULL; + } + Py_UCS4 ch; + for (j = i; i < len; i++, u[j++] = ch) { + Py_UCS4 ch1, ch2, ch3, high, low; + /* Low surrogates U+D800 - U+DBFF are encoded as + \xED\xA0\x80 - \xED\xAF\xBF. */ + ch1 = ch = u[i]; + if (ch1 != 0xdcED) continue; + ch2 = u[i + 1]; + if (!(0xdcA0 <= ch2 && ch2 <= 0xdcAF)) continue; + ch3 = u[i + 2]; + if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue; + high = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F); + assert(Py_UNICODE_IS_HIGH_SURROGATE(high)); + /* High surrogates U+DC00 - U+DFFF are encoded as + \xED\xB0\x80 - \xED\xBF\xBF. */ + ch1 = u[i + 3]; + if (ch1 != 0xdcED) continue; + ch2 = u[i + 4]; + if (!(0xdcB0 <= ch2 && ch2 <= 0xdcBF)) continue; + ch3 = u[i + 5]; + if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue; + low = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F); + assert(Py_UNICODE_IS_HIGH_SURROGATE(high)); + ch = Py_UNICODE_JOIN_SURROGATES(high, low); + i += 5; + } + r = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, u, j); + PyMem_Free(u); return r; }