Issue #3280: like chr() already does, the "%c" format now accepts the full
Unicode range even on "narrow Unicode" builds; the result is a pair of UTF-16
surrogates.
2008-07-04 21:26:43 +00:00 · 2008-07-04 21:26:43 +00:00 · a4db68622c
parent 142957ce95
commit a4db68622c
4 changed files with 43 additions and 27 deletions
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -717,7 +717,10 @@ class UnicodeTest(
        self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')

        self.assertEqual('%c' % 0x1234, '\u1234')
-        self.assertRaises(OverflowError, "%c".__mod__, (sys.maxunicode+1,))
+        self.assertEqual('%c' % 0x21483, '\U00021483')
+        self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
+        self.assertEqual('%c' % '\U00021483', '\U00021483')
+        self.assertRaises(TypeError, "%c".__mod__, "aa")

        # formatting jobs delegated from the string implementation:
        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,11 @@ What's new in Python 3.0b2?
 Core and Builtins
 -----------------

+- Issue #3280: like chr(), the "%c" format now accepts unicode code points
+  beyond the Basic Multilingual Plane (above 0xffff) on all configurations. On
+  "narrow Unicode" builds, the result is a string of 2 code units, forming a
+  UTF-16 surrogate pair.
+
 - Issue #3282: str.isprintable() should return False for undefined
  Unicode characters.

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -8730,11 +8730,28 @@ formatchar(Py_UNICODE *buf,
           size_t buflen,
           PyObject *v)
 {
-    /* presume that the buffer is at least 2 characters long */
+    /* presume that the buffer is at least 3 characters long */
    if (PyUnicode_Check(v)) {
-	if (PyUnicode_GET_SIZE(v) != 1)
-	    goto onError;
-	buf[0] = PyUnicode_AS_UNICODE(v)[0];
+	if (PyUnicode_GET_SIZE(v) == 1) {
+	    buf[0] = PyUnicode_AS_UNICODE(v)[0];
+	    buf[1] = '\0';
+	    return 1;
+	}
+#ifndef Py_UNICODE_WIDE
+	if (PyUnicode_GET_SIZE(v) == 2) {
+	    /* Decode a valid surrogate pair */
+	    int c0 = PyUnicode_AS_UNICODE(v)[0];
+	    int c1 = PyUnicode_AS_UNICODE(v)[1];
+	    if (0xD800 <= c0 && c0 <= 0xDBFF &&
+		0xDC00 <= c1 && c1 <= 0xDFFF) {
+		buf[0] = c0;
+		buf[1] = c1;
+		buf[2] = '\0';
+		return 2;
+	    }
+	}
+#endif
+	goto onError;
    }
    else {
 	/* Integer input truncated to a character */
@@ -8742,25 +8759,25 @@ formatchar(Py_UNICODE *buf,
 	x = PyLong_AsLong(v);
 	if (x == -1 && PyErr_Occurred())
 	    goto onError;
-#ifdef Py_UNICODE_WIDE
+
 	if (x < 0 || x > 0x10ffff) {
 	    PyErr_SetString(PyExc_OverflowError,
-			    "%c arg not in range(0x110000) "
-			    "(wide Python build)");
+			    "%c arg not in range(0x110000)");
 	    return -1;
 	}
-#else
-	if (x < 0 || x > 0xffff) {
-	    PyErr_SetString(PyExc_OverflowError,
-			    "%c arg not in range(0x10000) "
-			    "(narrow Python build)");
-	    return -1;
+
+#ifndef Py_UNICODE_WIDE
+	if (x > 0xffff) {
+	    x -= 0x10000;
+	    buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
+	    buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
+	    return 2;
 	}
 #endif
 	buf[0] = (Py_UNICODE) x;
+	buf[1] = '\0';
+	return 1;
    }
-    buf[1] = '\0';
-    return 1;

 onError:
    PyErr_SetString(PyExc_TypeError,
--- a/Python/modsupport.c
+++ b/Python/modsupport.c
@@ -294,21 +294,12 @@ do_mkvalue(const char **p_format, va_list *p_va, int flags)
 		case 'C':
 		{
 			int i = va_arg(*p_va, int);
-			Py_UNICODE c;
 			if (i < 0 || i > PyUnicode_GetMax()) {
-#ifdef Py_UNICODE_WIDE
 				PyErr_SetString(PyExc_OverflowError,
-				                "%c arg not in range(0x110000) "
-				                "(wide Python build)");
-#else
-				PyErr_SetString(PyExc_OverflowError,
-				                "%c arg not in range(0x10000) "
-				                "(narrow Python build)");
-#endif
+				                "%c arg not in range(0x110000)");
 				return NULL;
 			}
-			c = i;
-			return PyUnicode_FromUnicode(&c, 1);
+			return PyUnicode_FromOrdinal(i);
 		}

 		case 's':