Add a workaround for a problem where UTF-8 strings can be corrupted

or broken by basic ctype functions in 4.4BSD descendants. This will be fixed in their future development branches, but they will keep the POSIX incompatibility for backward compatibility in the near future.
2004-08-04 06:33:51 +00:00 · 2004-08-04 06:33:51 +00:00 · b5047fd019
parent 6db15d7307
commit b5047fd019
3 changed files with 71 additions and 0 deletions
--- a/Include/pyport.h
+++ b/Include/pyport.h
@ -411,6 +411,39 @@ extern int fdatasync(int);
 extern double hypot(double, double);
 #endif

+
+/*******************************************************************
+On 4.4BSD descendants, ctype functions serve the whole range of
+the wchar_t character set rather than single-byte code points only.
+This characteristic can break some operations of string objects,
+including str.upper() and str.split(), on UTF-8 locales.  This
+workaround was provided by Tim Robbins of the FreeBSD project.  He said
+the incompatibility will be fixed in FreeBSD 6.
+********************************************************************/
+
+#ifdef __FreeBSD__
+#include <osreldate.h>
+#if __FreeBSD_version > 500039
+#include <ctype.h>
+#include <wctype.h>
+#undef isalnum
+#define isalnum(c) iswalnum(btowc(c))
+#undef isalpha
+#define isalpha(c) iswalpha(btowc(c))
+#undef islower
+#define islower(c) iswlower(btowc(c))
+#undef isspace
+#define isspace(c) iswspace(btowc(c))
+#undef isupper
+#define isupper(c) iswupper(btowc(c))
+#undef tolower
+#define tolower(c) towlower(btowc(c))
+#undef toupper
+#define toupper(c) towupper(btowc(c))
+#endif
+#endif
+
+
 /* Declarations for symbol visibility.

  PyAPI_FUNC(type): Declares a public Python API function and return type
--- a/Lib/test/test_locale.py
+++ b/Lib/test/test_locale.py
@ -47,3 +47,38 @@ try:
    locale.getpreferredencoding()
 finally:
    locale.setlocale(locale.LC_NUMERIC, oldlocale)
+
+
+# Test BSD Rune locale's bug for isctype functions.
+def teststrop(s, method, output):
+    if verbose:
+        print "%s.%s() =? %s ..." % (repr(s), method, repr(output)),
+    result = getattr(s, method)()
+    if result != output:
+        if verbose:
+            print "no"
+        print "%s.%s() == %s != %s" % (repr(s), method, repr(result),
+                                       repr(output))
+    elif verbose:
+        print "yes"
+
+try:
+    oldlocale = locale.setlocale(locale.LC_CTYPE)
+    locale.setlocale(locale.LC_CTYPE, 'en_US.UTF-8')
+except locale.Error:
+    pass
+else:
+    try:
+        teststrop('\x20', 'isspace', True)
+        teststrop('\xa0', 'isspace', False)
+        teststrop('\xa1', 'isspace', False)
+        teststrop('\xc0', 'isalpha', False)
+        teststrop('\xc0', 'isalnum', False)
+        teststrop('\xc0', 'isupper', False)
+        teststrop('\xc0', 'islower', False)
+        teststrop('\xec\xa0\xbc', 'split', ['\xec\xa0\xbc'])
+        teststrop('\xed\x95\xa0', 'strip', '\xed\x95\xa0')
+        teststrop('\xcc\x85', 'lower', '\xcc\x85')
+        teststrop('\xed\x95\xa0', 'upper', '\xed\x95\xa0')
+    finally:
+        locale.setlocale(locale.LC_CTYPE, oldlocale)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -64,6 +64,9 @@ Core and builtins

 - Implemented bind_textdomain_codeset() in locale module.

+- Added a workaround for proper string operations on BSDs.  str.split
+  and str.is* methods now work correctly with UTF-8 locales.
+
 Extension modules
 -----------------