mirror of https://github.com/python/cpython
Add a workaround for a problem where UTF-8 strings can be corrupted
or broken by basic ctype functions in 4.4BSD descendants. This will be fixed in their future development branches, but they will keep the POSIX incompatibility for backward compatibility in the near future.
This commit is contained in:
parent
6db15d7307
commit
b5047fd019
|
@ -411,6 +411,39 @@ extern int fdatasync(int);
|
||||||
extern double hypot(double, double);
|
extern double hypot(double, double);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/*******************************************************************
|
||||||
|
On 4.4BSD-descendants, ctype functions serve the whole range of
|
||||||
|
wchar_t character set rather than single byte code points only.
|
||||||
|
This characteristic can break some operations of string objects
|
||||||
|
including str.upper() and str.split() on UTF-8 locales. This
|
||||||
|
workaround was provided by Tim Robbins of the FreeBSD project. He said
|
||||||
|
the incompatibility will be fixed in FreeBSD 6.
|
||||||
|
********************************************************************/
|
||||||
|
|
||||||
|
#ifdef __FreeBSD__
|
||||||
|
#include <osreldate.h>
|
||||||
|
#if __FreeBSD_version > 500039
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <wctype.h>
|
||||||
|
#undef isalnum
|
||||||
|
#define isalnum(c) iswalnum(btowc(c))
|
||||||
|
#undef isalpha
|
||||||
|
#define isalpha(c) iswalpha(btowc(c))
|
||||||
|
#undef islower
|
||||||
|
#define islower(c) iswlower(btowc(c))
|
||||||
|
#undef isspace
|
||||||
|
#define isspace(c) iswspace(btowc(c))
|
||||||
|
#undef isupper
|
||||||
|
#define isupper(c) iswupper(btowc(c))
|
||||||
|
#undef tolower
|
||||||
|
#define tolower(c) towlower(btowc(c))
|
||||||
|
#undef toupper
|
||||||
|
#define toupper(c) towupper(btowc(c))
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/* Declarations for symbol visibility.
|
/* Declarations for symbol visibility.
|
||||||
|
|
||||||
PyAPI_FUNC(type): Declares a public Python API function and return type
|
PyAPI_FUNC(type): Declares a public Python API function and return type
|
||||||
|
|
|
@ -47,3 +47,38 @@ try:
|
||||||
locale.getpreferredencoding()
|
locale.getpreferredencoding()
|
||||||
finally:
|
finally:
|
||||||
locale.setlocale(locale.LC_NUMERIC, oldlocale)
|
locale.setlocale(locale.LC_NUMERIC, oldlocale)
|
||||||
|
|
||||||
|
|
||||||
|
# Test BSD Rune locale's bug for isctype functions.
|
||||||
|
def teststrop(s, method, output):
|
||||||
|
if verbose:
|
||||||
|
print "%s.%s() =? %s ..." % (repr(s), method, repr(output)),
|
||||||
|
result = getattr(s, method)()
|
||||||
|
if result != output:
|
||||||
|
if verbose:
|
||||||
|
print "no"
|
||||||
|
print "%s.%s() == %s != %s" % (repr(s), method, repr(result),
|
||||||
|
repr(output))
|
||||||
|
elif verbose:
|
||||||
|
print "yes"
|
||||||
|
|
||||||
|
try:
|
||||||
|
oldlocale = locale.setlocale(locale.LC_CTYPE)
|
||||||
|
locale.setlocale(locale.LC_CTYPE, 'en_US.UTF-8')
|
||||||
|
except locale.Error:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
teststrop('\x20', 'isspace', True)
|
||||||
|
teststrop('\xa0', 'isspace', False)
|
||||||
|
teststrop('\xa1', 'isspace', False)
|
||||||
|
teststrop('\xc0', 'isalpha', False)
|
||||||
|
teststrop('\xc0', 'isalnum', False)
|
||||||
|
teststrop('\xc0', 'isupper', False)
|
||||||
|
teststrop('\xc0', 'islower', False)
|
||||||
|
teststrop('\xec\xa0\xbc', 'split', ['\xec\xa0\xbc'])
|
||||||
|
teststrop('\xed\x95\xa0', 'strip', '\xed\x95\xa0')
|
||||||
|
teststrop('\xcc\x85', 'lower', '\xcc\x85')
|
||||||
|
teststrop('\xed\x95\xa0', 'upper', '\xed\x95\xa0')
|
||||||
|
finally:
|
||||||
|
locale.setlocale(locale.LC_CTYPE, oldlocale)
|
||||||
|
|
|
@ -64,6 +64,9 @@ Core and builtins
|
||||||
|
|
||||||
- Implemented bind_textdomain_codeset() in locale module.
|
- Implemented bind_textdomain_codeset() in locale module.
|
||||||
|
|
||||||
|
- Added a workaround for proper string operations in BSDs. str.split
|
||||||
|
and str.is* methods can now work correctly with UTF-8 locales.
|
||||||
|
|
||||||
Extension modules
|
Extension modules
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue