Add a workaround for a problem where UTF-8 strings can be corrupted

or broken by basic ctype functions in 4.4BSD descendants.  This
will be fixed in their future development branches, but they will keep
the POSIX incompatibility for backward compatibility in the near
future.
This commit is contained in:
Hye-Shik Chang 2004-08-04 06:33:51 +00:00
parent 6db15d7307
commit b5047fd019
3 changed files with 71 additions and 0 deletions

View File

@ -411,6 +411,39 @@ extern int fdatasync(int);
extern double hypot(double, double);
#endif
/*******************************************************************
On 4.4BSD descendants, the ctype functions serve the whole range of
the wchar_t character set rather than single-byte code points only.
This characteristic can break some operations of string objects,
including str.upper() and str.split(), in UTF-8 locales. This
workaround was provided by Tim Robbins of the FreeBSD project. He said
the incompatibility will be fixed in FreeBSD 6.
********************************************************************/
#ifdef __FreeBSD__
#include <osreldate.h>
/* Only FreeBSD versions newer than 500039 get the replacements below. */
#if __FreeBSD_version > 500039
#include <ctype.h>
#include <wchar.h>  /* declares btowc(), used by every macro below */
#include <wctype.h>
/* Replace the single-byte classification and case-conversion macros.
   Each one first converts its argument to a wide character with btowc()
   and then applies the wide-character counterpart (isw* / tow*).
   Any definition that <ctype.h> provides is removed first with #undef so
   the replacement does not clash with it. */
#undef isalnum
#define isalnum(c) iswalnum(btowc(c))
#undef isalpha
#define isalpha(c) iswalpha(btowc(c))
#undef islower
#define islower(c) iswlower(btowc(c))
#undef isspace
#define isspace(c) iswspace(btowc(c))
#undef isupper
#define isupper(c) iswupper(btowc(c))
#undef tolower
#define tolower(c) towlower(btowc(c))
#undef toupper
#define toupper(c) towupper(btowc(c))
#endif
#endif
/* Declarations for symbol visibility.
PyAPI_FUNC(type): Declares a public Python API function and return type

View File

@ -47,3 +47,38 @@ try:
locale.getpreferredencoding()
finally:
locale.setlocale(locale.LC_NUMERIC, oldlocale)
# Test BSD Rune locale's bug for isctype functions.
def teststrop(s, method, output):
if verbose:
print "%s.%s() =? %s ..." % (repr(s), method, repr(output)),
result = getattr(s, method)()
if result != output:
if verbose:
print "no"
print "%s.%s() == %s != %s" % (repr(s), method, repr(result),
repr(output))
elif verbose:
print "yes"
try:
    # Remember the current LC_CTYPE setting, then try to switch to a UTF-8
    # locale.  If that locale cannot be set, the checks are skipped.
    oldlocale = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, 'en_US.UTF-8')
except locale.Error:
    pass
else:
    try:
        # Each entry is (string, method name, expected result).  The
        # checks run in this order.
        for _text, _method, _expected in [
            ('\x20', 'isspace', True),
            ('\xa0', 'isspace', False),
            ('\xa1', 'isspace', False),
            ('\xc0', 'isalpha', False),
            ('\xc0', 'isalnum', False),
            ('\xc0', 'isupper', False),
            ('\xc0', 'islower', False),
            ('\xec\xa0\xbc', 'split', ['\xec\xa0\xbc']),
            ('\xed\x95\xa0', 'strip', '\xed\x95\xa0'),
            ('\xcc\x85', 'lower', '\xcc\x85'),
            ('\xed\x95\xa0', 'upper', '\xed\x95\xa0'),
        ]:
            teststrop(_text, _method, _expected)
    finally:
        # Put back the LC_CTYPE setting that was saved above.
        locale.setlocale(locale.LC_CTYPE, oldlocale)

View File

@ -64,6 +64,9 @@ Core and builtins
- Implemented bind_textdomain_codeset() in locale module.
- Added a workaround for proper string operations in BSDs. str.split
and str.is* methods can now work correctly with UTF-8 locales.
Extension modules
-----------------