From b5047fd01948ab108edcc1b3c2c901d915814cfd Mon Sep 17 00:00:00 2001 From: Hye-Shik Chang Date: Wed, 4 Aug 2004 06:33:51 +0000 Subject: [PATCH] Add a workaround for a problem that UTF-8 strings can be corrupted or broken by basic ctype functions in 4.4BSD descendants. This will be fixed in their future development branches but they'll keep the POSIX-incompatibility for their backward compatibility in the near future. --- Include/pyport.h | 33 +++++++++++++++++++++++++++++++++ Lib/test/test_locale.py | 35 +++++++++++++++++++++++++++++++++++ Misc/NEWS | 3 +++ 3 files changed, 71 insertions(+) diff --git a/Include/pyport.h b/Include/pyport.h index 0ee42f0f4ed..b20bc15291f 100644 --- a/Include/pyport.h +++ b/Include/pyport.h @@ -411,6 +411,39 @@ extern int fdatasync(int); extern double hypot(double, double); #endif + +/******************************************************************* +On 4.4BSD-descendants, ctype functions serve the whole range of +wchar_t character set rather than single byte code points only. +This characteristic can break some operations of string object +including str.upper() and str.split() on UTF-8 locales. This +workaround was provided by Tim Robbins of FreeBSD project. He said +the incompatibility will be fixed in FreeBSD 6. +********************************************************************/ + +#ifdef __FreeBSD__ +#include <osreldate.h> +#if __FreeBSD_version > 500039 +#include <ctype.h> +#include <wctype.h> +#undef isalnum +#define isalnum(c) iswalnum(btowc(c)) +#undef isalpha +#define isalpha(c) iswalpha(btowc(c)) +#undef islower +#define islower(c) iswlower(btowc(c)) +#undef isspace +#define isspace(c) iswspace(btowc(c)) +#undef isupper +#define isupper(c) iswupper(btowc(c)) +#undef tolower +#define tolower(c) towlower(btowc(c)) +#undef toupper +#define toupper(c) towupper(btowc(c)) +#endif +#endif + + /* Declarations for symbol visibility.
PyAPI_FUNC(type): Declares a public Python API function and return type diff --git a/Lib/test/test_locale.py b/Lib/test/test_locale.py index 9187c9e84b5..d8f79250525 100644 --- a/Lib/test/test_locale.py +++ b/Lib/test/test_locale.py @@ -47,3 +47,38 @@ try: locale.getpreferredencoding() finally: locale.setlocale(locale.LC_NUMERIC, oldlocale) + + +# Test BSD Rune locale's bug for isctype functions. +def teststrop(s, method, output): + if verbose: + print "%s.%s() =? %s ..." % (repr(s), method, repr(output)), + result = getattr(s, method)() + if result != output: + if verbose: + print "no" + print "%s.%s() == %s != %s" % (repr(s), method, repr(result), + repr(output)) + elif verbose: + print "yes" + +try: + oldlocale = locale.setlocale(locale.LC_CTYPE) + locale.setlocale(locale.LC_CTYPE, 'en_US.UTF-8') +except locale.Error: + pass +else: + try: + teststrop('\x20', 'isspace', True) + teststrop('\xa0', 'isspace', False) + teststrop('\xa1', 'isspace', False) + teststrop('\xc0', 'isalpha', False) + teststrop('\xc0', 'isalnum', False) + teststrop('\xc0', 'isupper', False) + teststrop('\xc0', 'islower', False) + teststrop('\xec\xa0\xbc', 'split', ['\xec\xa0\xbc']) + teststrop('\xed\x95\xa0', 'strip', '\xed\x95\xa0') + teststrop('\xcc\x85', 'lower', '\xcc\x85') + teststrop('\xed\x95\xa0', 'upper', '\xed\x95\xa0') + finally: + locale.setlocale(locale.LC_CTYPE, oldlocale) diff --git a/Misc/NEWS b/Misc/NEWS index c450f63dcbe..40c58b3d0e4 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -64,6 +64,9 @@ Core and builtins - Implemented bind_textdomain_codeset() in locale module. +- Added a workaround for proper string operations in BSDs. str.split + and str.is* methods can now work correctly with UTF-8 locales. + Extension modules -----------------