diff --git a/Include/fileutils.h b/Include/fileutils.h index e4bf6d4db95..370878469df 100644 --- a/Include/fileutils.h +++ b/Include/fileutils.h @@ -170,6 +170,11 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric( #endif /* Py_LIMITED_API */ + +#ifdef Py_BUILD_CORE +PyAPI_FUNC(int) _Py_GetForceASCII(void); +#endif + #ifdef __cplusplus } #endif diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-08-28-10-49-55.bpo-34403.4Q3LzP.rst b/Misc/NEWS.d/next/Core and Builtins/2018-08-28-10-49-55.bpo-34403.4Q3LzP.rst new file mode 100644 index 00000000000..d70be82b0ab --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2018-08-28-10-49-55.bpo-34403.4Q3LzP.rst @@ -0,0 +1,3 @@ +On HP-UX with C or POSIX locale, sys.getfilesystemencoding() now returns +"ascii" instead of "roman8" (when the UTF-8 Mode is disabled and the C locale +is not coerced). diff --git a/Python/coreconfig.c b/Python/coreconfig.c index acf46451f15..d08d75b196a 100644 --- a/Python/coreconfig.c +++ b/Python/coreconfig.c @@ -828,18 +828,21 @@ config_read_complex_options(_PyCoreConfig *config) static void config_init_locale(_PyCoreConfig *config) { - if (_Py_LegacyLocaleDetected()) { + if (config->coerce_c_locale < 0) { /* The C locale enables the C locale coercion (PEP 538) */ - if (config->coerce_c_locale < 0) { + if (_Py_LegacyLocaleDetected()) { config->coerce_c_locale = 1; } } + #ifndef MS_WINDOWS - const char *ctype_loc = setlocale(LC_CTYPE, NULL); - if (ctype_loc != NULL - && (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "POSIX") == 0)) { + if (config->utf8_mode < 0) { /* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */ - if (config->utf8_mode < 0) { + const char *ctype_loc = setlocale(LC_CTYPE, NULL); + if (ctype_loc != NULL + && (strcmp(ctype_loc, "C") == 0 + || strcmp(ctype_loc, "POSIX") == 0)) + { config->utf8_mode = 1; } } diff --git a/Python/fileutils.c b/Python/fileutils.c index b413f4e1e68..e756c260cdc 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -72,8 +72,8 @@ 
_Py_device_encoding(int fd) extern int _Py_normalize_encoding(const char *, char *, size_t); -/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale. - On these operating systems, nl_langinfo(CODESET) announces an alias of the +/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale + and POSIX locale. nl_langinfo(CODESET) announces an alias of the ASCII encoding, whereas mbstowcs() and wcstombs() functions use the ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use locale.getpreferredencoding() codec. For example, if command line arguments @@ -86,6 +86,10 @@ extern int _Py_normalize_encoding(const char *, char *, size_t); workaround is also enabled on error, for example if getting the locale failed. + On HP-UX with the C locale or the POSIX locale, nl_langinfo(CODESET) + announces "roman8" but mbstowcs() uses Latin1 in practice. Force also the + ASCII encoding in this case. + Values of force_ascii: 1: the workaround is used: Py_EncodeLocale() uses @@ -100,13 +104,46 @@ static int force_ascii = -1; static int check_force_ascii(void) { - char *loc; + char *loc = setlocale(LC_CTYPE, NULL); + if (loc == NULL) { + goto error; + } + if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) { + /* the LC_CTYPE locale is different than C and POSIX */ + return 0; + } + #if defined(HAVE_LANGINFO_H) && defined(CODESET) - char *codeset, **alias; + const char *codeset = nl_langinfo(CODESET); + if (!codeset || codeset[0] == '\0') { + /* CODESET is not set or empty */ + goto error; + } + char encoding[20]; /* longest name: "iso_646.irv_1991\0" */ - int is_ascii; - unsigned int i; - char* ascii_aliases[] = { + if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) { + goto error; + } + +#ifdef __hpux + if (strcmp(encoding, "roman8") == 0) { + unsigned char ch; + wchar_t wch; + size_t res; + + ch = (unsigned char)0xA7; + res = mbstowcs(&wch, (char*)&ch, 1); + if (res != (size_t)-1 && wch == L'\xA7') { + 
/* On HP-UX with the C locale or the POSIX locale, + nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses + Latin1 encoding in practice. Force ASCII in this case. + + Roman8 decodes 0xA7 to U+00CF. Latin1 decodes 0xA7 to U+00A7. */ + return 1; + } + } +#else + const char* ascii_aliases[] = { "ascii", /* Aliases from Lib/encodings/aliases.py */ "646", @@ -123,27 +160,9 @@ check_force_ascii(void) "us_ascii", NULL }; -#endif - loc = setlocale(LC_CTYPE, NULL); - if (loc == NULL) - goto error; - if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) { - /* the LC_CTYPE locale is different than C */ - return 0; - } - -#if defined(HAVE_LANGINFO_H) && defined(CODESET) - codeset = nl_langinfo(CODESET); - if (!codeset || codeset[0] == '\0') { - /* CODESET is not set or empty */ - goto error; - } - if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) - goto error; - - is_ascii = 0; - for (alias=ascii_aliases; *alias != NULL; alias++) { + int is_ascii = 0; + for (const char **alias=ascii_aliases; *alias != NULL; alias++) { if (strcmp(encoding, *alias) == 0) { is_ascii = 1; break; @@ -154,13 +173,14 @@ check_force_ascii(void) return 0; } - for (i=0x80; i<0xff; i++) { - unsigned char ch; - wchar_t wch; + for (unsigned int i=0x80; i<=0xff; i++) { + char ch[1]; + wchar_t wch[1]; size_t res; - ch = (unsigned char)i; - res = mbstowcs(&wch, (char*)&ch, 1); + unsigned uch = (unsigned char)i; + ch[0] = (char)uch; + res = mbstowcs(wch, ch, 1); if (res != (size_t)-1) { /* decoding a non-ASCII character from the locale encoding succeed: the locale encoding is not ASCII, force ASCII */ @@ -169,17 +189,29 @@ check_force_ascii(void) } /* None of the bytes in the range 0x80-0xff can be decoded from the locale encoding: the locale encoding is really ASCII */ +#endif /* !defined(__hpux) */ return 0; #else /* nl_langinfo(CODESET) is not available: always force ASCII */ return 1; -#endif +#endif /* defined(HAVE_LANGINFO_H) && defined(CODESET) */ error: /* if an error 
occurred, force the ASCII encoding */ return 1; } + +int +_Py_GetForceASCII(void) +{ + if (force_ascii == -1) { + force_ascii = check_force_ascii(); + } + return force_ascii; +} + + static int encode_ascii(const wchar_t *text, char **str, size_t *error_pos, const char **reason, @@ -234,6 +266,12 @@ encode_ascii(const wchar_t *text, char **str, *str = result; return 0; } +#else +int +_Py_GetForceASCII(void) +{ + return 0; +} #endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */ diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 28704c1c228..cc64cf956d2 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1576,21 +1576,25 @@ initfsencoding(PyInterpreterState *interp) Py_FileSystemDefaultEncodeErrors = "surrogatepass"; } #else - if (Py_FileSystemDefaultEncoding == NULL && - interp->core_config.utf8_mode) - { - Py_FileSystemDefaultEncoding = "utf-8"; - Py_HasFileSystemDefaultEncoding = 1; - } - else if (Py_FileSystemDefaultEncoding == NULL) { - Py_FileSystemDefaultEncoding = get_locale_encoding(); - if (Py_FileSystemDefaultEncoding == NULL) { - return _Py_INIT_ERR("Unable to get the locale encoding"); + if (Py_FileSystemDefaultEncoding == NULL) { + if (interp->core_config.utf8_mode) { + Py_FileSystemDefaultEncoding = "utf-8"; + Py_HasFileSystemDefaultEncoding = 1; } + else if (_Py_GetForceASCII()) { + Py_FileSystemDefaultEncoding = "ascii"; + Py_HasFileSystemDefaultEncoding = 1; + } + else { + Py_FileSystemDefaultEncoding = get_locale_encoding(); + if (Py_FileSystemDefaultEncoding == NULL) { + return _Py_INIT_ERR("Unable to get the locale encoding"); + } - Py_HasFileSystemDefaultEncoding = 0; - interp->fscodec_initialized = 1; - return _Py_INIT_OK(); + Py_HasFileSystemDefaultEncoding = 0; + interp->fscodec_initialized = 1; + return _Py_INIT_OK(); + } } #endif