diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index ba73e562acf..d7c9fa773c7 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -224,7 +224,7 @@ typedef struct { PyUnicode_4BYTE_KIND * compact = 1 * ready = 1 - * (ascii = 0) + * ascii = 0 - string created by the legacy API (not ready): @@ -236,7 +236,7 @@ typedef struct { * data.any is NULL * utf8 is NULL * interned = SSTATE_NOT_INTERNED - * (ascii = 0) + * ascii = 0 - string created by the legacy API, ready: @@ -246,7 +246,6 @@ typedef struct { * compact = 0 * ready = 1 * data.any is not NULL - * (ascii = 0) String created by the legacy API becomes ready when calling PyUnicode_READY(). @@ -278,8 +277,9 @@ typedef struct { one block for the PyUnicodeObject struct and another for its data buffer. */ unsigned int compact:1; - /* Compact objects which are ASCII-only also have the state.compact - flag set, and use the PyASCIIObject struct. */ + /* kind is PyUnicode_1BYTE_KIND but data contains only ASCII + characters. If ascii is 1 and compact is 1, use the PyASCIIObject + structure. */ unsigned int ascii:1; /* The ready flag indicates whether the object layout is initialized completely. This means that this is either a compact object, or @@ -304,7 +304,7 @@ typedef struct { /* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the PyUnicodeObject structure. The actual string data is initially in the wstr - block, and copied into the data block using PyUnicode_Ready. */ + block, and copied into the data block using _PyUnicode_Ready. */ typedef struct { PyCompactUnicodeObject _base; union { @@ -327,7 +327,7 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; #ifndef Py_LIMITED_API #define PyUnicode_WSTR_LENGTH(op) \ - (((PyASCIIObject*)op)->state.ascii ? \ + (PyUnicode_IS_COMPACT_ASCII(op) ? 
\ ((PyASCIIObject*)op)->length : \ ((PyCompactUnicodeObject*)op)->wstr_length) @@ -369,10 +369,24 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; #define SSTATE_INTERNED_MORTAL 1 #define SSTATE_INTERNED_IMMORTAL 2 -#define PyUnicode_IS_COMPACT_ASCII(op) (((PyASCIIObject*)op)->state.ascii) +/* Return true if the string contains only ASCII characters, or 0 if not. The + string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks + or Ready calls are performed. */ +#define PyUnicode_IS_ASCII(op) \ + (((PyASCIIObject*)(op))->state.ascii) + +/* Return true if the string is compact or 0 if not. + No type checks or Ready calls are performed. */ +#define PyUnicode_IS_COMPACT(op) \ + (((PyASCIIObject*)(op))->state.compact) + +/* Return true if the string is a compact ASCII string (use PyASCIIObject + structure), or 0 if not. No type checks or Ready calls are performed. */ +#define PyUnicode_IS_COMPACT_ASCII(op) \ + (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op)) /* String contains only wstr byte characters. This is only possible - when the string was created with a legacy API and PyUnicode_Ready() + when the string was created with a legacy API and _PyUnicode_Ready() has not been called yet. */ #define PyUnicode_WCHAR_KIND 0 @@ -399,11 +413,6 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; #define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op)) #define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op)) -/* Return true if the string is compact or 0 if not. - No type checks or Ready calls are performed. */ -#define PyUnicode_IS_COMPACT(op) \ - (((PyASCIIObject*)(op))->state.compact) - /* Return one of the PyUnicode_*_KIND values defined above. 
*/ #define PyUnicode_KIND(op) \ (assert(PyUnicode_Check(op)), \ @@ -500,9 +509,9 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; #define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready) -/* PyUnicode_READY() does less work than PyUnicode_Ready() in the best +/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best case. If the canonical representation is not yet set, it will still call - PyUnicode_Ready(). + _PyUnicode_Ready(). Returns 0 on success and -1 on errors. */ #define PyUnicode_READY(op) \ (assert(PyUnicode_Check(op)), \ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7d604850edd..cc6b41697d5 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -288,16 +288,14 @@ _PyUnicode_CheckConsistency(void *op) ascii = (PyASCIIObject *)op; kind = ascii->state.kind; - if (ascii->state.ascii == 1) { + if (ascii->state.ascii == 1 && ascii->state.compact == 1) { assert(kind == PyUnicode_1BYTE_KIND); - assert(ascii->state.compact == 1); assert(ascii->state.ready == 1); } else if (ascii->state.compact == 1) { assert(kind == PyUnicode_1BYTE_KIND || kind == PyUnicode_2BYTE_KIND || kind == PyUnicode_4BYTE_KIND); - assert(ascii->state.compact == 1); assert(ascii->state.ascii == 0); assert(ascii->state.ready == 1); } else { @@ -305,9 +303,9 @@ _PyUnicode_CheckConsistency(void *op) PyUnicodeObject *unicode = (PyUnicodeObject *)op; if (kind == PyUnicode_WCHAR_KIND) { - assert(!ascii->state.compact == 1); + assert(ascii->state.compact == 0); assert(ascii->state.ascii == 0); - assert(!ascii->state.ready == 1); + assert(ascii->state.ready == 0); assert(ascii->wstr != NULL); assert(unicode->data.any == NULL); assert(compact->utf8 == NULL); @@ -317,10 +315,9 @@ _PyUnicode_CheckConsistency(void *op) assert(kind == PyUnicode_1BYTE_KIND || kind == PyUnicode_2BYTE_KIND || kind == PyUnicode_4BYTE_KIND); - assert(!ascii->state.compact == 1); + assert(ascii->state.compact == 0); assert(ascii->state.ready == 1); 
assert(unicode->data.any != NULL); - assert(ascii->state.ascii == 0); } } return 1; @@ -638,7 +635,7 @@ unicode_kind_name(PyObject *unicode) switch(PyUnicode_KIND(unicode)) { case PyUnicode_1BYTE_KIND: - if (PyUnicode_IS_COMPACT_ASCII(unicode)) + if (PyUnicode_IS_ASCII(unicode)) return "legacy ascii"; else return "legacy latin1"; @@ -654,14 +651,14 @@ unicode_kind_name(PyObject *unicode) switch(PyUnicode_KIND(unicode)) { case PyUnicode_1BYTE_KIND: - if (PyUnicode_IS_COMPACT_ASCII(unicode)) + if (PyUnicode_IS_ASCII(unicode)) return "ascii"; else - return "compact latin1"; + return "latin1"; case PyUnicode_2BYTE_KIND: - return "compact UCS2"; + return "UCS2"; case PyUnicode_4BYTE_KIND: - return "compact UCS4"; + return "UCS4"; default: return ""; } @@ -703,7 +700,7 @@ _PyUnicode_Dump(PyObject *op) if (ascii->wstr == data) printf("shared "); printf("wstr=%p", ascii->wstr); - if (!ascii->state.ascii) { + if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { printf(" (%zu), ", compact->wstr_length); if (!ascii->state.compact && compact->utf8 == unicode->data.any) printf("shared "); @@ -954,9 +951,9 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, /* check if max_char(from substring) <= max_char(to) */ if (from_kind > to_kind /* latin1 => ascii */ - || (PyUnicode_IS_COMPACT_ASCII(to) + || (PyUnicode_IS_ASCII(to) && to_kind == PyUnicode_1BYTE_KIND - && !PyUnicode_IS_COMPACT_ASCII(from))) + && !PyUnicode_IS_ASCII(from))) { /* slow path to check for character overflow */ const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); @@ -1115,10 +1112,12 @@ unicode_ready(PyObject **p_obj, int replace) _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; if (maxchar < 128) { + _PyUnicode_STATE(unicode).ascii = 1; _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); } else { + _PyUnicode_STATE(unicode).ascii = 0; 
_PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; } diff --git a/Tools/gdb/libpython.py b/Tools/gdb/libpython.py index b5183d45fcb..4b42c8bff83 100644 --- a/Tools/gdb/libpython.py +++ b/Tools/gdb/libpython.py @@ -1132,15 +1132,16 @@ class PyUnicodeObjectPtr(PyObjectPtr): compact = self.field('_base') ascii = compact['_base'] state = ascii['state'] + is_compact_ascii = (int(state['ascii']) and int(state['compact'])) field_length = long(ascii['length']) if not int(state['ready']): # string is not ready may_have_surrogates = True field_str = ascii['wstr'] - if not int(state['ascii']): + if not is_compact_ascii: - field_length = compact('wstr_length') + field_length = long(compact['wstr_length']) else: - if int(state['ascii']): + if is_compact_ascii: field_str = ascii.address + 1 elif int(state['compact']): field_str = compact.address + 1