Add _PyUnicode_CheckConsistency() macro to help debugging

* Document Unicode string states
* Use _PyUnicode_CheckConsistency() to ensure that objects are always
  consistent.
Author: Victor Stinner
Date:   2011-10-03 03:20:16 +02:00
Commit: 910337b42e
Parent: 4fae54cb0e

2 changed files with 144 additions and 37 deletions

View File

@@ -206,6 +206,52 @@ extern "C" {
immediately follow the structure. utf8_length and wstr_length can be found
in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
/* Unicode strings can be in 4 states:
- compact ascii:
* structure = PyASCIIObject
* kind = PyUnicode_1BYTE_KIND
* compact = 1
* ascii = 1
* ready = 1
* utf8 = data
- compact:
* structure = PyCompactUnicodeObject
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 1
* ready = 1
* (ascii = 0)
- string created by the legacy API (not ready):
* structure = PyUnicodeObject
* kind = PyUnicode_WCHAR_KIND
* compact = 0
* ready = 0
* wstr is not NULL
* data.any is NULL
* utf8 is NULL
* interned = SSTATE_NOT_INTERNED
* (ascii = 0)
- string created by the legacy API, ready:
* structure = PyUnicodeObject structure
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 0
* ready = 1
* data.any is not NULL
* (ascii = 0)
A string created by the legacy API becomes ready when PyUnicode_READY()
is called.
See also _PyUnicode_CheckConsistency(). */
PyObject_HEAD
Py_ssize_t length; /* Number of code points in the string */
Py_hash_t hash; /* Hash value; -1 if not set */
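
The table of states above maps directly onto the flag bits in PyASCIIObject.state. Purely as illustration (not part of this commit), here is a hedged sketch of how each state typically arises, assuming the PyUnicode_New(), PyUnicode_FromUnicode() and PyUnicode_READY() entry points from this header; the helper name show_state() and the function example() are made up:

#include <Python.h>

/* Hypothetical helper: print the state bits documented above. */
static void
show_state(const char *label, PyObject *s)
{
    PyASCIIObject *a = (PyASCIIObject *)s;
    printf("%s: compact=%u ascii=%u ready=%u kind=%u\n",
           label,
           (unsigned int)a->state.compact, (unsigned int)a->state.ascii,
           (unsigned int)a->state.ready, (unsigned int)a->state.kind);
}

static void
example(void)
{
    /* compact ascii: created directly by the new PEP 393 API */
    PyObject *compact = PyUnicode_New(3, 127);

    /* legacy, not ready: only the wstr buffer is allocated */
    PyObject *legacy = PyUnicode_FromUnicode(NULL, 3);

    if (compact == NULL || legacy == NULL)
        goto done;

    show_state("compact", compact);       /* expected: compact=1 ascii=1 ready=1 */
    show_state("legacy", legacy);         /* expected: compact=0 ready=0 */

    /* Fill the legacy wide-char buffer ("wstr is not NULL" above), then
       PyUnicode_READY() converts it to the canonical representation. */
    {
        wchar_t *w = ((PyASCIIObject *)legacy)->wstr;
        w[0] = L'a'; w[1] = L'b'; w[2] = L'c';
        if (PyUnicode_READY(legacy) == -1)
            goto done;
    }
    show_state("legacy, ready", legacy);  /* expected: compact=0 ready=1 */

done:
    Py_XDECREF(compact);
    Py_XDECREF(legacy);
}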

View File

@@ -89,25 +89,16 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
extern "C" {
#endif
/* Generic helper macro to convert characters of different types.
from_type and to_type have to be valid type names, begin and end
are pointers to the source characters which should be of type
"from_type *". to is a pointer of type "to_type *" and points to the
buffer where the result characters are written to. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
do { \
const from_type *iter_; to_type *to_; \
for (iter_ = (begin), to_ = (to_type *)(to); \
iter_ < (end); \
++iter_, ++to_) { \
*to_ = (to_type)*iter_; \
} \
} while (0)
#ifdef Py_DEBUG
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
#define _PyUnicode_UTF8(op) \
(((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op) \
(assert(PyUnicode_Check(op)), \
(assert(_PyUnicode_CHECK(op)), \
assert(PyUnicode_IS_READY(op)), \
PyUnicode_IS_COMPACT_ASCII(op) ? \
((char*)((PyASCIIObject*)(op) + 1)) : \
@@ -115,7 +106,7 @@ extern "C" {
#define _PyUnicode_UTF8_LENGTH(op) \
(((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op) \
(assert(PyUnicode_Check(op)), \
(assert(_PyUnicode_CHECK(op)), \
assert(PyUnicode_IS_READY(op)), \
PyUnicode_IS_COMPACT_ASCII(op) ? \
((PyASCIIObject*)(op))->length : \
@@ -125,22 +116,42 @@ extern "C" {
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op) \
(assert(PyUnicode_Check(op)), \
#define _PyUnicode_KIND(op) \
(assert(_PyUnicode_CHECK(op)), \
((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
(assert(PyUnicode_Check(op)), \
#define _PyUnicode_GET_LENGTH(op) \
(assert(_PyUnicode_CHECK(op)), \
((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
(assert(_PyUnicode_CHECK(op)), \
(PyUnicode_IS_READY(op) ? \
0 : _PyUnicode_Ready((PyObject *)(op))))
/* true if the Unicode object has an allocated UTF-8 memory block
(not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
(assert(PyUnicode_Check(op)), \
(!PyUnicode_IS_COMPACT_ASCII(op) \
&& _PyUnicode_UTF8(op) \
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
(assert(_PyUnicode_CHECK(op)), \
(!PyUnicode_IS_COMPACT_ASCII(op) \
&& _PyUnicode_UTF8(op) \
&& _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
/* Generic helper macro to convert characters of different types.
from_type and to_type have to be valid type names, begin and end
are pointers to the source characters which should be of type
"from_type *". to is a pointer of type "to_type *" and points to the
buffer where the result characters are written to. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
do { \
const from_type *iter_; to_type *to_; \
for (iter_ = (begin), to_ = (to_type *)(to); \
iter_ < (end); \
++iter_, ++to_) { \
*to_ = (to_type)*iter_; \
} \
} while (0)
/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
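
For context only (not part of the diff): the relocated _PyUnicode_CONVERT_BYTES() helper is a plain element-by-element copy between buffers of different code point widths. A minimal hedged usage sketch, assuming the Py_UCS1/Py_UCS2 typedefs from unicodeobject.h; the function name widen_latin1() is made up:

/* Widen an 8-bit (latin-1) buffer into a 16-bit code point buffer. */
static void
widen_latin1(const Py_UCS1 *src, Py_ssize_t len, Py_UCS2 *dst)
{
    _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS2, src, src + len, dst);
}
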
@@ -250,6 +261,57 @@ PyUnicode_GetMax(void)
#endif
}
#ifdef Py_DEBUG
static int
_PyUnicode_CheckConsistency(void *op)
{
PyASCIIObject *ascii;
unsigned int kind;
assert(PyUnicode_Check(op));
ascii = (PyASCIIObject *)op;
kind = ascii->state.kind;
if (ascii->state.ascii == 1) {
assert(kind == PyUnicode_1BYTE_KIND);
assert(ascii->state.compact == 1);
assert(ascii->state.ready == 1);
}
else if (ascii->state.compact == 1) {
assert(kind == PyUnicode_1BYTE_KIND
|| kind == PyUnicode_2BYTE_KIND
|| kind == PyUnicode_4BYTE_KIND);
assert(ascii->state.compact == 1);
assert(ascii->state.ascii == 0);
assert(ascii->state.ready == 1);
} else {
PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
PyUnicodeObject *unicode = (PyUnicodeObject *)op;
if (kind == PyUnicode_WCHAR_KIND) {
assert(!ascii->state.compact == 1);
assert(ascii->state.ascii == 0);
assert(!ascii->state.ready == 1);
assert(ascii->wstr != NULL);
assert(unicode->data.any == NULL);
assert(compact->utf8 == NULL);
assert(ascii->state.interned == SSTATE_NOT_INTERNED);
}
else {
assert(kind == PyUnicode_1BYTE_KIND
|| kind == PyUnicode_2BYTE_KIND
|| kind == PyUnicode_4BYTE_KIND);
assert(!ascii->state.compact == 1);
assert(ascii->state.ready == 1);
assert(unicode->data.any != NULL);
assert(ascii->state.ascii == 0);
}
}
return 1;
}
#endif
/* --- Bloom Filters ----------------------------------------------------- */
/* stuff to implement simple "bloom filters" for Unicode characters.
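
The remaining hunks replace plain PyUnicode_Check() assertions with _PyUnicode_CHECK(). As a hedged illustration of what that buys (the function name touch_string() is made up): in a Py_DEBUG build the macro runs the full _PyUnicode_CheckConsistency() walk added above, while a regular build falls back to the cheap type test.

static void
touch_string(PyObject *op)
{
    /* Py_DEBUG: expands to _PyUnicode_CheckConsistency(op);
       otherwise: expands to PyUnicode_Check(op);
       with NDEBUG the whole assert() is compiled out. */
    assert(_PyUnicode_CHECK(op));

    /* ... PyUnicode accessor macros can now be used safely ... */
}
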
@@ -542,7 +604,7 @@ _PyUnicode_New(Py_ssize_t length)
static const char*
unicode_kind_name(PyObject *unicode)
{
assert(PyUnicode_Check(unicode));
assert(_PyUnicode_CHECK(unicode));
if (!PyUnicode_IS_COMPACT(unicode))
{
if (!PyUnicode_IS_READY(unicode))
@@ -744,7 +806,8 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
const wchar_t *iter;
Py_UCS4 *ucs4_out;
assert(unicode && PyUnicode_Check(unicode));
assert(unicode != NULL);
assert(_PyUnicode_CHECK(unicode));
assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
ucs4_out = PyUnicode_4BYTE_DATA(unicode);
@@ -771,7 +834,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
static int
_PyUnicode_Dirty(PyObject *unicode)
{
assert(PyUnicode_Check(unicode));
assert(_PyUnicode_CHECK(unicode));
if (Py_REFCNT(unicode) != 1) {
PyErr_SetString(PyExc_ValueError,
"Cannot modify a string having more than 1 reference");
@@ -966,10 +1029,8 @@ _PyUnicode_Ready(PyObject *obj)
strings were created using _PyObject_New() and where no canonical
representation (the str field) has been set yet aka strings
which are not yet ready. */
assert(PyUnicode_Check(obj));
assert(!PyUnicode_IS_READY(obj));
assert(!PyUnicode_IS_COMPACT(obj));
assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
assert(_PyUnicode_CHECK(unicode));
assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
assert(_PyUnicode_WSTR(unicode) != NULL);
assert(_PyUnicode_DATA_ANY(unicode) == NULL);
assert(_PyUnicode_UTF8(unicode) == NULL);
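
_PyUnicode_Ready() is normally reached through the PyUnicode_READY() macro, which is a no-op for strings that are already ready. The usual caller pattern, also visible in the PyUnicode_FromFormatV() hunks below, looks roughly like this hedged sketch (get_kind() is a made-up name):

static int
get_kind(PyObject *obj)
{
    /* Convert a legacy (wstr-only) string to its canonical representation
       before touching kind/data; an error leaves an exception set. */
    if (PyUnicode_READY(obj) == -1)
        return -1;
    return PyUnicode_KIND(obj);
}
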
@@ -1154,7 +1215,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
assert(PyUnicode_Check(unicode));
assert(0 <= length);
if (!PyUnicode_IS_COMPACT(unicode) && !PyUnicode_IS_READY(unicode))
if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
old_length = PyUnicode_WSTR_LENGTH(unicode);
else
old_length = PyUnicode_GET_LENGTH(unicode);
@@ -1907,7 +1968,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
case 'U':
{
PyObject *obj = va_arg(count, PyObject *);
assert(obj && PyUnicode_Check(obj));
assert(obj && _PyUnicode_CHECK(obj));
if (PyUnicode_READY(obj) == -1)
goto fail;
argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
@@ -1921,7 +1982,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
const char *str = va_arg(count, const char *);
PyObject *str_obj;
assert(obj || str);
assert(!obj || PyUnicode_Check(obj));
assert(!obj || _PyUnicode_CHECK(obj));
if (obj) {
if (PyUnicode_READY(obj) == -1)
goto fail;
@@ -9570,7 +9631,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
void *data;
Py_UCS4 chr;
assert(PyUnicode_Check(uni));
assert(_PyUnicode_CHECK(uni));
if (PyUnicode_READY(uni) == -1)
return -1;
kind = PyUnicode_KIND(uni);
@@ -12698,7 +12759,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
if (unicode == NULL)
return NULL;
assert(PyUnicode_Check(unicode));
assert(_PyUnicode_CHECK(unicode));
if (PyUnicode_READY(unicode))
return NULL;
@@ -13054,7 +13115,7 @@ unicodeiter_next(unicodeiterobject *it)
seq = it->it_seq;
if (seq == NULL)
return NULL;
assert(PyUnicode_Check(seq));
assert(_PyUnicode_CHECK(seq));
if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
int kind = PyUnicode_KIND(seq);