mirror of https://github.com/python/cpython
415 lines
14 KiB
C
415 lines
14 KiB
C
#ifndef Py_INTERNAL_UNICODEOBJECT_H
#define Py_INTERNAL_UNICODEOBJECT_H
#ifdef __cplusplus
extern "C" {
#endif

/* Internal header: compilation is refused unless Py_BUILD_CORE is defined. */
#ifndef Py_BUILD_CORE
#  error "this header requires Py_BUILD_CORE define"
#endif

#include "pycore_fileutils.h"     // _Py_error_handler
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
|
|
|
|
/* NOTE(review): not documented in this header.  Judging by its name it
   validates the internal invariants of the Unicode object 'op', looking at
   the character data too when 'check_content' is non-zero -- confirm the
   exact checks and the return convention against the definition. */
PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
    PyObject *op,
    int check_content);
|
|
|
|
extern void _PyUnicode_ExactDealloc(PyObject *op);
|
|
extern Py_ssize_t _PyUnicode_InternedSize(void);
|
|
|
|
/* Get a copy of a Unicode string. */
|
|
PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
|
|
PyObject *unicode
|
|
);
|
|
|
|
/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
|
|
if parameters are invalid (e.g. if length is longer than the string). */
|
|
extern void _PyUnicode_FastFill(
|
|
PyObject *unicode,
|
|
Py_ssize_t start,
|
|
Py_ssize_t length,
|
|
Py_UCS4 fill_char
|
|
);
|
|
|
|
/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
|
|
may crash if parameters are invalid (e.g. if the output string
|
|
is too short). */
|
|
extern void _PyUnicode_FastCopyCharacters(
|
|
PyObject *to,
|
|
Py_ssize_t to_start,
|
|
PyObject *from,
|
|
Py_ssize_t from_start,
|
|
Py_ssize_t how_many
|
|
);
|
|
|
|
/* Create a new string from a buffer of ASCII characters.
|
|
WARNING: Don't check if the string contains any non-ASCII character. */
|
|
extern PyObject* _PyUnicode_FromASCII(
|
|
const char *buffer,
|
|
Py_ssize_t size);
|
|
|
|
/* Compute the maximum character of the substring unicode[start:end].
|
|
Return 127 for an empty string. */
|
|
extern Py_UCS4 _PyUnicode_FindMaxChar (
|
|
PyObject *unicode,
|
|
Py_ssize_t start,
|
|
Py_ssize_t end);
|
|
|
|
/* --- _PyUnicodeWriter API ----------------------------------------------- */
|
|
|
|
typedef struct {
|
|
PyObject *buffer;
|
|
void *data;
|
|
int kind;
|
|
Py_UCS4 maxchar;
|
|
Py_ssize_t size;
|
|
Py_ssize_t pos;
|
|
|
|
/* minimum number of allocated characters (default: 0) */
|
|
Py_ssize_t min_length;
|
|
|
|
/* minimum character (default: 127, ASCII) */
|
|
Py_UCS4 min_char;
|
|
|
|
/* If non-zero, overallocate the buffer (default: 0). */
|
|
unsigned char overallocate;
|
|
|
|
/* If readonly is 1, buffer is a shared string (cannot be modified)
|
|
and size is set to 0. */
|
|
unsigned char readonly;
|
|
} _PyUnicodeWriter ;
|
|
|
|
/* Initialize a Unicode writer.
|
|
*
|
|
* By default, the minimum buffer size is 0 character and overallocation is
|
|
* disabled. Set min_length, min_char and overallocate attributes to control
|
|
* the allocation of the buffer. */
|
|
PyAPI_FUNC(void)
|
|
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
|
|
|
|
/* Prepare the buffer to write 'length' characters
   with the specified maximum character.

   Fast path: the macro evaluates to 0 without calling anything when MAXCHAR
   already fits in the writer's maxchar and LENGTH fits in the remaining
   room (size - pos), or when LENGTH is 0.  In every other case it defers to
   _PyUnicodeWriter_PrepareInternal().

   WRITER, LENGTH and MAXCHAR may each be evaluated more than once, so do
   not pass expressions with side effects.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
    (((MAXCHAR) <= (WRITER)->maxchar                                  \
      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
     ? 0                                                              \
     : (((LENGTH) == 0)                                               \
        ? 0                                                           \
        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
|
|
|
|
/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
|
|
instead. */
|
|
PyAPI_FUNC(int)
|
|
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
|
|
Py_ssize_t length, Py_UCS4 maxchar);
|
|
|
|
/* Prepare the buffer to have at least the kind KIND.
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   support characters in range U+0000-U+FFFF.

   Fast path: evaluates to 0 without calling anything when KIND is not
   greater than the writer's current kind; otherwise defers to
   _PyUnicodeWriter_PrepareKindInternal().

   WRITER and KIND may each be evaluated more than once, so do not pass
   expressions with side effects.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    ((KIND) <= (WRITER)->kind                                         \
     ? 0                                                              \
     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
|
|
|
|
/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
|
|
macro instead. */
|
|
PyAPI_FUNC(int)
|
|
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
|
|
int kind);
|
|
|
|
/* Append a Unicode character.
|
|
Return 0 on success, raise an exception and return -1 on error. */
|
|
PyAPI_FUNC(int)
|
|
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
|
|
Py_UCS4 ch
|
|
);
|
|
|
|
/* Append a Unicode string.
|
|
Return 0 on success, raise an exception and return -1 on error. */
|
|
PyAPI_FUNC(int)
|
|
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
|
|
PyObject *str /* Unicode string */
|
|
);
|
|
|
|
/* Append a substring of a Unicode string.
|
|
Return 0 on success, raise an exception and return -1 on error. */
|
|
PyAPI_FUNC(int)
|
|
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
|
|
PyObject *str, /* Unicode string */
|
|
Py_ssize_t start,
|
|
Py_ssize_t end
|
|
);
|
|
|
|
/* Append an ASCII-encoded byte string.
|
|
Return 0 on success, raise an exception and return -1 on error. */
|
|
PyAPI_FUNC(int)
|
|
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
|
|
const char *str, /* ASCII-encoded byte string */
|
|
Py_ssize_t len /* number of bytes, or -1 if unknown */
|
|
);
|
|
|
|
/* Append a latin1-encoded byte string.
|
|
Return 0 on success, raise an exception and return -1 on error. */
|
|
PyAPI_FUNC(int)
|
|
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
|
|
const char *str, /* latin1-encoded byte string */
|
|
Py_ssize_t len /* length in bytes */
|
|
);
|
|
|
|
/* Get the value of the writer as a Unicode string. Clear the
|
|
buffer of the writer. Raise an exception and return NULL
|
|
on error. */
|
|
PyAPI_FUNC(PyObject *)
|
|
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
|
|
|
|
/* Deallocate memory of a writer (clear its internal buffer). */
|
|
PyAPI_FUNC(void)
|
|
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
|
|
|
|
|
|
/* Format the object based on the format_spec, as defined in PEP 3101
|
|
(Advanced String Formatting). */
|
|
PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
|
|
_PyUnicodeWriter *writer,
|
|
PyObject *obj,
|
|
PyObject *format_spec,
|
|
Py_ssize_t start,
|
|
Py_ssize_t end);
|
|
|
|
/* --- UTF-7 Codecs ------------------------------------------------------- */
|
|
|
|
/* Encode a Unicode object with the UTF-7 codec.  The two int flags select
   whether RFC2152 "Set O" characters and whitespace are emitted in base64.
   NOTE(review): return type/ownership and the error convention are not
   stated here -- confirm against the definition. */
extern PyObject* _PyUnicode_EncodeUTF7(
    PyObject *unicode,          /* Unicode object */
    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
    const char *errors);        /* error handling */
|
|
|
|
/* --- UTF-8 Codecs ------------------------------------------------------- */
|
|
|
|
/* NOTE(review): not documented in this header.  By its name and section it
   encodes 'unicode' to UTF-8, with 'errors' naming the error handling --
   confirm the result type and error convention against the definition. */
PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
    PyObject *unicode,
    const char *errors);
|
|
|
|
/* --- UTF-32 Codecs ------------------------------------------------------ */
|
|
|
|
/* Encode a Unicode object with the UTF-32 codec.
   'byteorder' selects the output layout: 0 means a BOM followed by native
   byte order, -1 means little endian, 1 means big endian. */
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
    PyObject *object,           /* Unicode object */
    const char *errors,         /* error handling */
    int byteorder);             /* byteorder to use 0=BOM+native;-1=LE,1=BE */
|
|
|
|
/* --- UTF-16 Codecs ------------------------------------------------------ */
|
|
|
|
/* Returns a Python string object holding the UTF-16 encoded value of
|
|
the Unicode data.
|
|
|
|
If byteorder is not 0, output is written according to the following
|
|
byte order:
|
|
|
|
byteorder == -1: little endian
|
|
byteorder == 0: native byte order (writes a BOM mark)
|
|
byteorder == 1: big endian
|
|
|
|
If byteorder is 0, the output string will always start with the
|
|
Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
|
|
prepended.
|
|
*/
|
|
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
|
|
PyObject* unicode, /* Unicode object */
|
|
const char *errors, /* error handling */
|
|
int byteorder); /* byteorder to use 0=BOM+native;-1=LE,1=BE */
|
|
|
|
/* --- Unicode-Escape Codecs ---------------------------------------------- */
|
|
|
|
/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
|
|
extern PyObject* _PyUnicode_DecodeUnicodeEscapeStateful(
|
|
const char *string, /* Unicode-Escape encoded string */
|
|
Py_ssize_t length, /* size of string */
|
|
const char *errors, /* error handling */
|
|
Py_ssize_t *consumed); /* bytes consumed */
|
|
|
|
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
|
|
chars. */
|
|
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
|
|
const char *string, /* Unicode-Escape encoded string */
|
|
Py_ssize_t length, /* size of string */
|
|
const char *errors, /* error handling */
|
|
Py_ssize_t *consumed, /* bytes consumed */
|
|
const char **first_invalid_escape); /* on return, points to first
|
|
invalid escaped char in
|
|
string. */
|
|
|
|
/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
|
|
|
|
/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */
|
|
extern PyObject* _PyUnicode_DecodeRawUnicodeEscapeStateful(
|
|
const char *string, /* Unicode-Escape encoded string */
|
|
Py_ssize_t length, /* size of string */
|
|
const char *errors, /* error handling */
|
|
Py_ssize_t *consumed); /* bytes consumed */
|
|
|
|
/* --- Latin-1 Codecs ----------------------------------------------------- */
|
|
|
|
/* NOTE(review): not documented in this header.  By its name and section it
   encodes 'unicode' to Latin-1, with 'errors' naming the error handling --
   confirm the result type and error convention against the definition. */
extern PyObject* _PyUnicode_AsLatin1String(
    PyObject* unicode,
    const char* errors);
|
|
|
|
/* --- ASCII Codecs ------------------------------------------------------- */
|
|
|
|
/* NOTE(review): not documented in this header.  By its name and section it
   encodes 'unicode' to ASCII, with 'errors' naming the error handling --
   confirm the result type and error convention against the definition. */
extern PyObject* _PyUnicode_AsASCIIString(
    PyObject* unicode,
    const char* errors);
|
|
|
|
/* --- Character Map Codecs ----------------------------------------------- */
|
|
|
|
/* Translate an Unicode object by applying a character mapping table to
|
|
it and return the resulting Unicode object.
|
|
|
|
The mapping table must map Unicode ordinal integers to Unicode strings,
|
|
Unicode ordinal integers or None (causing deletion of the character).
|
|
|
|
Mapping tables may be dictionaries or sequences. Unmapped character
|
|
ordinals (ones which cause a LookupError) are left untouched and
|
|
are copied as-is.
|
|
*/
|
|
extern PyObject* _PyUnicode_EncodeCharmap(
|
|
PyObject *unicode, /* Unicode object */
|
|
PyObject *mapping, /* encoding mapping */
|
|
const char *errors); /* error handling */
|
|
|
|
/* --- Decimal Encoder ---------------------------------------------------- */
|
|
|
|
/* Coverts a Unicode object holding a decimal value to an ASCII string
|
|
for using in int, float and complex parsers.
|
|
Transforms code points that have decimal digit property to the
|
|
corresponding ASCII digit code points. Transforms spaces to ASCII.
|
|
Transforms code points starting from the first non-ASCII code point that
|
|
is neither a decimal digit nor a space to the end into '?'. */
|
|
|
|
PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
|
|
PyObject *unicode); /* Unicode object */
|
|
|
|
/* --- Methods & Slots ---------------------------------------------------- */
|
|
|
|
extern PyObject* _PyUnicode_JoinArray(
|
|
PyObject *separator,
|
|
PyObject *const *items,
|
|
Py_ssize_t seqlen
|
|
);
|
|
|
|
/* Test whether a unicode is equal to ASCII identifier. Return 1 if true,
|
|
0 otherwise. The right argument must be ASCII identifier.
|
|
Any error occurs inside will be cleared before return. */
|
|
extern int _PyUnicode_EqualToASCIIId(
|
|
PyObject *left, /* Left string */
|
|
_Py_Identifier *right /* Right identifier */
|
|
);
|
|
|
|
/* Test whether a unicode is equal to ASCII string. Return 1 if true,
|
|
0 otherwise. The right argument must be ASCII-encoded string.
|
|
Any error occurs inside will be cleared before return. */
|
|
PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
|
|
PyObject *left,
|
|
const char *right /* ASCII-encoded string */
|
|
);
|
|
|
|
/* Externally visible for str.strip(unicode) */
|
|
extern PyObject* _PyUnicode_XStrip(
|
|
PyObject *self,
|
|
int striptype,
|
|
PyObject *sepobj
|
|
);
|
|
|
|
|
|
/* Using explicit passed-in values, insert the thousands grouping
|
|
into the string pointed to by buffer. For the argument descriptions,
|
|
see Objects/stringlib/localeutil.h */
|
|
extern Py_ssize_t _PyUnicode_InsertThousandsGrouping(
|
|
_PyUnicodeWriter *writer,
|
|
Py_ssize_t n_buffer,
|
|
PyObject *digits,
|
|
Py_ssize_t d_pos,
|
|
Py_ssize_t n_digits,
|
|
Py_ssize_t min_width,
|
|
const char *grouping,
|
|
PyObject *thousands_sep,
|
|
Py_UCS4 *maxchar);
|
|
|
|
/* --- Misc functions ----------------------------------------------------- */
|
|
|
|
/* NOTE(review): the object argument and the three int arguments are neither
   named nor documented in this header -- see the definition for their
   meaning and for the error convention. */
extern PyObject *_PyUnicode_FormatLong(
    PyObject *,
    int,
    int,
    int);
|
|
|
|
/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
|
|
PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
|
|
|
|
/* Fast equality check when the inputs are known to be exact unicode types
|
|
and where the hash values are equal (i.e. a very probable match) */
|
|
extern int _PyUnicode_EQ(PyObject *, PyObject *);
|
|
|
|
/* Equality check. */
|
|
PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *);
|
|
|
|
/* NOTE(review): not documented in this header.  The (PyObject *, void *)
   signature matches the shape of argument-parsing converter callbacks, and
   the "_Opt_" variant presumably also accepts a missing value -- confirm
   against the definitions. */
extern int _PyUnicode_WideCharString_Converter(PyObject *, void *);
extern int _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *);
|
|
|
|
/* NOTE(review): not documented in this header; the meaning of the returned
   Py_ssize_t is defined by the implementation -- confirm there. */
PyAPI_FUNC(Py_ssize_t)
_PyUnicode_ScanIdentifier(PyObject *);
|
|
|
|
/* --- Runtime lifecycle -------------------------------------------------- */
|
|
|
|
extern void _PyUnicode_InitState(PyInterpreterState *);
|
|
extern PyStatus _PyUnicode_InitGlobalObjects(PyInterpreterState *);
|
|
extern PyStatus _PyUnicode_InitTypes(PyInterpreterState *);
|
|
extern void _PyUnicode_Fini(PyInterpreterState *);
|
|
extern void _PyUnicode_FiniTypes(PyInterpreterState *);
|
|
|
|
/* Type object declared here and defined elsewhere.
   NOTE(review): the name suggests the iterator type used for ASCII-only
   strings -- confirm against the definition. */
extern PyTypeObject _PyUnicodeASCIIIter_Type;
|
|
|
|
/* --- Other API ---------------------------------------------------------- */
|
|
|
|
struct _Py_unicode_runtime_ids {
|
|
PyThread_type_lock lock;
|
|
// next_index value must be preserved when Py_Initialize()/Py_Finalize()
|
|
// is called multiple times: see _PyUnicode_FromId() implementation.
|
|
Py_ssize_t next_index;
|
|
};
|
|
|
|
struct _Py_unicode_runtime_state {
|
|
struct _Py_unicode_runtime_ids ids;
|
|
};
|
|
|
|
/* fs_codec.encoding is initialized to NULL.
|
|
Later, it is set to a non-NULL string by _PyUnicode_InitEncodings(). */
|
|
struct _Py_unicode_fs_codec {
|
|
char *encoding; // Filesystem encoding (encoded to UTF-8)
|
|
int utf8; // encoding=="utf-8"?
|
|
char *errors; // Filesystem errors (encoded to UTF-8)
|
|
_Py_error_handler error_handler;
|
|
};
|
|
|
|
struct _Py_unicode_ids {
|
|
Py_ssize_t size;
|
|
PyObject **array;
|
|
};
|
|
|
|
struct _Py_unicode_state {
|
|
struct _Py_unicode_fs_codec fs_codec;
|
|
|
|
_PyUnicode_Name_CAPI *ucnhash_capi;
|
|
|
|
// Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId()
|
|
struct _Py_unicode_ids ids;
|
|
};
|
|
|
|
/* Interning helpers that operate on an explicit interpreter state.
   NOTE(review): not documented in this header.  By their names, the first
   interns the string referenced by '*p' (possibly replacing '*p') and the
   second clears the interpreter's interned strings -- confirm against the
   definitions. */
extern void _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p);
extern void _PyUnicode_ClearInterned(PyInterpreterState *interp);
|
|
|
|
|
|
/* Close the extern "C" block and the include guard opened at the top of
   this header. */
#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_UNICODEOBJECT_H */
|