Repository URL to install this package:
#ifndef Py_CPYTHON_UNICODEOBJECT_H
# error "this header file must not be included directly"
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* Py_UNICODE was the native Unicode storage format (code unit) used by
Python and represents a single Unicode element in the Unicode type.
With PEP 393, Py_UNICODE is deprecated and replaced with a
typedef to wchar_t. */
#define PY_UNICODE_TYPE wchar_t
/* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE;
/* --- Internal Unicode Operations ---------------------------------------- */
/* Since splitting on whitespace is an important use case, and
whitespace in most situations is solely ASCII whitespace, we
optimize for the common case by using a quick look-up table
_Py_ascii_whitespace (see below) with an inlined check.
*/
#define Py_UNICODE_ISSPACE(ch) \
((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
#define Py_UNICODE_ISALNUM(ch) \
(Py_UNICODE_ISALPHA(ch) || \
Py_UNICODE_ISDECIMAL(ch) || \
Py_UNICODE_ISDIGIT(ch) || \
Py_UNICODE_ISNUMERIC(ch))
#define Py_UNICODE_COPY(target, source, length) \
memcpy((target), (source), (length)*sizeof(Py_UNICODE))
#define Py_UNICODE_FILL(target, value, length) \
do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\
} while (0)
/* macros to work with surrogates */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low) \
(((((Py_UCS4)(high) & 0x03FF) << 10) | \
((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800 */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
/* Check if substring matches at given offset. The offset must be
valid, and the substring must not be empty. */
#define Py_UNICODE_MATCH(string, offset, substring) \
((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
!memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
/* --- Unicode Type ------------------------------------------------------- */
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
structure. state.ascii and state.compact are set, and the data
immediately follow the structure. utf8_length and wstr_length can be found
in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
/* There are 4 forms of Unicode strings:
- compact ascii:
* structure = PyASCIIObject
* test: PyUnicode_IS_COMPACT_ASCII(op)
* kind = PyUnicode_1BYTE_KIND
* compact = 1
* ascii = 1
* ready = 1
* (length is the length of the utf8 and wstr strings)
* (data starts just after the structure)
* (since ASCII is decoded from UTF-8, the utf8 string are the data)
- compact:
* structure = PyCompactUnicodeObject
* test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 1
* ready = 1
* ascii = 0
* utf8 is not shared with data
* utf8_length = 0 if utf8 is NULL
* wstr is shared with data and wstr_length=length
if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
* wstr_length = 0 if wstr is NULL
* (data starts just after the structure)
- legacy string, not ready:
* structure = PyUnicodeObject
* test: kind == PyUnicode_WCHAR_KIND
* length = 0 (use wstr_length)
* hash = -1
* kind = PyUnicode_WCHAR_KIND
* compact = 0
* ascii = 0
* ready = 0
* interned = SSTATE_NOT_INTERNED
* wstr is not NULL
* data.any is NULL
* utf8 is NULL
* utf8_length = 0
- legacy string, ready:
* structure = PyUnicodeObject structure
* test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 0
* ready = 1
* data.any is not NULL
* utf8 is shared and utf8_length = length with data.any if ascii = 1
* utf8_length = 0 if utf8 is NULL
* wstr is shared with data.any and wstr_length = length
if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
* wstr_length = 0 if wstr is NULL
Compact strings use only one memory block (structure + characters),
whereas legacy strings use one block for the structure and one block
for characters.
Legacy strings are created by PyUnicode_FromUnicode() and
PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
when PyUnicode_READY() is called.
See also _PyUnicode_CheckConsistency().
*/
PyObject_HEAD
Py_ssize_t length; /* Number of code points in the string */
Py_hash_t hash; /* Hash value; -1 if not set */
struct {
/*
SSTATE_NOT_INTERNED (0)
SSTATE_INTERNED_MORTAL (1)
SSTATE_INTERNED_IMMORTAL (2)
If interned != SSTATE_NOT_INTERNED, the two references from the
dictionary to this object are *not* counted in ob_refcnt.
*/
unsigned int interned:2;
/* Character size:
- PyUnicode_WCHAR_KIND (0):
* character type = wchar_t (16 or 32 bits, depending on the
platform)
- PyUnicode_1BYTE_KIND (1):
* character type = Py_UCS1 (8 bits, unsigned)
* all characters are in the range U+0000-U+00FF (latin1)
* if ascii is set, all characters are in the range U+0000-U+007F
(ASCII), otherwise at least one character is in the range
U+0080-U+00FF
- PyUnicode_2BYTE_KIND (2):
* character type = Py_UCS2 (16 bits, unsigned)
* all characters are in the range U+0000-U+FFFF (BMP)
* at least one character is in the range U+0100-U+FFFF
- PyUnicode_4BYTE_KIND (4):
* character type = Py_UCS4 (32 bits, unsigned)
* all characters are in the range U+0000-U+10FFFF
* at least one character is in the range U+10000-U+10FFFF
*/
unsigned int kind:3;
/* Compact is with respect to the allocation scheme. Compact unicode
objects only require one memory block while non-compact objects use
one block for the PyUnicodeObject struct and another for its data
buffer. */
unsigned int compact:1;
/* The string only contains characters in the range U+0000-U+007F (ASCII)
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
set, use the PyASCIIObject structure. */
unsigned int ascii:1;
/* The ready flag indicates whether the object layout is initialized
completely. This means that this is either a compact object, or
the data pointer is filled out. The bit is redundant, and helps
to minimize the test in PyUnicode_IS_READY(). */
unsigned int ready:1;
/* Padding to ensure that PyUnicode_DATA() is always aligned to
4 bytes (see issue #19537 on m68k). */
unsigned int :24;
} state;
wchar_t *wstr; /* wchar_t representation (null-terminated) */
} PyASCIIObject;
/* Non-ASCII strings allocated through PyUnicode_New use the
PyCompactUnicodeObject structure. state.compact is set, and the data
immediately follow the structure. */
typedef struct {
PyASCIIObject _base;
Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
* terminating \0. */
char *utf8; /* UTF-8 representation (null-terminated) */
Py_ssize_t wstr_length; /* Number of code points in wstr, possible
* surrogates count as two code points. */
} PyCompactUnicodeObject;
/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
PyUnicodeObject structure. The actual string data is initially in the wstr
block, and copied into the data block using _PyUnicode_Ready. */
typedef struct {
PyCompactUnicodeObject _base;
union {
void *any;
Py_UCS1 *latin1;
Py_UCS2 *ucs2;
Py_UCS4 *ucs4;
} data; /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
PyObject *op,
int check_content);
/* Fast access macros */
#define PyUnicode_WSTR_LENGTH(op) \
(PyUnicode_IS_COMPACT_ASCII(op) ? \
((PyASCIIObject*)op)->length : \
((PyCompactUnicodeObject*)op)->wstr_length)
/* Returns the deprecated Py_UNICODE representation's size in code units
(this includes surrogate pairs as 2 units).
If the Py_UNICODE representation is not available, it will be computed
on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
/* Py_DEPRECATED(3.3) */
#define PyUnicode_GET_SIZE(op) \
(assert(PyUnicode_Check(op)), \
(((PyASCIIObject *)(op))->wstr) ? \
PyUnicode_WSTR_LENGTH(op) : \
((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\
assert(((PyASCIIObject *)(op))->wstr), \
PyUnicode_WSTR_LENGTH(op)))
/* Py_DEPRECATED(3.3) */
#define PyUnicode_GET_DATA_SIZE(op) \
(PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
representation on demand. Using this macro is very inefficient now,
try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
use PyUnicode_WRITE() and PyUnicode_READ(). */
/* Py_DEPRECATED(3.3) */
#define PyUnicode_AS_UNICODE(op) \
(assert(PyUnicode_Check(op)), \
(((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
PyUnicode_AsUnicode(_PyObject_CAST(op)))
/* Py_DEPRECATED(3.3) */
#define PyUnicode_AS_DATA(op) \
((const char *)(PyUnicode_AS_UNICODE(op)))
/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
/* Values for PyASCIIObject.state: */
/* Interning state. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
/* Return true if the string contains only ASCII characters, or 0 if not. The
string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
ready. */
#define PyUnicode_IS_ASCII(op) \
(assert(PyUnicode_Check(op)), \
assert(PyUnicode_IS_READY(op)), \
((PyASCIIObject*)op)->state.ascii)
/* Return true if the string is compact or 0 if not.
No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
(((PyASCIIObject*)(op))->state.compact)
/* Return true if the string is a compact ASCII string (use PyASCIIObject
structure), or 0 if not. No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT_ASCII(op) \
(((PyASCIIObject*)op)->state.ascii && PyUnicode_IS_COMPACT(op))
enum PyUnicode_Kind {
/* String contains only wstr byte characters. This is only possible
when the string was created with a legacy API and _PyUnicode_Ready()
has not been called yet. */
PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() macro: */
PyUnicode_1BYTE_KIND = 1,
PyUnicode_2BYTE_KIND = 2,
PyUnicode_4BYTE_KIND = 4
};
/* Return pointers to the canonical representation cast to unsigned char,
Py_UCS2, or Py_UCS4 for direct character access.
No checks are performed, use PyUnicode_KIND() before to ensure
these will work correctly. */
#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
/* Return one of the PyUnicode_*_KIND values defined above. */
#define PyUnicode_KIND(op) \
(assert(PyUnicode_Check(op)), \
assert(PyUnicode_IS_READY(op)), \
((PyASCIIObject *)(op))->state.kind)
Loading ...