2021-08-24 13:50:05 -03:00
|
|
|
#include "Python.h"
|
|
|
|
#include "../Parser/tokenizer.h"
|
|
|
|
|
|
|
|
/* Forward declaration: the module definition itself appears near the end of
 * this file, but _tokenize_get_state_by_type() needs its address earlier. */
static struct PyModuleDef _tokenizemodule;
|
|
|
|
|
|
|
|
typedef struct {
|
2021-08-25 09:41:14 -03:00
|
|
|
PyTypeObject *TokenizerIter;
|
2021-08-24 13:50:05 -03:00
|
|
|
} tokenize_state;
|
|
|
|
|
2021-08-25 09:41:14 -03:00
|
|
|
static tokenize_state *
|
|
|
|
get_tokenize_state(PyObject *module) {
|
|
|
|
return (tokenize_state *)PyModule_GetState(module);
|
2021-08-24 13:50:05 -03:00
|
|
|
}
|
|
|
|
|
2021-08-25 09:41:14 -03:00
|
|
|
/* Look up the module state starting from a type object: find the module that
 * `type` was created from (matched against _tokenizemodule) and return that
 * module's tokenize_state. */
#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(PyType_GetModuleByDef((type), &_tokenizemodule))
|
2021-08-24 13:50:05 -03:00
|
|
|
|
2022-08-11 18:25:49 -03:00
|
|
|
#include "pycore_runtime.h"
|
2021-08-24 13:50:05 -03:00
|
|
|
#include "clinic/Python-tokenize.c.h"
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
module _tokenizer
|
|
|
|
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
|
|
|
|
[clinic start generated code]*/
|
|
|
|
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
|
|
|
|
|
2021-08-25 09:41:14 -03:00
|
|
|
typedef struct
|
|
|
|
{
|
|
|
|
PyObject_HEAD struct tok_state *tok;
|
2021-08-24 13:50:05 -03:00
|
|
|
} tokenizeriterobject;
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
@classmethod
|
|
|
|
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
|
|
|
|
|
|
|
|
source: str
|
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static PyObject *
|
|
|
|
tokenizeriter_new_impl(PyTypeObject *type, const char *source)
|
|
|
|
/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
|
|
|
|
{
|
2021-08-25 09:41:14 -03:00
|
|
|
tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
|
2021-08-24 13:50:05 -03:00
|
|
|
if (self == NULL) {
|
|
|
|
return NULL;
|
|
|
|
}
|
2021-08-25 09:41:14 -03:00
|
|
|
PyObject *filename = PyUnicode_FromString("<string>");
|
2021-08-24 13:50:05 -03:00
|
|
|
if (filename == NULL) {
|
|
|
|
return NULL;
|
|
|
|
}
|
2021-10-13 12:22:14 -03:00
|
|
|
self->tok = _PyTokenizer_FromUTF8(source, 1);
|
2021-08-24 13:50:05 -03:00
|
|
|
if (self->tok == NULL) {
|
2021-08-25 09:41:14 -03:00
|
|
|
Py_DECREF(filename);
|
2021-08-24 13:50:05 -03:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
self->tok->filename = filename;
|
2021-08-25 09:41:14 -03:00
|
|
|
return (PyObject *)self;
|
2021-08-24 13:50:05 -03:00
|
|
|
}
|
|
|
|
|
2021-08-25 09:41:14 -03:00
|
|
|
static PyObject *
|
|
|
|
tokenizeriter_next(tokenizeriterobject *it)
|
2021-08-24 13:50:05 -03:00
|
|
|
{
|
2021-08-25 09:41:14 -03:00
|
|
|
const char *start;
|
|
|
|
const char *end;
|
2021-10-13 12:22:14 -03:00
|
|
|
int type = _PyTokenizer_Get(it->tok, &start, &end);
|
2021-08-24 13:50:05 -03:00
|
|
|
if (type == ERRORTOKEN && PyErr_Occurred()) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
if (type == ERRORTOKEN || type == ENDMARKER) {
|
|
|
|
PyErr_SetString(PyExc_StopIteration, "EOF");
|
|
|
|
return NULL;
|
|
|
|
}
|
2021-08-25 09:41:14 -03:00
|
|
|
PyObject *str = NULL;
|
2021-08-24 13:50:05 -03:00
|
|
|
if (start == NULL || end == NULL) {
|
|
|
|
str = PyUnicode_FromString("");
|
2021-08-25 09:41:14 -03:00
|
|
|
}
|
|
|
|
else {
|
2021-08-24 13:50:05 -03:00
|
|
|
str = PyUnicode_FromStringAndSize(start, end - start);
|
|
|
|
}
|
|
|
|
if (str == NULL) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
Py_ssize_t size = it->tok->inp - it->tok->buf;
|
2021-08-25 09:41:14 -03:00
|
|
|
PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
|
2021-08-24 13:50:05 -03:00
|
|
|
if (line == NULL) {
|
|
|
|
Py_DECREF(str);
|
|
|
|
return NULL;
|
|
|
|
}
|
2021-08-25 09:41:14 -03:00
|
|
|
const char *line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
|
2021-08-24 13:50:05 -03:00
|
|
|
int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
|
|
|
|
int end_lineno = it->tok->lineno;
|
|
|
|
int col_offset = -1;
|
|
|
|
int end_col_offset = -1;
|
|
|
|
if (start != NULL && start >= line_start) {
|
|
|
|
col_offset = (int)(start - line_start);
|
|
|
|
}
|
|
|
|
if (end != NULL && end >= it->tok->line_start) {
|
|
|
|
end_col_offset = (int)(end - it->tok->line_start);
|
|
|
|
}
|
|
|
|
|
|
|
|
return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2021-08-25 09:41:14 -03:00
|
|
|
tokenizeriter_dealloc(tokenizeriterobject *it)
|
2021-08-24 13:50:05 -03:00
|
|
|
{
|
2021-08-25 09:41:14 -03:00
|
|
|
PyTypeObject *tp = Py_TYPE(it);
|
2021-10-13 12:22:14 -03:00
|
|
|
_PyTokenizer_Free(it->tok);
|
2021-08-24 13:50:05 -03:00
|
|
|
tp->tp_free(it);
|
|
|
|
Py_DECREF(tp);
|
|
|
|
}
|
|
|
|
|
|
|
|
static PyType_Slot tokenizeriter_slots[] = {
|
2021-08-25 09:41:14 -03:00
|
|
|
{Py_tp_new, tokenizeriter_new},
|
|
|
|
{Py_tp_dealloc, tokenizeriter_dealloc},
|
|
|
|
{Py_tp_getattro, PyObject_GenericGetAttr},
|
|
|
|
{Py_tp_iter, PyObject_SelfIter},
|
|
|
|
{Py_tp_iternext, tokenizeriter_next},
|
|
|
|
{0, NULL},
|
2021-08-24 13:50:05 -03:00
|
|
|
};
|
|
|
|
|
|
|
|
static PyType_Spec tokenizeriter_spec = {
|
2021-08-25 09:41:14 -03:00
|
|
|
.name = "_tokenize.TokenizerIter",
|
|
|
|
.basicsize = sizeof(tokenizeriterobject),
|
|
|
|
.flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
|
|
|
|
.slots = tokenizeriter_slots,
|
2021-08-24 13:50:05 -03:00
|
|
|
};
|
|
|
|
|
|
|
|
static int
|
2021-08-25 09:41:14 -03:00
|
|
|
tokenizemodule_exec(PyObject *m)
|
2021-08-24 13:50:05 -03:00
|
|
|
{
|
2021-08-25 09:41:14 -03:00
|
|
|
tokenize_state *state = get_tokenize_state(m);
|
2021-08-24 13:50:05 -03:00
|
|
|
if (state == NULL) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2021-08-25 09:41:14 -03:00
|
|
|
state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
|
2021-08-24 13:50:05 -03:00
|
|
|
if (state->TokenizerIter == NULL) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (PyModule_AddType(m, state->TokenizerIter) < 0) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static PyMethodDef tokenize_methods[] = {
|
2021-08-25 09:41:14 -03:00
|
|
|
{NULL, NULL, 0, NULL} /* Sentinel */
|
2021-08-24 13:50:05 -03:00
|
|
|
};
|
|
|
|
|
|
|
|
static PyModuleDef_Slot tokenizemodule_slots[] = {
|
2021-10-03 10:58:14 -03:00
|
|
|
{Py_mod_exec, tokenizemodule_exec},
|
2021-08-24 13:50:05 -03:00
|
|
|
{0, NULL}
|
|
|
|
};
|
|
|
|
|
|
|
|
static int
|
|
|
|
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
|
|
|
|
{
|
|
|
|
tokenize_state *state = get_tokenize_state(m);
|
|
|
|
Py_VISIT(state->TokenizerIter);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
tokenizemodule_clear(PyObject *m)
|
|
|
|
{
|
|
|
|
tokenize_state *state = get_tokenize_state(m);
|
|
|
|
Py_CLEAR(state->TokenizerIter);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
tokenizemodule_free(void *m)
|
|
|
|
{
|
|
|
|
tokenizemodule_clear((PyObject *)m);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct PyModuleDef _tokenizemodule = {
|
2021-08-25 09:41:14 -03:00
|
|
|
PyModuleDef_HEAD_INIT,
|
|
|
|
.m_name = "_tokenize",
|
|
|
|
.m_size = sizeof(tokenize_state),
|
|
|
|
.m_slots = tokenizemodule_slots,
|
|
|
|
.m_methods = tokenize_methods,
|
|
|
|
.m_traverse = tokenizemodule_traverse,
|
|
|
|
.m_clear = tokenizemodule_clear,
|
|
|
|
.m_free = tokenizemodule_free,
|
2021-08-24 13:50:05 -03:00
|
|
|
};
|
|
|
|
|
|
|
|
PyMODINIT_FUNC
|
|
|
|
PyInit__tokenize(void)
|
|
|
|
{
|
|
|
|
return PyModuleDef_Init(&_tokenizemodule);
|
|
|
|
}
|