2021-08-24 13:50:05 -03:00
|
|
|
#include "Python.h"
|
2023-05-20 21:03:02 -03:00
|
|
|
#include "errcode.h"
|
2023-10-11 12:14:44 -03:00
|
|
|
#include "../Parser/lexer/state.h"
|
|
|
|
#include "../Parser/lexer/lexer.h"
|
|
|
|
#include "../Parser/tokenizer/tokenizer.h"
|
2023-05-20 21:03:02 -03:00
|
|
|
#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
|
2021-08-24 13:50:05 -03:00
|
|
|
|
|
|
|
/* Forward declaration: the module definition itself appears near the end of
 * this file, but its address is needed earlier by the
 * _tokenize_get_state_by_type() macro. */
static struct PyModuleDef _tokenizemodule;
|
|
|
|
|
|
|
|
typedef struct {
|
2021-08-25 09:41:14 -03:00
|
|
|
PyTypeObject *TokenizerIter;
|
2021-08-24 13:50:05 -03:00
|
|
|
} tokenize_state;
|
|
|
|
|
2021-08-25 09:41:14 -03:00
|
|
|
static tokenize_state *
|
|
|
|
get_tokenize_state(PyObject *module) {
|
|
|
|
return (tokenize_state *)PyModule_GetState(module);
|
2021-08-24 13:50:05 -03:00
|
|
|
}
|
|
|
|
|
2021-08-25 09:41:14 -03:00
|
|
|
/* Look up the module state starting from a type object: the owning module is
 * found with PyType_GetModuleByDef() using this file's module definition. */
#define _tokenize_get_state_by_type(type) \
    (get_tokenize_state(PyType_GetModuleByDef((type), &_tokenizemodule)))
|
2021-08-24 13:50:05 -03:00
|
|
|
|
2022-08-11 18:25:49 -03:00
|
|
|
#include "pycore_runtime.h"
|
2021-08-24 13:50:05 -03:00
|
|
|
#include "clinic/Python-tokenize.c.h"
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
module _tokenizer
|
|
|
|
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
|
|
|
|
[clinic start generated code]*/
|
|
|
|
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
|
|
|
|
|
2021-08-25 09:41:14 -03:00
|
|
|
typedef struct
|
|
|
|
{
|
|
|
|
PyObject_HEAD struct tok_state *tok;
|
2023-05-26 18:02:26 -03:00
|
|
|
int done;
|
2021-08-24 13:50:05 -03:00
|
|
|
} tokenizeriterobject;
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
@classmethod
|
|
|
|
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
|
|
|
|
|
2023-05-30 18:43:34 -03:00
|
|
|
readline: object
|
|
|
|
/
|
2023-05-20 21:03:02 -03:00
|
|
|
*
|
|
|
|
extra_tokens: bool
|
2023-05-30 18:43:34 -03:00
|
|
|
encoding: str(c_default="NULL") = 'utf-8'
|
2021-08-24 13:50:05 -03:00
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static PyObject *
|
2023-05-30 18:43:34 -03:00
|
|
|
tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
|
|
|
|
int extra_tokens, const char *encoding)
|
|
|
|
/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
|
2021-08-24 13:50:05 -03:00
|
|
|
{
|
2021-08-25 09:41:14 -03:00
|
|
|
tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
|
2021-08-24 13:50:05 -03:00
|
|
|
if (self == NULL) {
|
|
|
|
return NULL;
|
|
|
|
}
|
2021-08-25 09:41:14 -03:00
|
|
|
PyObject *filename = PyUnicode_FromString("<string>");
|
2021-08-24 13:50:05 -03:00
|
|
|
if (filename == NULL) {
|
|
|
|
return NULL;
|
|
|
|
}
|
2023-05-30 18:43:34 -03:00
|
|
|
self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
|
2021-08-24 13:50:05 -03:00
|
|
|
if (self->tok == NULL) {
|
2021-08-25 09:41:14 -03:00
|
|
|
Py_DECREF(filename);
|
2021-08-24 13:50:05 -03:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
self->tok->filename = filename;
|
2023-05-20 21:03:02 -03:00
|
|
|
if (extra_tokens) {
|
|
|
|
self->tok->tok_extra_tokens = 1;
|
|
|
|
}
|
2023-05-26 18:02:26 -03:00
|
|
|
self->done = 0;
|
2021-08-25 09:41:14 -03:00
|
|
|
return (PyObject *)self;
|
2021-08-24 13:50:05 -03:00
|
|
|
}
|
|
|
|
|
2023-05-20 21:03:02 -03:00
|
|
|
static int
|
|
|
|
_tokenizer_error(struct tok_state *tok)
|
|
|
|
{
|
|
|
|
if (PyErr_Occurred()) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
const char *msg = NULL;
|
|
|
|
PyObject* errtype = PyExc_SyntaxError;
|
|
|
|
switch (tok->done) {
|
|
|
|
case E_TOKEN:
|
|
|
|
msg = "invalid token";
|
|
|
|
break;
|
|
|
|
case E_EOF:
|
2023-06-07 08:04:40 -03:00
|
|
|
PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
|
2023-06-07 17:20:43 -03:00
|
|
|
PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
|
|
|
|
tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
|
2023-05-20 21:03:02 -03:00
|
|
|
return -1;
|
|
|
|
case E_DEDENT:
|
2023-05-22 08:30:18 -03:00
|
|
|
msg = "unindent does not match any outer indentation level";
|
|
|
|
errtype = PyExc_IndentationError;
|
|
|
|
break;
|
2023-05-20 21:03:02 -03:00
|
|
|
case E_INTR:
|
|
|
|
if (!PyErr_Occurred()) {
|
|
|
|
PyErr_SetNone(PyExc_KeyboardInterrupt);
|
|
|
|
}
|
|
|
|
return -1;
|
|
|
|
case E_NOMEM:
|
|
|
|
PyErr_NoMemory();
|
|
|
|
return -1;
|
|
|
|
case E_TABSPACE:
|
|
|
|
errtype = PyExc_TabError;
|
|
|
|
msg = "inconsistent use of tabs and spaces in indentation";
|
|
|
|
break;
|
|
|
|
case E_TOODEEP:
|
|
|
|
errtype = PyExc_IndentationError;
|
|
|
|
msg = "too many levels of indentation";
|
|
|
|
break;
|
|
|
|
case E_LINECONT: {
|
|
|
|
msg = "unexpected character after line continuation character";
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
msg = "unknown tokenization error";
|
|
|
|
}
|
|
|
|
|
|
|
|
PyObject* errstr = NULL;
|
|
|
|
PyObject* error_line = NULL;
|
|
|
|
PyObject* tmp = NULL;
|
|
|
|
PyObject* value = NULL;
|
|
|
|
int result = 0;
|
|
|
|
|
|
|
|
Py_ssize_t size = tok->inp - tok->buf;
|
2023-05-24 06:59:18 -03:00
|
|
|
assert(tok->buf[size-1] == '\n');
|
|
|
|
size -= 1; // Remove the newline character from the end of the line
|
2023-05-20 21:03:02 -03:00
|
|
|
error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
|
|
|
|
if (!error_line) {
|
|
|
|
result = -1;
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
2023-05-22 08:30:18 -03:00
|
|
|
Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
|
|
|
|
if (offset == -1) {
|
|
|
|
result = -1;
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
|
2023-05-20 21:03:02 -03:00
|
|
|
if (!tmp) {
|
|
|
|
result = -1;
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
errstr = PyUnicode_FromString(msg);
|
|
|
|
if (!errstr) {
|
|
|
|
result = -1;
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
value = PyTuple_Pack(2, errstr, tmp);
|
|
|
|
if (!value) {
|
|
|
|
result = -1;
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
PyErr_SetObject(errtype, value);
|
|
|
|
|
|
|
|
exit:
|
|
|
|
Py_XDECREF(errstr);
|
|
|
|
Py_XDECREF(error_line);
|
|
|
|
Py_XDECREF(tmp);
|
|
|
|
Py_XDECREF(value);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2021-08-25 09:41:14 -03:00
|
|
|
static PyObject *
|
|
|
|
tokenizeriter_next(tokenizeriterobject *it)
|
2021-08-24 13:50:05 -03:00
|
|
|
{
|
2023-05-20 21:03:02 -03:00
|
|
|
PyObject* result = NULL;
|
2022-10-06 20:07:17 -03:00
|
|
|
struct token token;
|
2023-05-20 21:03:02 -03:00
|
|
|
_PyToken_Init(&token);
|
|
|
|
|
2022-10-06 20:07:17 -03:00
|
|
|
int type = _PyTokenizer_Get(it->tok, &token);
|
2023-05-20 21:03:02 -03:00
|
|
|
if (type == ERRORTOKEN) {
|
|
|
|
if(!PyErr_Occurred()) {
|
|
|
|
_tokenizer_error(it->tok);
|
|
|
|
assert(PyErr_Occurred());
|
|
|
|
}
|
|
|
|
goto exit;
|
2021-08-24 13:50:05 -03:00
|
|
|
}
|
2023-05-26 18:02:26 -03:00
|
|
|
if (it->done || type == ERRORTOKEN) {
|
2021-08-24 13:50:05 -03:00
|
|
|
PyErr_SetString(PyExc_StopIteration, "EOF");
|
2023-05-26 18:02:26 -03:00
|
|
|
it->done = 1;
|
2023-05-20 21:03:02 -03:00
|
|
|
goto exit;
|
2021-08-24 13:50:05 -03:00
|
|
|
}
|
2021-08-25 09:41:14 -03:00
|
|
|
PyObject *str = NULL;
|
2022-10-06 20:07:17 -03:00
|
|
|
if (token.start == NULL || token.end == NULL) {
|
2021-08-24 13:50:05 -03:00
|
|
|
str = PyUnicode_FromString("");
|
2021-08-25 09:41:14 -03:00
|
|
|
}
|
|
|
|
else {
|
2022-10-06 20:07:17 -03:00
|
|
|
str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
|
2021-08-24 13:50:05 -03:00
|
|
|
}
|
|
|
|
if (str == NULL) {
|
2023-05-20 21:03:02 -03:00
|
|
|
goto exit;
|
2021-08-24 13:50:05 -03:00
|
|
|
}
|
|
|
|
|
2023-05-26 18:02:26 -03:00
|
|
|
int is_trailing_token = 0;
|
|
|
|
if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
|
|
|
|
is_trailing_token = 1;
|
|
|
|
}
|
|
|
|
|
2023-05-26 11:46:22 -03:00
|
|
|
const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
|
2023-05-26 18:02:26 -03:00
|
|
|
PyObject* line = NULL;
|
|
|
|
if (it->tok->tok_extra_tokens && is_trailing_token) {
|
|
|
|
line = PyUnicode_FromString("");
|
|
|
|
} else {
|
|
|
|
Py_ssize_t size = it->tok->inp - line_start;
|
2023-06-09 13:01:26 -03:00
|
|
|
if (size >= 1 && it->tok->implicit_newline) {
|
|
|
|
size -= 1;
|
|
|
|
}
|
2023-05-26 18:02:26 -03:00
|
|
|
line = PyUnicode_DecodeUTF8(line_start, size, "replace");
|
|
|
|
}
|
2021-08-24 13:50:05 -03:00
|
|
|
if (line == NULL) {
|
|
|
|
Py_DECREF(str);
|
2023-05-20 21:03:02 -03:00
|
|
|
goto exit;
|
2021-08-24 13:50:05 -03:00
|
|
|
}
|
2023-05-26 11:46:22 -03:00
|
|
|
|
2023-05-20 21:03:02 -03:00
|
|
|
Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
|
|
|
|
Py_ssize_t end_lineno = it->tok->lineno;
|
|
|
|
Py_ssize_t col_offset = -1;
|
|
|
|
Py_ssize_t end_col_offset = -1;
|
2022-10-06 20:07:17 -03:00
|
|
|
if (token.start != NULL && token.start >= line_start) {
|
2023-05-20 21:03:02 -03:00
|
|
|
col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
|
2021-08-24 13:50:05 -03:00
|
|
|
}
|
2022-10-06 20:07:17 -03:00
|
|
|
if (token.end != NULL && token.end >= it->tok->line_start) {
|
2023-12-11 07:44:22 -04:00
|
|
|
end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
|
2021-08-24 13:50:05 -03:00
|
|
|
}
|
|
|
|
|
2023-05-21 21:29:04 -03:00
|
|
|
if (it->tok->tok_extra_tokens) {
|
2023-05-26 18:02:26 -03:00
|
|
|
if (is_trailing_token) {
|
|
|
|
lineno = end_lineno = lineno + 1;
|
|
|
|
col_offset = end_col_offset = 0;
|
|
|
|
}
|
2023-05-21 21:29:04 -03:00
|
|
|
// Necessary adjustments to match the original Python tokenize
|
|
|
|
// implementation
|
|
|
|
if (type > DEDENT && type < OP) {
|
|
|
|
type = OP;
|
|
|
|
}
|
|
|
|
else if (type == NEWLINE) {
|
2023-05-28 11:15:53 -03:00
|
|
|
Py_DECREF(str);
|
2023-06-06 08:52:16 -03:00
|
|
|
if (!it->tok->implicit_newline) {
|
|
|
|
if (it->tok->start[0] == '\r') {
|
|
|
|
str = PyUnicode_FromString("\r\n");
|
|
|
|
} else {
|
|
|
|
str = PyUnicode_FromString("\n");
|
|
|
|
}
|
2023-05-28 11:15:53 -03:00
|
|
|
}
|
2023-05-21 21:29:04 -03:00
|
|
|
end_col_offset++;
|
|
|
|
}
|
2023-06-07 09:31:48 -03:00
|
|
|
else if (type == NL) {
|
|
|
|
if (it->tok->implicit_newline) {
|
|
|
|
Py_DECREF(str);
|
|
|
|
str = PyUnicode_FromString("");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (str == NULL) {
|
|
|
|
Py_DECREF(line);
|
|
|
|
goto exit;
|
|
|
|
}
|
2023-05-21 21:29:04 -03:00
|
|
|
}
|
|
|
|
|
|
|
|
result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
|
2023-05-20 21:03:02 -03:00
|
|
|
exit:
|
|
|
|
_PyToken_Free(&token);
|
2023-05-26 18:02:26 -03:00
|
|
|
if (type == ENDMARKER) {
|
|
|
|
it->done = 1;
|
|
|
|
}
|
2023-05-20 21:03:02 -03:00
|
|
|
return result;
|
2021-08-24 13:50:05 -03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2021-08-25 09:41:14 -03:00
|
|
|
tokenizeriter_dealloc(tokenizeriterobject *it)
|
2021-08-24 13:50:05 -03:00
|
|
|
{
|
2021-08-25 09:41:14 -03:00
|
|
|
PyTypeObject *tp = Py_TYPE(it);
|
2021-10-13 12:22:14 -03:00
|
|
|
_PyTokenizer_Free(it->tok);
|
2021-08-24 13:50:05 -03:00
|
|
|
tp->tp_free(it);
|
|
|
|
Py_DECREF(tp);
|
|
|
|
}
|
|
|
|
|
|
|
|
static PyType_Slot tokenizeriter_slots[] = {
|
2021-08-25 09:41:14 -03:00
|
|
|
{Py_tp_new, tokenizeriter_new},
|
|
|
|
{Py_tp_dealloc, tokenizeriter_dealloc},
|
|
|
|
{Py_tp_getattro, PyObject_GenericGetAttr},
|
|
|
|
{Py_tp_iter, PyObject_SelfIter},
|
|
|
|
{Py_tp_iternext, tokenizeriter_next},
|
|
|
|
{0, NULL},
|
2021-08-24 13:50:05 -03:00
|
|
|
};
|
|
|
|
|
|
|
|
static PyType_Spec tokenizeriter_spec = {
|
2021-08-25 09:41:14 -03:00
|
|
|
.name = "_tokenize.TokenizerIter",
|
|
|
|
.basicsize = sizeof(tokenizeriterobject),
|
|
|
|
.flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
|
|
|
|
.slots = tokenizeriter_slots,
|
2021-08-24 13:50:05 -03:00
|
|
|
};
|
|
|
|
|
|
|
|
static int
|
2021-08-25 09:41:14 -03:00
|
|
|
tokenizemodule_exec(PyObject *m)
|
2021-08-24 13:50:05 -03:00
|
|
|
{
|
2021-08-25 09:41:14 -03:00
|
|
|
tokenize_state *state = get_tokenize_state(m);
|
2021-08-24 13:50:05 -03:00
|
|
|
if (state == NULL) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2021-08-25 09:41:14 -03:00
|
|
|
state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
|
2021-08-24 13:50:05 -03:00
|
|
|
if (state->TokenizerIter == NULL) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (PyModule_AddType(m, state->TokenizerIter) < 0) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static PyMethodDef tokenize_methods[] = {
|
2021-08-25 09:41:14 -03:00
|
|
|
{NULL, NULL, 0, NULL} /* Sentinel */
|
2021-08-24 13:50:05 -03:00
|
|
|
};
|
|
|
|
|
|
|
|
static PyModuleDef_Slot tokenizemodule_slots[] = {
|
2021-10-03 10:58:14 -03:00
|
|
|
{Py_mod_exec, tokenizemodule_exec},
|
2023-05-05 18:11:27 -03:00
|
|
|
{Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
|
2024-05-03 12:30:55 -03:00
|
|
|
{Py_mod_gil, Py_MOD_GIL_NOT_USED},
|
2021-08-24 13:50:05 -03:00
|
|
|
{0, NULL}
|
|
|
|
};
|
|
|
|
|
|
|
|
static int
|
|
|
|
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
|
|
|
|
{
|
|
|
|
tokenize_state *state = get_tokenize_state(m);
|
|
|
|
Py_VISIT(state->TokenizerIter);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
tokenizemodule_clear(PyObject *m)
|
|
|
|
{
|
|
|
|
tokenize_state *state = get_tokenize_state(m);
|
|
|
|
Py_CLEAR(state->TokenizerIter);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
tokenizemodule_free(void *m)
|
|
|
|
{
|
|
|
|
tokenizemodule_clear((PyObject *)m);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct PyModuleDef _tokenizemodule = {
|
2021-08-25 09:41:14 -03:00
|
|
|
PyModuleDef_HEAD_INIT,
|
|
|
|
.m_name = "_tokenize",
|
|
|
|
.m_size = sizeof(tokenize_state),
|
|
|
|
.m_slots = tokenizemodule_slots,
|
|
|
|
.m_methods = tokenize_methods,
|
|
|
|
.m_traverse = tokenizemodule_traverse,
|
|
|
|
.m_clear = tokenizemodule_clear,
|
|
|
|
.m_free = tokenizemodule_free,
|
2021-08-24 13:50:05 -03:00
|
|
|
};
|
|
|
|
|
|
|
|
PyMODINIT_FUNC
|
|
|
|
PyInit__tokenize(void)
|
|
|
|
{
|
|
|
|
return PyModuleDef_Init(&_tokenizemodule);
|
|
|
|
}
|