mirror of https://github.com/python/cpython
gh-97973: Return all necessary information from the tokenizer (GH-97984)
Right now, the tokenizer only returns type and two pointers to the start and end of the token. This PR modifies the tokenizer to return the type and set all of the necessary information, so that the parser does not have to this.
This commit is contained in:
parent
b9d2e81716
commit
cbf0afd8a1
|
@ -0,0 +1 @@
|
|||
Modify the tokenizer to return all necessary information the parser needs to set location information in the AST nodes, so that the parser does not have to calculate those doing pointer arithmetic.
|
|
@ -123,16 +123,18 @@ growable_comment_array_deallocate(growable_comment_array *arr) {
|
|||
}
|
||||
|
||||
static int
|
||||
_get_keyword_or_name_type(Parser *p, const char *name, int name_len)
|
||||
_get_keyword_or_name_type(Parser *p, struct token *new_token)
|
||||
{
|
||||
int name_len = new_token->end_col_offset - new_token->col_offset;
|
||||
assert(name_len > 0);
|
||||
|
||||
if (name_len >= p->n_keyword_lists ||
|
||||
p->keywords[name_len] == NULL ||
|
||||
p->keywords[name_len]->type == -1) {
|
||||
return NAME;
|
||||
}
|
||||
for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
|
||||
if (strncmp(k->str, name, name_len) == 0) {
|
||||
if (strncmp(k->str, new_token->start, name_len) == 0) {
|
||||
return k->type;
|
||||
}
|
||||
}
|
||||
|
@ -140,33 +142,26 @@ _get_keyword_or_name_type(Parser *p, const char *name, int name_len)
|
|||
}
|
||||
|
||||
static int
|
||||
initialize_token(Parser *p, Token *token, const char *start, const char *end, int token_type) {
|
||||
assert(token != NULL);
|
||||
initialize_token(Parser *p, Token *parser_token, struct token *new_token, int token_type) {
|
||||
assert(parser_token != NULL);
|
||||
|
||||
token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, start, (int)(end - start)) : token_type;
|
||||
token->bytes = PyBytes_FromStringAndSize(start, end - start);
|
||||
if (token->bytes == NULL) {
|
||||
parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type;
|
||||
parser_token->bytes = PyBytes_FromStringAndSize(new_token->start, new_token->end - new_token->start);
|
||||
if (parser_token->bytes == NULL) {
|
||||
return -1;
|
||||
}
|
||||
if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) {
|
||||
Py_DECREF(parser_token->bytes);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (_PyArena_AddPyObject(p->arena, token->bytes) < 0) {
|
||||
Py_DECREF(token->bytes);
|
||||
return -1;
|
||||
}
|
||||
|
||||
token->level = p->tok->level;
|
||||
|
||||
const char *line_start = token_type == STRING ? p->tok->multi_line_start : p->tok->line_start;
|
||||
int lineno = token_type == STRING ? p->tok->first_lineno : p->tok->lineno;
|
||||
int end_lineno = p->tok->lineno;
|
||||
|
||||
int col_offset = (start != NULL && start >= line_start) ? (int)(start - line_start) : -1;
|
||||
int end_col_offset = (end != NULL && end >= p->tok->line_start) ? (int)(end - p->tok->line_start) : -1;
|
||||
|
||||
token->lineno = lineno;
|
||||
token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + col_offset : col_offset;
|
||||
token->end_lineno = end_lineno;
|
||||
token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + end_col_offset : end_col_offset;
|
||||
parser_token->level = new_token->level;
|
||||
parser_token->lineno = new_token->lineno;
|
||||
parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset
|
||||
: new_token->col_offset;
|
||||
parser_token->end_lineno = new_token->end_lineno;
|
||||
parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->end_col_offset
|
||||
: new_token->end_col_offset;
|
||||
|
||||
p->fill += 1;
|
||||
|
||||
|
@ -202,26 +197,25 @@ _resize_tokens_array(Parser *p) {
|
|||
int
|
||||
_PyPegen_fill_token(Parser *p)
|
||||
{
|
||||
const char *start;
|
||||
const char *end;
|
||||
int type = _PyTokenizer_Get(p->tok, &start, &end);
|
||||
struct token new_token;
|
||||
int type = _PyTokenizer_Get(p->tok, &new_token);
|
||||
|
||||
// Record and skip '# type: ignore' comments
|
||||
while (type == TYPE_IGNORE) {
|
||||
Py_ssize_t len = end - start;
|
||||
Py_ssize_t len = new_token.end_col_offset - new_token.col_offset;
|
||||
char *tag = PyMem_Malloc(len + 1);
|
||||
if (tag == NULL) {
|
||||
PyErr_NoMemory();
|
||||
return -1;
|
||||
}
|
||||
strncpy(tag, start, len);
|
||||
strncpy(tag, new_token.start, len);
|
||||
tag[len] = '\0';
|
||||
// Ownership of tag passes to the growable array
|
||||
if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
|
||||
PyErr_NoMemory();
|
||||
return -1;
|
||||
}
|
||||
type = _PyTokenizer_Get(p->tok, &start, &end);
|
||||
type = _PyTokenizer_Get(p->tok, &new_token);
|
||||
}
|
||||
|
||||
// If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
|
||||
|
@ -244,7 +238,7 @@ _PyPegen_fill_token(Parser *p)
|
|||
}
|
||||
|
||||
Token *t = p->tokens[p->fill];
|
||||
return initialize_token(p, t, start, end, type);
|
||||
return initialize_token(p, t, &new_token, type);
|
||||
}
|
||||
|
||||
#if defined(Py_DEBUG)
|
||||
|
|
|
@ -164,11 +164,10 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
|
|||
Py_ssize_t current_err_line = current_token->lineno;
|
||||
|
||||
int ret = 0;
|
||||
struct token new_token;
|
||||
|
||||
for (;;) {
|
||||
const char *start;
|
||||
const char *end;
|
||||
switch (_PyTokenizer_Get(p->tok, &start, &end)) {
|
||||
switch (_PyTokenizer_Get(p->tok, &new_token)) {
|
||||
case ERRORTOKEN:
|
||||
if (p->tok->level != 0) {
|
||||
int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
|
||||
|
|
|
@ -36,6 +36,8 @@
|
|||
/* Don't ever change this -- it would break the portability of Python code */
|
||||
#define TABSIZE 8
|
||||
|
||||
#define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end)
|
||||
|
||||
/* Forward */
|
||||
static struct tok_state *tok_new(void);
|
||||
static int tok_nextc(struct tok_state *tok);
|
||||
|
@ -1174,8 +1176,6 @@ syntaxerror_known_range(struct tok_state *tok,
|
|||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static int
|
||||
indenterror(struct tok_state *tok)
|
||||
{
|
||||
|
@ -1391,12 +1391,32 @@ tok_continuation_line(struct tok_state *tok) {
|
|||
}
|
||||
|
||||
static int
|
||||
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
||||
token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
|
||||
{
|
||||
assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
|
||||
token->level = tok->level;
|
||||
token->lineno = type == STRING ? tok->first_lineno : tok->lineno;
|
||||
token->end_lineno = tok->lineno;
|
||||
token->col_offset = -1;
|
||||
token->end_col_offset = -1;
|
||||
token->start = start;
|
||||
token->end = end;
|
||||
if (start != NULL && end != NULL) {
|
||||
const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start;
|
||||
token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1;
|
||||
token->end_col_offset = (end >= tok->line_start) ? (int)(end - tok->line_start) : -1;
|
||||
}
|
||||
return type;
|
||||
}
|
||||
|
||||
static int
|
||||
tok_get(struct tok_state *tok, struct token *token)
|
||||
{
|
||||
int c;
|
||||
int blankline, nonascii;
|
||||
|
||||
*p_start = *p_end = NULL;
|
||||
const char *p_start = NULL;
|
||||
const char *p_end = NULL;
|
||||
nextline:
|
||||
tok->start = NULL;
|
||||
blankline = 0;
|
||||
|
@ -1426,7 +1446,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
// the level of indentation of whatever comes next.
|
||||
cont_line_col = cont_line_col ? cont_line_col : col;
|
||||
if ((c = tok_continuation_line(tok)) == -1) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
}
|
||||
else {
|
||||
|
@ -1461,7 +1481,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
if (col == tok->indstack[tok->indent]) {
|
||||
/* No change */
|
||||
if (altcol != tok->altindstack[tok->indent]) {
|
||||
return indenterror(tok);
|
||||
return MAKE_TOKEN(indenterror(tok));
|
||||
}
|
||||
}
|
||||
else if (col > tok->indstack[tok->indent]) {
|
||||
|
@ -1469,10 +1489,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
if (tok->indent+1 >= MAXINDENT) {
|
||||
tok->done = E_TOODEEP;
|
||||
tok->cur = tok->inp;
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
if (altcol <= tok->altindstack[tok->indent]) {
|
||||
return indenterror(tok);
|
||||
return MAKE_TOKEN(indenterror(tok));
|
||||
}
|
||||
tok->pendin++;
|
||||
tok->indstack[++tok->indent] = col;
|
||||
|
@ -1488,10 +1508,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
if (col != tok->indstack[tok->indent]) {
|
||||
tok->done = E_DEDENT;
|
||||
tok->cur = tok->inp;
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
if (altcol != tok->altindstack[tok->indent]) {
|
||||
return indenterror(tok);
|
||||
return MAKE_TOKEN(indenterror(tok));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1503,11 +1523,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
if (tok->pendin != 0) {
|
||||
if (tok->pendin < 0) {
|
||||
tok->pendin++;
|
||||
return DEDENT;
|
||||
return MAKE_TOKEN(DEDENT);
|
||||
}
|
||||
else {
|
||||
tok->pendin--;
|
||||
return INDENT;
|
||||
return MAKE_TOKEN(INDENT);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1587,34 +1607,34 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
&& ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
|
||||
|
||||
if (is_type_ignore) {
|
||||
*p_start = ignore_end;
|
||||
*p_end = tok->cur;
|
||||
p_start = ignore_end;
|
||||
p_end = tok->cur;
|
||||
|
||||
/* If this type ignore is the only thing on the line, consume the newline also. */
|
||||
if (blankline) {
|
||||
tok_nextc(tok);
|
||||
tok->atbol = 1;
|
||||
}
|
||||
return TYPE_IGNORE;
|
||||
return MAKE_TOKEN(TYPE_IGNORE);
|
||||
} else {
|
||||
*p_start = type_start; /* after type_comment_prefix */
|
||||
*p_end = tok->cur;
|
||||
return TYPE_COMMENT;
|
||||
p_start = type_start;
|
||||
p_end = tok->cur;
|
||||
return MAKE_TOKEN(TYPE_COMMENT);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (tok->done == E_INTERACT_STOP) {
|
||||
return ENDMARKER;
|
||||
return MAKE_TOKEN(ENDMARKER);
|
||||
}
|
||||
|
||||
/* Check for EOF and errors now */
|
||||
if (c == EOF) {
|
||||
if (tok->level) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
|
||||
return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN);
|
||||
}
|
||||
|
||||
/* Identifier (most frequent token!) */
|
||||
|
@ -1654,11 +1674,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
}
|
||||
tok_backup(tok, c);
|
||||
if (nonascii && !verify_identifier(tok)) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
|
||||
*p_start = tok->start;
|
||||
*p_end = tok->cur;
|
||||
p_start = tok->start;
|
||||
p_end = tok->cur;
|
||||
|
||||
/* async/await parsing block. */
|
||||
if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
|
||||
|
@ -1673,10 +1693,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
if (!tok->async_hacks || tok->async_def) {
|
||||
/* Always recognize the keywords. */
|
||||
if (memcmp(tok->start, "async", 5) == 0) {
|
||||
return ASYNC;
|
||||
return MAKE_TOKEN(ASYNC);
|
||||
}
|
||||
if (memcmp(tok->start, "await", 5) == 0) {
|
||||
return AWAIT;
|
||||
return MAKE_TOKEN(AWAIT);
|
||||
}
|
||||
}
|
||||
else if (memcmp(tok->start, "async", 5) == 0) {
|
||||
|
@ -1684,13 +1704,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
Look ahead one token to see if that is 'def'. */
|
||||
|
||||
struct tok_state ahead_tok;
|
||||
const char *ahead_tok_start = NULL;
|
||||
const char *ahead_tok_end = NULL;
|
||||
struct token ahead_token;
|
||||
int ahead_tok_kind;
|
||||
|
||||
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
|
||||
ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
|
||||
&ahead_tok_end);
|
||||
ahead_tok_kind = tok_get(&ahead_tok, &ahead_token);
|
||||
|
||||
if (ahead_tok_kind == NAME
|
||||
&& ahead_tok.cur - ahead_tok.start == 3
|
||||
|
@ -1700,12 +1718,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
returning a plain NAME token, return ASYNC. */
|
||||
tok->async_def_indent = tok->indent;
|
||||
tok->async_def = 1;
|
||||
return ASYNC;
|
||||
return MAKE_TOKEN(ASYNC);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return NAME;
|
||||
return MAKE_TOKEN(NAME);
|
||||
}
|
||||
|
||||
/* Newline */
|
||||
|
@ -1714,15 +1732,15 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
if (blankline || tok->level > 0) {
|
||||
goto nextline;
|
||||
}
|
||||
*p_start = tok->start;
|
||||
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
|
||||
p_start = tok->start;
|
||||
p_end = tok->cur - 1; /* Leave '\n' out of the string */
|
||||
tok->cont_line = 0;
|
||||
if (tok->async_def) {
|
||||
/* We're somewhere inside an 'async def' function, and
|
||||
we've encountered a NEWLINE after its signature. */
|
||||
tok->async_def_nl = 1;
|
||||
}
|
||||
return NEWLINE;
|
||||
return MAKE_TOKEN(NEWLINE);
|
||||
}
|
||||
|
||||
/* Period or number starting with period? */
|
||||
|
@ -1733,9 +1751,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
} else if (c == '.') {
|
||||
c = tok_nextc(tok);
|
||||
if (c == '.') {
|
||||
*p_start = tok->start;
|
||||
*p_end = tok->cur;
|
||||
return ELLIPSIS;
|
||||
p_start = tok->start;
|
||||
p_end = tok->cur;
|
||||
return MAKE_TOKEN(ELLIPSIS);
|
||||
}
|
||||
else {
|
||||
tok_backup(tok, c);
|
||||
|
@ -1745,9 +1763,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
else {
|
||||
tok_backup(tok, c);
|
||||
}
|
||||
*p_start = tok->start;
|
||||
*p_end = tok->cur;
|
||||
return DOT;
|
||||
p_start = tok->start;
|
||||
p_end = tok->cur;
|
||||
return MAKE_TOKEN(DOT);
|
||||
}
|
||||
|
||||
/* Number */
|
||||
|
@ -1764,14 +1782,14 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
}
|
||||
if (!isxdigit(c)) {
|
||||
tok_backup(tok, c);
|
||||
return syntaxerror(tok, "invalid hexadecimal literal");
|
||||
return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal"));
|
||||
}
|
||||
do {
|
||||
c = tok_nextc(tok);
|
||||
} while (isxdigit(c));
|
||||
} while (c == '_');
|
||||
if (!verify_end_of_number(tok, c, "hexadecimal")) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
}
|
||||
else if (c == 'o' || c == 'O') {
|
||||
|
@ -1783,12 +1801,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
}
|
||||
if (c < '0' || c >= '8') {
|
||||
if (isdigit(c)) {
|
||||
return syntaxerror(tok,
|
||||
"invalid digit '%c' in octal literal", c);
|
||||
return MAKE_TOKEN(syntaxerror(tok,
|
||||
"invalid digit '%c' in octal literal", c));
|
||||
}
|
||||
else {
|
||||
tok_backup(tok, c);
|
||||
return syntaxerror(tok, "invalid octal literal");
|
||||
return MAKE_TOKEN(syntaxerror(tok, "invalid octal literal"));
|
||||
}
|
||||
}
|
||||
do {
|
||||
|
@ -1796,11 +1814,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
} while ('0' <= c && c < '8');
|
||||
} while (c == '_');
|
||||
if (isdigit(c)) {
|
||||
return syntaxerror(tok,
|
||||
"invalid digit '%c' in octal literal", c);
|
||||
return MAKE_TOKEN(syntaxerror(tok,
|
||||
"invalid digit '%c' in octal literal", c));
|
||||
}
|
||||
if (!verify_end_of_number(tok, c, "octal")) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
}
|
||||
else if (c == 'b' || c == 'B') {
|
||||
|
@ -1812,12 +1830,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
}
|
||||
if (c != '0' && c != '1') {
|
||||
if (isdigit(c)) {
|
||||
return syntaxerror(tok,
|
||||
"invalid digit '%c' in binary literal", c);
|
||||
return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
|
||||
}
|
||||
else {
|
||||
tok_backup(tok, c);
|
||||
return syntaxerror(tok, "invalid binary literal");
|
||||
return MAKE_TOKEN(syntaxerror(tok, "invalid binary literal"));
|
||||
}
|
||||
}
|
||||
do {
|
||||
|
@ -1825,11 +1842,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
} while (c == '0' || c == '1');
|
||||
} while (c == '_');
|
||||
if (isdigit(c)) {
|
||||
return syntaxerror(tok,
|
||||
"invalid digit '%c' in binary literal", c);
|
||||
return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
|
||||
}
|
||||
if (!verify_end_of_number(tok, c, "binary")) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
}
|
||||
else {
|
||||
|
@ -1841,7 +1857,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
c = tok_nextc(tok);
|
||||
if (!isdigit(c)) {
|
||||
tok_backup(tok, c);
|
||||
return syntaxerror(tok, "invalid decimal literal");
|
||||
return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
|
||||
}
|
||||
}
|
||||
if (c != '0') {
|
||||
|
@ -1854,7 +1870,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
nonzero = 1;
|
||||
c = tok_decimal_tail(tok);
|
||||
if (c == 0) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
}
|
||||
if (c == '.') {
|
||||
|
@ -1870,15 +1886,15 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
else if (nonzero) {
|
||||
/* Old-style octal: now disallowed. */
|
||||
tok_backup(tok, c);
|
||||
return syntaxerror_known_range(
|
||||
return MAKE_TOKEN(syntaxerror_known_range(
|
||||
tok, (int)(tok->start + 1 - tok->line_start),
|
||||
(int)(zeros_end - tok->line_start),
|
||||
"leading zeros in decimal integer "
|
||||
"literals are not permitted; "
|
||||
"use an 0o prefix for octal integers");
|
||||
"use an 0o prefix for octal integers"));
|
||||
}
|
||||
if (!verify_end_of_number(tok, c, "decimal")) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1886,7 +1902,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
/* Decimal */
|
||||
c = tok_decimal_tail(tok);
|
||||
if (c == 0) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
{
|
||||
/* Accept floating point numbers. */
|
||||
|
@ -1897,7 +1913,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
if (isdigit(c)) {
|
||||
c = tok_decimal_tail(tok);
|
||||
if (c == 0) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1911,21 +1927,21 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
c = tok_nextc(tok);
|
||||
if (!isdigit(c)) {
|
||||
tok_backup(tok, c);
|
||||
return syntaxerror(tok, "invalid decimal literal");
|
||||
return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
|
||||
}
|
||||
} else if (!isdigit(c)) {
|
||||
tok_backup(tok, c);
|
||||
if (!verify_end_of_number(tok, e, "decimal")) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
tok_backup(tok, e);
|
||||
*p_start = tok->start;
|
||||
*p_end = tok->cur;
|
||||
return NUMBER;
|
||||
p_start = tok->start;
|
||||
p_end = tok->cur;
|
||||
return MAKE_TOKEN(NUMBER);
|
||||
}
|
||||
c = tok_decimal_tail(tok);
|
||||
if (c == 0) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
}
|
||||
if (c == 'j' || c == 'J') {
|
||||
|
@ -1933,18 +1949,18 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
imaginary:
|
||||
c = tok_nextc(tok);
|
||||
if (!verify_end_of_number(tok, c, "imaginary")) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
}
|
||||
else if (!verify_end_of_number(tok, c, "decimal")) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
}
|
||||
}
|
||||
tok_backup(tok, c);
|
||||
*p_start = tok->start;
|
||||
*p_end = tok->cur;
|
||||
return NUMBER;
|
||||
p_start = tok->start;
|
||||
p_end = tok->cur;
|
||||
return MAKE_TOKEN(NUMBER);
|
||||
}
|
||||
|
||||
letter_quote:
|
||||
|
@ -1997,7 +2013,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
if (c != '\n') {
|
||||
tok->done = E_EOFS;
|
||||
}
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
else {
|
||||
syntaxerror(tok, "unterminated string literal (detected at"
|
||||
|
@ -2005,7 +2021,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
if (c != '\n') {
|
||||
tok->done = E_EOLS;
|
||||
}
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
}
|
||||
if (c == quote) {
|
||||
|
@ -2019,15 +2035,15 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
}
|
||||
}
|
||||
|
||||
*p_start = tok->start;
|
||||
*p_end = tok->cur;
|
||||
return STRING;
|
||||
p_start = tok->start;
|
||||
p_end = tok->cur;
|
||||
return MAKE_TOKEN(STRING);
|
||||
}
|
||||
|
||||
/* Line continuation */
|
||||
if (c == '\\') {
|
||||
if ((c = tok_continuation_line(tok)) == -1) {
|
||||
return ERRORTOKEN;
|
||||
return MAKE_TOKEN(ERRORTOKEN);
|
||||
}
|
||||
tok->cont_line = 1;
|
||||
goto again; /* Read next line */
|
||||
|
@ -2036,19 +2052,19 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
/* Check for two-character token */
|
||||
{
|
||||
int c2 = tok_nextc(tok);
|
||||
int token = _PyToken_TwoChars(c, c2);
|
||||
if (token != OP) {
|
||||
int current_token = _PyToken_TwoChars(c, c2);
|
||||
if (current_token != OP) {
|
||||
int c3 = tok_nextc(tok);
|
||||
int token3 = _PyToken_ThreeChars(c, c2, c3);
|
||||
if (token3 != OP) {
|
||||
token = token3;
|
||||
int current_token3 = _PyToken_ThreeChars(c, c2, c3);
|
||||
if (current_token3 != OP) {
|
||||
current_token = current_token3;
|
||||
}
|
||||
else {
|
||||
tok_backup(tok, c3);
|
||||
}
|
||||
*p_start = tok->start;
|
||||
*p_end = tok->cur;
|
||||
return token;
|
||||
p_start = tok->start;
|
||||
p_end = tok->cur;
|
||||
return MAKE_TOKEN(current_token);
|
||||
}
|
||||
tok_backup(tok, c2);
|
||||
}
|
||||
|
@ -2059,7 +2075,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
case '[':
|
||||
case '{':
|
||||
if (tok->level >= MAXLEVEL) {
|
||||
return syntaxerror(tok, "too many nested parentheses");
|
||||
return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses"));
|
||||
}
|
||||
tok->parenstack[tok->level] = c;
|
||||
tok->parenlinenostack[tok->level] = tok->lineno;
|
||||
|
@ -2070,7 +2086,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
case ']':
|
||||
case '}':
|
||||
if (!tok->level) {
|
||||
return syntaxerror(tok, "unmatched '%c'", c);
|
||||
return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c));
|
||||
}
|
||||
tok->level--;
|
||||
int opening = tok->parenstack[tok->level];
|
||||
|
@ -2079,16 +2095,16 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
(opening == '{' && c == '}')))
|
||||
{
|
||||
if (tok->parenlinenostack[tok->level] != tok->lineno) {
|
||||
return syntaxerror(tok,
|
||||
return MAKE_TOKEN(syntaxerror(tok,
|
||||
"closing parenthesis '%c' does not match "
|
||||
"opening parenthesis '%c' on line %d",
|
||||
c, opening, tok->parenlinenostack[tok->level]);
|
||||
c, opening, tok->parenlinenostack[tok->level]));
|
||||
}
|
||||
else {
|
||||
return syntaxerror(tok,
|
||||
return MAKE_TOKEN(syntaxerror(tok,
|
||||
"closing parenthesis '%c' does not match "
|
||||
"opening parenthesis '%c'",
|
||||
c, opening);
|
||||
c, opening));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -2097,20 +2113,19 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
|
|||
if (!Py_UNICODE_ISPRINTABLE(c)) {
|
||||
char hex[9];
|
||||
(void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
|
||||
return syntaxerror(tok, "invalid non-printable character U+%s", hex);
|
||||
return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%s", hex));
|
||||
}
|
||||
|
||||
/* Punctuation character */
|
||||
*p_start = tok->start;
|
||||
*p_end = tok->cur;
|
||||
return _PyToken_OneChar(c);
|
||||
p_start = tok->start;
|
||||
p_end = tok->cur;
|
||||
return MAKE_TOKEN(_PyToken_OneChar(c));
|
||||
}
|
||||
|
||||
int
|
||||
_PyTokenizer_Get(struct tok_state *tok,
|
||||
const char **p_start, const char **p_end)
|
||||
_PyTokenizer_Get(struct tok_state *tok, struct token *token)
|
||||
{
|
||||
int result = tok_get(tok, p_start, p_end);
|
||||
int result = tok_get(tok, token);
|
||||
if (tok->decoding_erred) {
|
||||
result = ERRORTOKEN;
|
||||
tok->done = E_DECODE;
|
||||
|
@ -2166,8 +2181,6 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
|
|||
{
|
||||
struct tok_state *tok;
|
||||
FILE *fp;
|
||||
const char *p_start = NULL;
|
||||
const char *p_end = NULL;
|
||||
char *encoding = NULL;
|
||||
|
||||
fp = fdopen_borrow(fd);
|
||||
|
@ -2191,8 +2204,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
|
|||
return encoding;
|
||||
}
|
||||
}
|
||||
struct token token;
|
||||
while (tok->lineno < 2 && tok->done == E_OK) {
|
||||
_PyTokenizer_Get(tok, &p_start, &p_end);
|
||||
_PyTokenizer_Get(tok, &token);
|
||||
}
|
||||
fclose(fp);
|
||||
if (tok->encoding) {
|
||||
|
|
|
@ -27,6 +27,12 @@ enum interactive_underflow_t {
|
|||
IUNDERFLOW_STOP,
|
||||
};
|
||||
|
||||
struct token {
|
||||
int level;
|
||||
int lineno, col_offset, end_lineno, end_col_offset;
|
||||
const char *start, *end;
|
||||
};
|
||||
|
||||
/* Tokenizer state */
|
||||
struct tok_state {
|
||||
/* Input state; buf <= cur <= inp <= end */
|
||||
|
@ -94,7 +100,7 @@ extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
|
|||
extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
|
||||
const char *, const char *);
|
||||
extern void _PyTokenizer_Free(struct tok_state *);
|
||||
extern int _PyTokenizer_Get(struct tok_state *, const char **, const char **);
|
||||
extern int _PyTokenizer_Get(struct tok_state *, struct token *);
|
||||
|
||||
#define tok_dump _Py_tok_dump
|
||||
|
||||
|
|
|
@ -60,9 +60,8 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source)
|
|||
static PyObject *
|
||||
tokenizeriter_next(tokenizeriterobject *it)
|
||||
{
|
||||
const char *start;
|
||||
const char *end;
|
||||
int type = _PyTokenizer_Get(it->tok, &start, &end);
|
||||
struct token token;
|
||||
int type = _PyTokenizer_Get(it->tok, &token);
|
||||
if (type == ERRORTOKEN && PyErr_Occurred()) {
|
||||
return NULL;
|
||||
}
|
||||
|
@ -71,11 +70,11 @@ tokenizeriter_next(tokenizeriterobject *it)
|
|||
return NULL;
|
||||
}
|
||||
PyObject *str = NULL;
|
||||
if (start == NULL || end == NULL) {
|
||||
if (token.start == NULL || token.end == NULL) {
|
||||
str = PyUnicode_FromString("");
|
||||
}
|
||||
else {
|
||||
str = PyUnicode_FromStringAndSize(start, end - start);
|
||||
str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
|
||||
}
|
||||
if (str == NULL) {
|
||||
return NULL;
|
||||
|
@ -92,11 +91,11 @@ tokenizeriter_next(tokenizeriterobject *it)
|
|||
int end_lineno = it->tok->lineno;
|
||||
int col_offset = -1;
|
||||
int end_col_offset = -1;
|
||||
if (start != NULL && start >= line_start) {
|
||||
col_offset = (int)(start - line_start);
|
||||
if (token.start != NULL && token.start >= line_start) {
|
||||
col_offset = (int)(token.start - line_start);
|
||||
}
|
||||
if (end != NULL && end >= it->tok->line_start) {
|
||||
end_col_offset = (int)(end - it->tok->line_start);
|
||||
if (token.end != NULL && token.end >= it->tok->line_start) {
|
||||
end_col_offset = (int)(token.end - it->tok->line_start);
|
||||
}
|
||||
|
||||
return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
|
||||
|
|
Loading…
Reference in New Issue