gh-97997: Add col_offset field to tokenizer and use that for AST nodes (#98000)

2022-10-07 14:38:35 -07:00 · 2022-10-07 14:38:35 -07:00 · 3de08ce8c1
parent c06276402b
commit 3de08ce8c1
3 changed files with 44 additions and 11 deletions
--- a/Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst
+++ b/Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst
@ -0,0 +1 @@
+Add running column offset to the tokenizer state to avoid calculating AST column information with pointer arithmetic.
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@ -37,6 +37,11 @@
 #define TABSIZE 8

 #define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end)
+#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
+                type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))
+#define ADVANCE_LINENO() \
+            tok->lineno++; \
+            tok->col_offset = 0;

 /* Forward */
 static struct tok_state *tok_new(void);
@ -73,6 +78,8 @@ tok_new(void)
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
+    tok->starting_col_offset = -1;
+    tok->col_offset = -1;
    tok->level = 0;
    tok->altindstack[0] = 0;
    tok->decoding_state = STATE_INIT;
@ -871,7 +878,7 @@ tok_underflow_string(struct tok_state *tok) {
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
-    tok->lineno++;
+    ADVANCE_LINENO();
    tok->inp = end;
    return 1;
 }
@ -930,7 +937,7 @@ tok_underflow_interactive(struct tok_state *tok) {
    else if (tok->start != NULL) {
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        size_t size = strlen(newtok);
-        tok->lineno++;
+        ADVANCE_LINENO();
        if (!tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
@ -943,7 +950,7 @@ tok_underflow_interactive(struct tok_state *tok) {
        tok->multi_line_start = tok->buf + cur_multi_line_start;
    }
    else {
-        tok->lineno++;
+        ADVANCE_LINENO();
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
@ -998,7 +1005,7 @@ tok_underflow_file(struct tok_state *tok) {
        *tok->inp = '\0';
    }

-    tok->lineno++;
+    ADVANCE_LINENO();
    if (tok->decoding_state != STATE_NORMAL) {
        if (tok->lineno > 2) {
            tok->decoding_state = STATE_NORMAL;
@ -1056,6 +1063,7 @@ tok_nextc(struct tok_state *tok)
    int rc;
    for (;;) {
        if (tok->cur != tok->inp) {
+            tok->col_offset++;
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK) {
@ -1104,6 +1112,7 @@ tok_backup(struct tok_state *tok, int c)
        if ((int)(unsigned char)*tok->cur != c) {
            Py_FatalError("tok_backup: wrong character");
        }
+        tok->col_offset--;
    }
 }

@ -1390,6 +1399,19 @@ tok_continuation_line(struct tok_state *tok) {
    return c;
 }

+/* Populate `token` for a type comment.  Unlike token_setup(), the column
+   offsets come from the arguments, not from the tokenizer state; the token
+   is reported as starting and ending on the tokenizer's current line.
+   Returns `type` unchanged so callers can `return` the result directly. */
+static int
+type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
+                         int end_col_offset, const char *start, const char *end)
+{
+    const int current_line = tok->lineno;
+
+    token->start = start;
+    token->end = end;
+    token->col_offset = col_offset;
+    token->end_col_offset = end_col_offset;
+    token->lineno = current_line;
+    token->end_lineno = current_line;
+    token->level = tok->level;
+    return type;
+}
+
 static int
 token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
 {
@ -1397,14 +1419,13 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
    token->level = tok->level;
    token->lineno = type == STRING ? tok->first_lineno : tok->lineno;
    token->end_lineno = tok->lineno;
-    token->col_offset = -1;
-    token->end_col_offset = -1;
+    token->col_offset = token->end_col_offset = -1;
    token->start = start;
    token->end = end;
+
    if (start != NULL && end != NULL) {
-        const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start;
-        token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1;
-        token->end_col_offset = (end >= tok->line_start) ? (int)(end - tok->line_start) : -1;
+        token->col_offset = tok->starting_col_offset;
+        token->end_col_offset = tok->col_offset;
    }
    return type;
 }
@ -1419,6 +1440,7 @@ tok_get(struct tok_state *tok, struct token *token)
    const char *p_end = NULL;
  nextline:
    tok->start = NULL;
+    tok->starting_col_offset = -1;
    blankline = 0;

    /* Get indentation level */
@ -1518,6 +1540,7 @@ tok_get(struct tok_state *tok, struct token *token)
    }

    tok->start = tok->cur;
+    tok->starting_col_offset = tok->col_offset;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
@ -1565,10 +1588,12 @@ tok_get(struct tok_state *tok, struct token *token)

    /* Set start of current token */
    tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
+    tok->starting_col_offset = tok->col_offset - 1;

    /* Skip comment, unless it's a type comment */
    if (c == '#') {
        const char *prefix, *p, *type_start;
+        int current_starting_col_offset;

        while (c != EOF && c != '\n') {
            c = tok_nextc(tok);
@ -1576,14 +1601,17 @@ tok_get(struct tok_state *tok, struct token *token)

        if (tok->type_comments) {
            p = tok->start;
+            current_starting_col_offset = tok->starting_col_offset;
            prefix = type_comment_prefix;
            while (*prefix && p < tok->cur) {
                if (*prefix == ' ') {
                    while (*p == ' ' || *p == '\t') {
                        p++;
+                        current_starting_col_offset++;
                    }
                } else if (*prefix == *p) {
                    p++;
+                    current_starting_col_offset++;
                } else {
                    break;
                }
@ -1594,7 +1622,9 @@ tok_get(struct tok_state *tok, struct token *token)
            /* This is a type comment if we matched all of type_comment_prefix. */
            if (!*prefix) {
                int is_type_ignore = 1;
+                // +6 in order to skip the word 'ignore'
                const char *ignore_end = p + 6;
+                const int ignore_end_col_offset = current_starting_col_offset + 6;
                tok_backup(tok, c);  /* don't eat the newline or EOF */

                type_start = p;
@ -1615,11 +1645,11 @@ tok_get(struct tok_state *tok, struct token *token)
                        tok_nextc(tok);
                        tok->atbol = 1;
                    }
-                    return MAKE_TOKEN(TYPE_IGNORE);
+                    return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset);
                } else {
                    p_start = type_start;
                    p_end = tok->cur;
-                    return MAKE_TOKEN(TYPE_COMMENT);
+                    return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset);
                }
            }
        }
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@ -57,6 +57,8 @@ struct tok_state {
    int lineno;         /* Current line number */
    int first_lineno;   /* First line of a single line or multi line string
                           expression (cf. issue 16806) */
+    int starting_col_offset; /* The column offset at the beginning of a token */
+    int col_offset;     /* Current col offset */
    int level;          /* () [] {} Parentheses nesting level */
            /* Used to allow free continuations inside them */
    char parenstack[MAXLEVEL];
				`@ -0,0 +1 @@`
				`Add running column offset to the tokenizer state to avoid calculating AST column information with pointer arithmetic.`