cpython/Parser/lexer/state.c

#include "Python.h"
#include "pycore_pystate.h"
#include "pycore_token.h"
#include "errcode.h"

#include <string.h>               // memset()

#include "state.h"

/* Tab width stored into tok->tabsize by _PyTokenizer_tok_new().
   Never change this. */
#define TABSIZE 8

/* Create and initialize a new tok_state structure.

   The structure is allocated with PyMem_Malloc() and zero-filled before the
   defaults below are applied, so members that are not assigned explicitly
   here (for example first_lineno, which _PyLexer_token_setup() reads, and the
   tok_mode_stack entries above index 0) start from a known state instead of
   holding indeterminate heap contents.

   Returns the new structure, or NULL if the allocation fails.  A non-NULL
   result is released with _PyTokenizer_Free(). */
struct tok_state *
_PyTokenizer_tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_Malloc(sizeof(*tok));
    if (tok == NULL) {
        return NULL;
    }
    /* Clear everything first; the explicit assignments that follow document
       the intended defaults and set the non-zero ones. */
    memset(tok, 0, sizeof(*tok));

    tok->buf = tok->cur = tok->inp = NULL;
    tok->fp_interactive = 0;
    tok->interactive_src_start = NULL;
    tok->interactive_src_end = NULL;
    tok->start = NULL;
    tok->end = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->input = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;
    tok->indstack[0] = 0;
    tok->atbol = 1;
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->starting_col_offset = -1;
    tok->col_offset = -1;
    tok->level = 0;
    tok->altindstack[0] = 0;
    tok->decoding_state = STATE_INIT;
    tok->decoding_erred = 0;
    tok->enc = NULL;
    tok->encoding = NULL;
    tok->cont_line = 0;
    tok->filename = NULL;
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
    tok->readline = NULL;
    tok->type_comments = 0;
    tok->interactive_underflow = IUNDERFLOW_NORMAL;
    tok->underflow = NULL;
    tok->str = NULL;
    tok->report_warnings = 1;
    tok->tok_extra_tokens = 0;
    tok->comment_newline = 0;
    tok->implicit_newline = 0;
    /* The mode stack starts with a single regular (non f-string) entry; the
       members not named in the initializer are zero-initialized. */
    tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
    tok->tok_mode_stack_index = 0;
#ifdef Py_DEBUG
    tok->debug = _Py_GetConfig()->parser_debug;
#endif
    return tok;
}

/* Release the last_expr_buffer of every tokenizer mode from the current
   stack index (tok->tok_mode_stack_index) down to index 0.  For each entry
   that holds a buffer, the buffer is freed and the entry's last_expr_*
   bookkeeping is reset; entries without a buffer are left untouched. */
static void
free_fstring_expressions(struct tok_state *tok)
{
    for (int depth = tok->tok_mode_stack_index; depth >= 0; depth--) {
        tokenizer_mode *entry = &tok->tok_mode_stack[depth];

        if (entry->last_expr_buffer == NULL) {
            continue;
        }
        PyMem_Free(entry->last_expr_buffer);
        entry->last_expr_buffer = NULL;
        entry->last_expr_size = 0;
        entry->last_expr_end = -1;
    }
}

/* Free a tok_state structure.

   Drops the references held in decoding_readline, decoding_buffer, readline
   and filename, releases the encoding, input and interactive source buffers,
   frees the f-string expression buffers, and finally frees `tok` itself.
   `tok` must not be used after this call. */
void
_PyTokenizer_Free(struct tok_state *tok)
{
    /* PyMem_Free() is a no-op for NULL, so no guards are needed around the
       unconditional releases in this function. */
    PyMem_Free(tok->encoding);

    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
    Py_XDECREF(tok->readline);
    Py_XDECREF(tok->filename);

    /* buf is released only when a readline object or a FILE* is attached.
       NOTE(review): presumably buf points at memory owned elsewhere in the
       other input modes -- confirm against the tokenizer wrappers.
       Only the pointer value of tok->readline is compared here; it is not
       dereferenced after the Py_XDECREF above. */
    const int has_reader = (tok->readline != NULL) || (tok->fp != NULL);
    if (has_reader && tok->buf != NULL) {
        PyMem_Free(tok->buf);
    }

    PyMem_Free(tok->input);
    PyMem_Free(tok->interactive_src_start);

    free_fstring_expressions(tok);
    PyMem_Free(tok);
}

void
_PyToken_Free(struct token *token) {
    Py_XDECREF(token->metadata);
}

/* Put a token into its initial state: no metadata object attached.
   Only the metadata field is written. */
void
_PyToken_Init(struct token *token)
{
    token->metadata = NULL;
}

/* Fill in `token` using caller-supplied column offsets.

   The nesting level and the line number (used for both lineno and
   end_lineno) are copied from `tok`; col_offset, end_col_offset, start and
   end are stored as given.  Returns `type` unchanged. */
int
_PyLexer_type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
                         int end_col_offset, const char *start, const char *end)
{
    token->level = tok->level;

    /* The token starts and ends on the tokenizer's current line. */
    token->end_lineno = tok->lineno;
    token->lineno = token->end_lineno;

    token->col_offset = col_offset;
    token->end_col_offset = end_col_offset;

    token->start = start;
    token->end = end;

    return type;
}

/* Fill in `token` from the tokenizer's current position.

   `start` and `end` must be both NULL or both non-NULL.  String-literal
   token types (per ISSTRINGLIT) take their starting line from
   tok->first_lineno; every other type starts on tok->lineno.  The end line
   is always tok->lineno.  Column offsets are copied from `tok` when a
   start/end pair is supplied and are -1 otherwise.  Returns `type`
   unchanged. */
int
_PyLexer_token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
{
    assert((start == NULL && end == NULL) || (start != NULL && end != NULL));

    const int has_span = (start != NULL && end != NULL);

    token->level = tok->level;
    token->lineno = ISSTRINGLIT(type) ? tok->first_lineno : tok->lineno;
    token->end_lineno = tok->lineno;

    token->start = start;
    token->end = end;

    /* Without a source span there is no meaningful column information. */
    token->col_offset = has_span ? tok->starting_col_offset : -1;
    token->end_col_offset = has_span ? tok->col_offset : -1;

    return type;
}
gh-104169: Refactor tokenizer into lexer and wrappers (#110684) * The lexer, which includes the actual lexeme-producing logic, goes into the `lexer` directory. * The wrappers, one wrapper per input mode (file, string, utf-8, and readline), go into the `tokenizer` directory and include logic for creating a lexer instance and managing the buffer for different modes. --------- Co-authored-by: Pablo Galindo <pablogsal@gmail.com> Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com> 2023-10-11 12:14:44 -03:00			`#include "Python.h"`
			`#include "pycore_pystate.h"`
			`#include "pycore_token.h"`
			`#include "errcode.h"`

			`#include "state.h"`

			`/* Never change this */`
			`#define TABSIZE 8`

			`/* Create and initialize a new tok_state structure */`
			`struct tok_state *`
			`_PyTokenizer_tok_new(void)`
			`{`
			`struct tok_state *tok = (struct tok_state *)PyMem_Malloc(`
			`sizeof(struct tok_state));`
			`if (tok == NULL)`
			`return NULL;`
			`tok->buf = tok->cur = tok->inp = NULL;`
			`tok->fp_interactive = 0;`
			`tok->interactive_src_start = NULL;`
			`tok->interactive_src_end = NULL;`
			`tok->start = NULL;`
			`tok->end = NULL;`
			`tok->done = E_OK;`
			`tok->fp = NULL;`
			`tok->input = NULL;`
			`tok->tabsize = TABSIZE;`
			`tok->indent = 0;`
			`tok->indstack[0] = 0;`
			`tok->atbol = 1;`
			`tok->pendin = 0;`
			`tok->prompt = tok->nextprompt = NULL;`
			`tok->lineno = 0;`
			`tok->starting_col_offset = -1;`
			`tok->col_offset = -1;`
			`tok->level = 0;`
			`tok->altindstack[0] = 0;`
			`tok->decoding_state = STATE_INIT;`
			`tok->decoding_erred = 0;`
			`tok->enc = NULL;`
			`tok->encoding = NULL;`
			`tok->cont_line = 0;`
			`tok->filename = NULL;`
			`tok->decoding_readline = NULL;`
			`tok->decoding_buffer = NULL;`
			`tok->readline = NULL;`
			`tok->type_comments = 0;`
			`tok->interactive_underflow = IUNDERFLOW_NORMAL;`
			`tok->underflow = NULL;`
			`tok->str = NULL;`
			`tok->report_warnings = 1;`
			`tok->tok_extra_tokens = 0;`
			`tok->comment_newline = 0;`
			`tok->implicit_newline = 0;`
			`tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};`
			`tok->tok_mode_stack_index = 0;`
			`#ifdef Py_DEBUG`
			`tok->debug = _Py_GetConfig()->parser_debug;`
			`#endif`
			`return tok;`
			`}`

			`static void`
			`free_fstring_expressions(struct tok_state *tok)`
			`{`
			`int index;`
			`tokenizer_mode *mode;`

			`for (index = tok->tok_mode_stack_index; index >= 0; --index) {`
			`mode = &(tok->tok_mode_stack[index]);`
			`if (mode->last_expr_buffer != NULL) {`
			`PyMem_Free(mode->last_expr_buffer);`
			`mode->last_expr_buffer = NULL;`
			`mode->last_expr_size = 0;`
			`mode->last_expr_end = -1;`
			`}`
			`}`
			`}`

			`/* Free a tok_state structure */`
			`void`
			`_PyTokenizer_Free(struct tok_state *tok)`
			`{`
			`if (tok->encoding != NULL) {`
			`PyMem_Free(tok->encoding);`
			`}`
			`Py_XDECREF(tok->decoding_readline);`
			`Py_XDECREF(tok->decoding_buffer);`
			`Py_XDECREF(tok->readline);`
			`Py_XDECREF(tok->filename);`
			`if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) {`
			`PyMem_Free(tok->buf);`
			`}`
			`if (tok->input) {`
			`PyMem_Free(tok->input);`
			`}`
			`if (tok->interactive_src_start != NULL) {`
			`PyMem_Free(tok->interactive_src_start);`
			`}`
			`free_fstring_expressions(tok);`
			`PyMem_Free(tok);`
			`}`

			`void`
			`_PyToken_Free(struct token *token) {`
			`Py_XDECREF(token->metadata);`
			`}`

			`void`
			`_PyToken_Init(struct token *token) {`
			`token->metadata = NULL;`
			`}`

			`int`
			`_PyLexer_type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,`
			`int end_col_offset, const char *start, const char *end)`
			`{`
			`token->level = tok->level;`
			`token->lineno = token->end_lineno = tok->lineno;`
			`token->col_offset = col_offset;`
			`token->end_col_offset = end_col_offset;`
			`token->start = start;`
			`token->end = end;`
			`return type;`
			`}`

			`int`
			`_PyLexer_token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)`
			`{`
			`assert((start == NULL && end == NULL) || (start != NULL && end != NULL));`
			`token->level = tok->level;`
			`if (ISSTRINGLIT(type)) {`
			`token->lineno = tok->first_lineno;`
			`}`
			`else {`
			`token->lineno = tok->lineno;`
			`}`
			`token->end_lineno = tok->lineno;`
			`token->col_offset = token->end_col_offset = -1;`
			`token->start = start;`
			`token->end = end;`

			`if (start != NULL && end != NULL) {`
			`token->col_offset = tok->starting_col_offset;`
			`token->end_col_offset = tok->col_offset;`
			`}`
			`return type;`
			`}`