/* cpython/Parser/tokenizer.h */

#ifndef Py_TOKENIZER_H
#define Py_TOKENIZER_H
#ifdef __cplusplus
extern "C" {
#endif

#include "object.h"

/* Tokenizer interface */

#include "token.h"      /* For token types */

/* Upper bound on indentation nesting.  It is the element count of both the
   indstack and altindstack arrays in struct tok_state. */
#define MAXINDENT 100   /* Max indentation level */

/* Values held by tok_state.decoding_state (one of the PEP 0263 fields). */
enum decoding_state {
    STATE_INIT   = 0,
    STATE_RAW    = 1,
    STATE_NORMAL = 2    /* a codec is associated with the input */
};

/*
 * Tokenizer state.
 *
 * Input buffer invariant: buf <= cur <= inp <= end.
 * NB: the buffer always holds an entire line.
 */
struct tok_state {
    /* ---- Input buffer ---- */
    char *buf;                  /* Input buffer, or NULL; malloc'ed if
                                   fp != NULL */
    char *cur;                  /* Next character in buffer */
    char *inp;                  /* End of data in buffer */
    char *end;                  /* End of input buffer if buf != NULL */
    char *start;                /* Start of current token if not NULL */

    /* E_OK normally, E_EOF at EOF, otherwise an error code.
       NB: whenever done != E_OK, cur must be == inp. */
    int done;

    FILE *fp;                   /* Rest of input; NULL if tokenizing a
                                   string */

    /* ---- Indentation ---- */
    int tabsize;                /* Tab spacing */
    int indent;                 /* Current indentation index */
    int indstack[MAXINDENT];    /* Stack of indents */
    int atbol;                  /* Nonzero if at begin of new line */
    int pendin;                 /* > 0: pending indents;
                                   < 0: pending dedents */

    /* ---- Interactive prompting ---- */
    const char *prompt;
    const char *nextprompt;

    int lineno;                 /* Current line number */

    /* () [] {} nesting level; free continuations are allowed inside
       them. */
    int level;

#ifndef PGEN
    /* pgen doesn't have access to Python codecs, so it cannot decode the
       input filename.  The bytes filename might be kept, but it is only
       used by indenterror() and is not really needed: pgen only compiles
       one file (Grammar/Grammar). */
    PyObject *filename;
#endif

    /* ---- Checking against a different (alternate) tab size ---- */
    int altwarning;             /* Issue warning if alternate tabs don't
                                   match */
    int alterror;               /* Issue error if alternate tabs don't
                                   match */
    int alttabsize;             /* Alternate tab spacing */
    int altindstack[MAXINDENT]; /* Stack of alternate indents */

    /* ---- PEP 0263 (source encoding) ---- */
    enum decoding_state decoding_state;
    int decoding_erred;         /* whether decoding hit an error */
    int read_coding_spec;       /* whether 'coding:...' has been read */
    char *encoding;             /* Source encoding. */
    int cont_line;              /* whether we are in a continuation line */
    const char *line_start;     /* pointer to start of current line */
#ifndef PGEN
    PyObject *decoding_readline;    /* open(...).readline */
    PyObject *decoding_buffer;
#endif
    const char *enc;            /* Encoding for the current str. */
    const char *str;
    const char *input;          /* Tokenizer's newline translated copy of
                                   the string. */

    /* ---- async/await ----
       These fields can be removed in 3.7, when async and await become
       normal keywords. */
    int async_def;              /* =1 if tokens are inside an 'async def'
                                   body. */
    int async_def_indent;       /* Indentation level of the outermost
                                   'async def'. */
    int async_def_nl;           /* =1 if the outermost 'async def' had at
                                   least one NEWLINE token after it. */
};

/*
 * Tokenizer entry points.
 *
 * The three PyTokenizer_From* functions each return a pointer to a
 * struct tok_state; PyTokenizer_Free and PyTokenizer_Get take one.
 * Parameter names are descriptive only and do not affect the interface.
 * NOTE(review): confirm the names against the definitions in tokenizer.c.
 */
struct tok_state *PyTokenizer_FromString(const char *str, int exec_input);
struct tok_state *PyTokenizer_FromUTF8(const char *str, int exec_input);
struct tok_state *PyTokenizer_FromFile(FILE *fp,
                                       const char *enc,
                                       const char *ps1,
                                       const char *ps2);
void PyTokenizer_Free(struct tok_state *tok);
int PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end);

#ifdef __cplusplus
}
#endif
#endif /* !Py_TOKENIZER_H */
* Added support for X11 modules. * Makefile: change location of FORMS library. * posixmodule.c: turn #if 0 into #ifdef MSDOS (stuff in unistd.h or not) * Almost all .h files: added CPP magic to avoid duplicate inclusions and to support inclusion from C++. 1993-07-28 06:05:47 -03:00			`#ifndef Py_TOKENIZER_H`
			`#define Py_TOKENIZER_H`
			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`

Patch #534304: Implement phase 1 of PEP 263. 2002-08-04 14:29:52 -03:00			`#include "object.h"`
Added copyright notice. 1991-02-19 08:39:46 -04:00
Initial revision 1990-10-14 09:07:46 -03:00			`/* Tokenizer interface */`

Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`#include "token.h" /* For token types */`
Initial revision 1990-10-14 09:07:46 -03:00
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`#define MAXINDENT 100 /* Max indentation level */`
Initial revision 1990-10-14 09:07:46 -03:00
Use an enum for decoding_state. It makes the code a little more understandable. 2007-09-21 17:50:26 -03:00			`enum decoding_state {`
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`STATE_INIT,`
			`STATE_RAW,`
#10222: fix for overzealous AIX compiler. 2010-10-29 01:54:13 -03:00			`STATE_NORMAL /* have a codec associated with input */`
Use an enum for decoding_state. It makes the code a little more understandable. 2007-09-21 17:50:26 -03:00			`};`

Initial revision 1990-10-14 09:07:46 -03:00			`/* Tokenizer state */`
			`struct tok_state {`
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`/* Input state; buf <= cur <= inp <= end */`
			`/* NB an entire line is held in the buffer */`
			`char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */`
			`char *cur; /* Next character in buffer */`
			`char *inp; /* End of data in buffer */`
			`char *end; /* End of input buffer if buf != NULL */`
			`char *start; /* Start of current token if not NULL */`
			`int done; /* E_OK normally, E_EOF at EOF, otherwise error code */`
			`/* NB If done != E_OK, cur must be == inp!!! */`
			`FILE *fp; /* Rest of input; NULL if tokenizing a string */`
			`int tabsize; /* Tab spacing */`
			`int indent; /* Current indentation index */`
			`int indstack[MAXINDENT]; /* Stack of indents */`
			`int atbol; /* Nonzero if at begin of new line */`
			`int pendin; /* Pending indents (if > 0) or dedents (if < 0) */`
Issue #1772673: The type of `char*` arguments now changed to `const char*`. 2013-10-19 15:03:34 -03:00			`const char *prompt, *nextprompt; /* For interactive prompting */`
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`int lineno; /* Current line number */`
			`int level; /* () [] {} Parentheses nesting level */`
			`/* Used to allow free continuations inside them */`
			`/* Stuff for checking on different tab sizes */`
Issue #10785: Store the filename as Unicode in the Python parser. 2011-04-04 19:39:01 -03:00			`#ifndef PGEN`
			`/* pgen doesn't have access to Python codecs, it cannot decode the input`
			`filename. The bytes filename might be kept, but it is only used by`
			`indenterror() and it is not really needed: pgen only compiles one file`
			`(Grammar/Grammar). */`
			`PyObject *filename;`
			`#endif`
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`int altwarning; /* Issue warning if alternate tabs don't match */`
			`int alterror; /* Issue error if alternate tabs don't match */`
			`int alttabsize; /* Alternate tab spacing */`
			`int altindstack[MAXINDENT]; /* Stack of alternate indents */`
			`/* Stuff for PEP 0263 */`
			`enum decoding_state decoding_state;`
			`int decoding_erred; /* whether erred in decoding */`
			`int read_coding_spec; /* whether 'coding:...' has been read */`
			`char *encoding; /* Source encoding. */`
			`int cont_line; /* whether we are in a continuation line. */`
			`const char* line_start; /* pointer to start of current line */`
Make pgen compile with pydebug. Duplicate normalized names, as it may be longer than the old string. 2002-08-04 17:10:29 -03:00			`#ifndef PGEN`
Issue #10095: fp_setreadl() doesn't reopen the file, reuse instead the file descriptor. 2010-10-14 09:04:34 -03:00			`PyObject *decoding_readline; /* open(...).readline */`
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`PyObject *decoding_buffer;`
Make pgen compile with pydebug. Duplicate normalized names, as it may be longer than the old string. 2002-08-04 17:10:29 -03:00			`#endif`
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`const char* enc; /* Encoding for the current str. */`
			`const char* str;`
			`const char* input; /* Tokenizer's newline translated copy of the string. */`
PEP 0492 -- Coroutines with async and await syntax. Issue #24017. 2015-05-11 23:57:16 -03:00
Issue #24619: Simplify async/await tokenization. This commit simplifies async/await tokenization in tokenizer.c, tokenize.py & lib2to3/tokenize.py. Previous solution was to keep a stack of async-def & def blocks, whereas the new approach is just to remember position of the outermost async-def block. This change won't bring any parsing performance improvements, but it makes the code much easier to read and validate. 2015-07-23 09:01:58 -03:00			`/* async/await related fields; can be removed in 3.7 when async and await`
			`become normal keywords. */`
			`int async_def; /* =1 if tokens are inside an 'async def' body. */`
			`int async_def_indent; /* Indentation level of the outermost 'async def'. */`
			`int async_def_nl; /* =1 if the outermost 'async def' had at least one`
			`NEWLINE token after it. */`
Initial revision 1990-10-14 09:07:46 -03:00			`};`

Merged revisions 76230 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r76230 \| benjamin.peterson \| 2009-11-12 17:39:44 -0600 (Thu, 12 Nov 2009) \| 2 lines fix several compile() issues by translating newlines in the tokenizer ........ 2009-11-12 20:17:59 -04:00			`extern struct tok_state *PyTokenizer_FromString(const char *, int);`
			`extern struct tok_state *PyTokenizer_FromUTF8(const char *, int);`
Issue #1772673: The type of `char*` arguments now changed to `const char*`. 2013-10-19 15:03:34 -03:00			`extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*,`
			`const char *, const char *);`
Nuke all remaining occurrences of Py_PROTO and Py_FPROTO. 2000-07-09 00:09:57 -03:00			`extern void PyTokenizer_Free(struct tok_state *);`
			`extern int PyTokenizer_Get(struct tok_state *, char **, char **);`
* Added support for X11 modules. * Makefile: change location of FORMS library. * posixmodule.c: turn #if 0 into #ifdef MSDOS (stuff in unistd.h or not) * Almost all .h files: added CPP magic to avoid duplicate inclusions and to support inclusion from C++. 1993-07-28 06:05:47 -03:00
			`#ifdef __cplusplus`
			`}`
			`#endif`
			`#endif /* !Py_TOKENIZER_H */`