bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814)

Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser.

This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions.
This commit is contained in:
Pablo Galindo 2019-03-01 15:34:44 -08:00 committed by GitHub
parent 7eebbbd5b3
commit 1f24a719e7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 1561 additions and 2765 deletions

2
.gitignore vendored
View File

@ -73,8 +73,6 @@ PCbuild/arm32/
PCbuild/obj/
PCbuild/win32/
.purify
Parser/pgen
Parser/pgen.exe
__pycache__
autom4te.cache
build/

View File

@ -1,18 +0,0 @@
#ifndef Py_METAGRAMMAR_H
#define Py_METAGRAMMAR_H
#ifdef __cplusplus
extern "C" {
#endif
#define MSTART 256
#define RULE 257
#define RHS 258
#define ALT 259
#define ITEM 260
#define ATOM 261
#ifdef __cplusplus
}
#endif
#endif /* !Py_METAGRAMMAR_H */

View File

@ -12,10 +12,7 @@ extern "C" {
typedef struct {
int error;
#ifndef PGEN
/* The filename is useless for pgen, see comment in tok_state structure */
PyObject *filename;
#endif
int lineno;
int offset;
char *text; /* UTF-8-encoded string */

View File

@ -1,18 +0,0 @@
#ifndef Py_PGEN_H
#define Py_PGEN_H
#ifdef __cplusplus
extern "C" {
#endif
/* Parser generator interface */
extern grammar *meta_grammar(void);
struct _node;
extern grammar *pgen(struct _node *);
#ifdef __cplusplus
}
#endif
#endif /* !Py_PGEN_H */

View File

@ -290,40 +290,21 @@ LIBFFI_INCLUDEDIR= @LIBFFI_INCLUDEDIR@
##########################################################################
# Parser
PGEN= Parser/pgen$(EXE)
POBJS= \
Parser/acceler.o \
Parser/grammar1.o \
Parser/listnode.o \
Parser/node.o \
Parser/parser.o \
Parser/bitset.o \
Parser/metagrammar.o \
Parser/firstsets.o \
Parser/grammar.o \
Parser/token.o \
Parser/pgen.o
PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/parsetok.o Parser/tokenizer.o
PGOBJS= \
Objects/obmalloc.o \
Python/dynamic_annotations.o \
Python/mysnprintf.o \
Python/pyctype.o \
Parser/tokenizer_pgen.o \
Parser/printgrammar.o \
Parser/parsetok_pgen.o \
Parser/pgenmain.o
PARSER_HEADERS= \
$(srcdir)/Parser/parser.h \
$(srcdir)/Include/parsetok.h \
$(srcdir)/Parser/tokenizer.h
PGENOBJS= $(POBJS) $(PGOBJS)
##########################################################################
# Python
@ -802,31 +783,18 @@ Python/sysmodule.o: $(srcdir)/Python/sysmodule.c Makefile
$(IO_OBJS): $(IO_H)
$(PGEN): $(PGENOBJS)
$(CC) $(OPT) $(PY_CORE_LDFLAGS) $(PGENOBJS) $(LIBS) -o $(PGEN)
.PHONY: regen-grammar
regen-grammar: $(PGEN)
regen-grammar: regen-token
# Regenerate Include/graminit.h and Python/graminit.c
# from Grammar/Grammar using pgen
@$(MKDIR_P) Include
$(PGEN) $(srcdir)/Grammar/Grammar \
$(PYTHON_FOR_REGEN) -m Parser.pgen $(srcdir)/Grammar/Grammar \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Include/graminit.h.new \
$(srcdir)/Python/graminit.c.new
$(UPDATE_FILE) $(srcdir)/Include/graminit.h $(srcdir)/Include/graminit.h.new
$(UPDATE_FILE) $(srcdir)/Python/graminit.c $(srcdir)/Python/graminit.c.new
Parser/grammar.o: $(srcdir)/Parser/grammar.c \
$(srcdir)/Include/token.h \
$(srcdir)/Include/grammar.h
Parser/metagrammar.o: $(srcdir)/Parser/metagrammar.c
Parser/tokenizer_pgen.o: $(srcdir)/Parser/tokenizer.c
Parser/parsetok_pgen.o: $(srcdir)/Parser/parsetok.c
Parser/printgrammar.o: $(srcdir)/Parser/printgrammar.c
Parser/pgenmain.o: $(srcdir)/Include/parsetok.h
.PHONY=regen-ast
regen-ast:
# Regenerate Include/Python-ast.h using Parser/asdl_c.py -h
@ -1016,7 +984,6 @@ PYTHON_HEADERS= \
$(srcdir)/Include/longobject.h \
$(srcdir)/Include/marshal.h \
$(srcdir)/Include/memoryobject.h \
$(srcdir)/Include/metagrammar.h \
$(srcdir)/Include/methodobject.h \
$(srcdir)/Include/modsupport.h \
$(srcdir)/Include/moduleobject.h \
@ -1028,7 +995,6 @@ PYTHON_HEADERS= \
$(srcdir)/Include/osdefs.h \
$(srcdir)/Include/osmodule.h \
$(srcdir)/Include/patchlevel.h \
$(srcdir)/Include/pgen.h \
$(srcdir)/Include/pgenheaders.h \
$(srcdir)/Include/pyarena.h \
$(srcdir)/Include/pycapsule.h \
@ -1771,7 +1737,7 @@ profile-removal:
rm -f profile-run-stamp
clobber: clean profile-removal
-rm -f $(BUILDPYTHON) $(PGEN) $(LIBRARY) $(LDLIBRARY) $(DLLLIBRARY) \
-rm -f $(BUILDPYTHON) $(LIBRARY) $(LDLIBRARY) $(DLLLIBRARY) \
tags TAGS \
config.cache config.log pyconfig.h Modules/config.c
-rm -rf build platform

View File

@ -0,0 +1,2 @@
Retire pgen and use a modified version of pgen2 to generate the parser.
Patch by Pablo Galindo.

View File

@ -92,14 +92,6 @@ wchar_t *Py_DecodeLocale(const char* arg, size_t *size)
return w;
}
/* Parser/pgenmain.c */
grammar *getgrammar(const char *filename)
{
grammar *g;
__coverity_tainted_data_sink__(filename);
return g;
}
/* Python/marshal.c */
static Py_ssize_t r_string(char *s, Py_ssize_t n, RFILE *p)

View File

@ -161,7 +161,6 @@
<ClInclude Include="..\Include\longobject.h" />
<ClInclude Include="..\Include\marshal.h" />
<ClInclude Include="..\Include\memoryobject.h" />
<ClInclude Include="..\Include\metagrammar.h" />
<ClInclude Include="..\Include\methodobject.h" />
<ClInclude Include="..\Include\modsupport.h" />
<ClInclude Include="..\Include\moduleobject.h" />
@ -175,7 +174,6 @@
<ClInclude Include="..\Include\osmodule.h" />
<ClInclude Include="..\Include\parsetok.h" />
<ClInclude Include="..\Include\patchlevel.h" />
<ClInclude Include="..\Include\pgen.h" />
<ClInclude Include="..\Include\pgenheaders.h" />
<ClInclude Include="..\Include\pyhash.h" />
<ClInclude Include="..\Include\py_curses.h" />
@ -372,12 +370,8 @@
<ClCompile Include="..\Objects\unicodeobject.c" />
<ClCompile Include="..\Objects\weakrefobject.c" />
<ClCompile Include="..\Parser\acceler.c" />
<ClCompile Include="..\Parser\bitset.c" />
<ClCompile Include="..\Parser\firstsets.c" />
<ClCompile Include="..\Parser\grammar.c" />
<ClCompile Include="..\Parser\grammar1.c" />
<ClCompile Include="..\Parser\listnode.c" />
<ClCompile Include="..\Parser\metagrammar.c" />
<ClCompile Include="..\Parser\myreadline.c" />
<ClCompile Include="..\Parser\node.c" />
<ClCompile Include="..\Parser\parser.c" />

View File

@ -234,9 +234,6 @@
<ClInclude Include="..\Include\memoryobject.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\metagrammar.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\methodobject.h">
<Filter>Include</Filter>
</ClInclude>
@ -270,9 +267,6 @@
<ClInclude Include="..\Include\patchlevel.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\pgen.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\pgenheaders.h">
<Filter>Include</Filter>
</ClInclude>
@ -836,24 +830,12 @@
<ClCompile Include="..\Parser\acceler.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\bitset.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\firstsets.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\grammar.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\grammar1.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\listnode.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\metagrammar.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\myreadline.c">
<Filter>Parser</Filter>
</ClCompile>

View File

@ -1,66 +0,0 @@
/* Bitset primitives used by the parser generator */
#include "pgenheaders.h"
#include "bitset.h"
bitset
newbitset(int nbits)
{
int nbytes = NBYTES(nbits);
bitset ss = (char *)PyObject_MALLOC(sizeof(BYTE) * nbytes);
if (ss == NULL)
Py_FatalError("no mem for bitset");
ss += nbytes;
while (--nbytes >= 0)
*--ss = 0;
return ss;
}
void
delbitset(bitset ss)
{
PyObject_FREE(ss);
}
int
addbit(bitset ss, int ibit)
{
int ibyte = BIT2BYTE(ibit);
BYTE mask = BIT2MASK(ibit);
if (ss[ibyte] & mask)
return 0; /* Bit already set */
ss[ibyte] |= mask;
return 1;
}
#if 0 /* Now a macro */
int
testbit(bitset ss, int ibit)
{
return (ss[BIT2BYTE(ibit)] & BIT2MASK(ibit)) != 0;
}
#endif
int
samebitset(bitset ss1, bitset ss2, int nbits)
{
int i;
for (i = NBYTES(nbits); --i >= 0; )
if (*ss1++ != *ss2++)
return 0;
return 1;
}
void
mergebitset(bitset ss1, bitset ss2, int nbits)
{
int i;
for (i = NBYTES(nbits); --i >= 0; )
*ss1++ |= *ss2++;
}

View File

@ -1,113 +0,0 @@
/* Computation of FIRST stets */
#include "pgenheaders.h"
#include "grammar.h"
#include "token.h"
extern int Py_DebugFlag;
/* Forward */
static void calcfirstset(grammar *, dfa *);
void
addfirstsets(grammar *g)
{
int i;
dfa *d;
if (Py_DebugFlag)
printf("Adding FIRST sets ...\n");
for (i = 0; i < g->g_ndfas; i++) {
d = &g->g_dfa[i];
if (d->d_first == NULL)
calcfirstset(g, d);
}
}
static void
calcfirstset(grammar *g, dfa *d)
{
int i, j;
state *s;
arc *a;
int nsyms;
int *sym;
int nbits;
static bitset dummy;
bitset result;
int type;
dfa *d1;
label *l0;
if (Py_DebugFlag)
printf("Calculate FIRST set for '%s'\n", d->d_name);
if (dummy == NULL)
dummy = newbitset(1);
if (d->d_first == dummy) {
fprintf(stderr, "Left-recursion for '%s'\n", d->d_name);
return;
}
if (d->d_first != NULL) {
fprintf(stderr, "Re-calculating FIRST set for '%s' ???\n",
d->d_name);
}
d->d_first = dummy;
l0 = g->g_ll.ll_label;
nbits = g->g_ll.ll_nlabels;
result = newbitset(nbits);
sym = (int *)PyObject_MALLOC(sizeof(int));
if (sym == NULL)
Py_FatalError("no mem for new sym in calcfirstset");
nsyms = 1;
sym[0] = findlabel(&g->g_ll, d->d_type, (char *)NULL);
s = &d->d_state[d->d_initial];
for (i = 0; i < s->s_narcs; i++) {
a = &s->s_arc[i];
for (j = 0; j < nsyms; j++) {
if (sym[j] == a->a_lbl)
break;
}
if (j >= nsyms) { /* New label */
sym = (int *)PyObject_REALLOC(sym,
sizeof(int) * (nsyms + 1));
if (sym == NULL)
Py_FatalError(
"no mem to resize sym in calcfirstset");
sym[nsyms++] = a->a_lbl;
type = l0[a->a_lbl].lb_type;
if (ISNONTERMINAL(type)) {
d1 = PyGrammar_FindDFA(g, type);
if (d1->d_first == dummy) {
fprintf(stderr,
"Left-recursion below '%s'\n",
d->d_name);
}
else {
if (d1->d_first == NULL)
calcfirstset(g, d1);
mergebitset(result,
d1->d_first, nbits);
}
}
else if (ISTERMINAL(type)) {
addbit(result, a->a_lbl);
}
}
}
d->d_first = result;
if (Py_DebugFlag) {
printf("FIRST set for '%s': {", d->d_name);
for (i = 0; i < nbits; i++) {
if (testbit(result, i))
printf(" %s", PyGrammar_LabelRepr(&l0[i]));
}
printf(" }\n");
}
PyObject_FREE(sym);
}

View File

@ -1,273 +0,0 @@
/* Grammar implementation */
#include "Python.h"
#include "pgenheaders.h"
#include <ctype.h>
#include "token.h"
#include "grammar.h"
extern int Py_DebugFlag;
grammar *
newgrammar(int start)
{
grammar *g;
g = (grammar *)PyObject_MALLOC(sizeof(grammar));
if (g == NULL)
Py_FatalError("no mem for new grammar");
g->g_ndfas = 0;
g->g_dfa = NULL;
g->g_start = start;
g->g_ll.ll_nlabels = 0;
g->g_ll.ll_label = NULL;
g->g_accel = 0;
return g;
}
void
freegrammar(grammar *g)
{
int i;
for (i = 0; i < g->g_ndfas; i++) {
free(g->g_dfa[i].d_name);
for (int j = 0; j < g->g_dfa[i].d_nstates; j++)
PyObject_FREE(g->g_dfa[i].d_state[j].s_arc);
PyObject_FREE(g->g_dfa[i].d_state);
}
PyObject_FREE(g->g_dfa);
for (i = 0; i < g->g_ll.ll_nlabels; i++)
free(g->g_ll.ll_label[i].lb_str);
PyObject_FREE(g->g_ll.ll_label);
PyObject_FREE(g);
}
dfa *
adddfa(grammar *g, int type, const char *name)
{
dfa *d;
g->g_dfa = (dfa *)PyObject_REALLOC(g->g_dfa,
sizeof(dfa) * (g->g_ndfas + 1));
if (g->g_dfa == NULL)
Py_FatalError("no mem to resize dfa in adddfa");
d = &g->g_dfa[g->g_ndfas++];
d->d_type = type;
d->d_name = strdup(name);
d->d_nstates = 0;
d->d_state = NULL;
d->d_initial = -1;
d->d_first = NULL;
return d; /* Only use while fresh! */
}
int
addstate(dfa *d)
{
state *s;
d->d_state = (state *)PyObject_REALLOC(d->d_state,
sizeof(state) * (d->d_nstates + 1));
if (d->d_state == NULL)
Py_FatalError("no mem to resize state in addstate");
s = &d->d_state[d->d_nstates++];
s->s_narcs = 0;
s->s_arc = NULL;
s->s_lower = 0;
s->s_upper = 0;
s->s_accel = NULL;
s->s_accept = 0;
return Py_SAFE_DOWNCAST(s - d->d_state, intptr_t, int);
}
void
addarc(dfa *d, int from, int to, int lbl)
{
state *s;
arc *a;
assert(0 <= from && from < d->d_nstates);
assert(0 <= to && to < d->d_nstates);
s = &d->d_state[from];
s->s_arc = (arc *)PyObject_REALLOC(s->s_arc, sizeof(arc) * (s->s_narcs + 1));
if (s->s_arc == NULL)
Py_FatalError("no mem to resize arc list in addarc");
a = &s->s_arc[s->s_narcs++];
a->a_lbl = lbl;
a->a_arrow = to;
}
int
addlabel(labellist *ll, int type, const char *str)
{
int i;
label *lb;
for (i = 0; i < ll->ll_nlabels; i++) {
if (ll->ll_label[i].lb_type == type &&
strcmp(ll->ll_label[i].lb_str, str) == 0)
return i;
}
ll->ll_label = (label *)PyObject_REALLOC(ll->ll_label,
sizeof(label) * (ll->ll_nlabels + 1));
if (ll->ll_label == NULL)
Py_FatalError("no mem to resize labellist in addlabel");
lb = &ll->ll_label[ll->ll_nlabels++];
lb->lb_type = type;
lb->lb_str = strdup(str);
if (Py_DebugFlag)
printf("Label @ %8p, %d: %s\n", ll, ll->ll_nlabels,
PyGrammar_LabelRepr(lb));
return Py_SAFE_DOWNCAST(lb - ll->ll_label, intptr_t, int);
}
/* Same, but rather dies than adds */
int
findlabel(labellist *ll, int type, const char *str)
{
int i;
for (i = 0; i < ll->ll_nlabels; i++) {
if (ll->ll_label[i].lb_type == type /*&&
strcmp(ll->ll_label[i].lb_str, str) == 0*/)
return i;
}
fprintf(stderr, "Label %d/'%s' not found\n", type, str);
Py_FatalError("grammar.c:findlabel()");
/* Py_FatalError() is declared with __attribute__((__noreturn__)).
GCC emits a warning without "return 0;" (compiler bug!), but Clang is
smarter and emits a warning on the return... */
#ifndef __clang__
return 0; /* Make gcc -Wall happy */
#endif
}
/* Forward */
static void translabel(grammar *, label *);
void
translatelabels(grammar *g)
{
int i;
#ifdef Py_DEBUG
printf("Translating labels ...\n");
#endif
/* Don't translate EMPTY */
for (i = EMPTY+1; i < g->g_ll.ll_nlabels; i++)
translabel(g, &g->g_ll.ll_label[i]);
}
static void
translabel(grammar *g, label *lb)
{
int i;
if (Py_DebugFlag)
printf("Translating label %s ...\n", PyGrammar_LabelRepr(lb));
if (lb->lb_type == NAME) {
for (i = 0; i < g->g_ndfas; i++) {
if (strcmp(lb->lb_str, g->g_dfa[i].d_name) == 0) {
if (Py_DebugFlag)
printf(
"Label %s is non-terminal %d.\n",
lb->lb_str,
g->g_dfa[i].d_type);
lb->lb_type = g->g_dfa[i].d_type;
free(lb->lb_str);
lb->lb_str = NULL;
return;
}
}
for (i = 0; i < (int)N_TOKENS; i++) {
if (strcmp(lb->lb_str, _PyParser_TokenNames[i]) == 0) {
if (Py_DebugFlag)
printf("Label %s is terminal %d.\n",
lb->lb_str, i);
lb->lb_type = i;
free(lb->lb_str);
lb->lb_str = NULL;
return;
}
}
printf("Can't translate NAME label '%s'\n", lb->lb_str);
return;
}
if (lb->lb_type == STRING) {
if (isalpha(Py_CHARMASK(lb->lb_str[1])) ||
lb->lb_str[1] == '_') {
char *p;
char *src;
char *dest;
size_t name_len;
if (Py_DebugFlag)
printf("Label %s is a keyword\n", lb->lb_str);
lb->lb_type = NAME;
src = lb->lb_str + 1;
p = strchr(src, '\'');
if (p)
name_len = p - src;
else
name_len = strlen(src);
dest = (char *)malloc(name_len + 1);
if (!dest) {
printf("Can't alloc dest '%s'\n", src);
return;
}
strncpy(dest, src, name_len);
dest[name_len] = '\0';
free(lb->lb_str);
lb->lb_str = dest;
}
else if (lb->lb_str[2] == lb->lb_str[0]) {
int type = (int) PyToken_OneChar(lb->lb_str[1]);
if (type != OP) {
lb->lb_type = type;
free(lb->lb_str);
lb->lb_str = NULL;
}
else
printf("Unknown OP label %s\n",
lb->lb_str);
}
else if (lb->lb_str[2] && lb->lb_str[3] == lb->lb_str[0]) {
int type = (int) PyToken_TwoChars(lb->lb_str[1],
lb->lb_str[2]);
if (type != OP) {
lb->lb_type = type;
free(lb->lb_str);
lb->lb_str = NULL;
}
else
printf("Unknown OP label %s\n",
lb->lb_str);
}
else if (lb->lb_str[2] && lb->lb_str[3] && lb->lb_str[4] == lb->lb_str[0]) {
int type = (int) PyToken_ThreeChars(lb->lb_str[1],
lb->lb_str[2],
lb->lb_str[3]);
if (type != OP) {
lb->lb_type = type;
free(lb->lb_str);
lb->lb_str = NULL;
}
else
printf("Unknown OP label %s\n",
lb->lb_str);
}
else
printf("Can't translate STRING label %s\n",
lb->lb_str);
}
else
printf("Can't translate label '%s'\n",
PyGrammar_LabelRepr(lb));
}

View File

@ -1,159 +0,0 @@
#include "pgenheaders.h"
#include "metagrammar.h"
#include "grammar.h"
#include "pgen.h"
static arc arcs_0_0[3] = {
{2, 0},
{3, 0},
{4, 1},
};
static arc arcs_0_1[1] = {
{0, 1},
};
static state states_0[2] = {
{3, arcs_0_0},
{1, arcs_0_1},
};
static arc arcs_1_0[1] = {
{5, 1},
};
static arc arcs_1_1[1] = {
{6, 2},
};
static arc arcs_1_2[1] = {
{7, 3},
};
static arc arcs_1_3[1] = {
{3, 4},
};
static arc arcs_1_4[1] = {
{0, 4},
};
static state states_1[5] = {
{1, arcs_1_0},
{1, arcs_1_1},
{1, arcs_1_2},
{1, arcs_1_3},
{1, arcs_1_4},
};
static arc arcs_2_0[1] = {
{8, 1},
};
static arc arcs_2_1[2] = {
{9, 0},
{0, 1},
};
static state states_2[2] = {
{1, arcs_2_0},
{2, arcs_2_1},
};
static arc arcs_3_0[1] = {
{10, 1},
};
static arc arcs_3_1[2] = {
{10, 1},
{0, 1},
};
static state states_3[2] = {
{1, arcs_3_0},
{2, arcs_3_1},
};
static arc arcs_4_0[2] = {
{11, 1},
{13, 2},
};
static arc arcs_4_1[1] = {
{7, 3},
};
static arc arcs_4_2[3] = {
{14, 4},
{15, 4},
{0, 2},
};
static arc arcs_4_3[1] = {
{12, 4},
};
static arc arcs_4_4[1] = {
{0, 4},
};
static state states_4[5] = {
{2, arcs_4_0},
{1, arcs_4_1},
{3, arcs_4_2},
{1, arcs_4_3},
{1, arcs_4_4},
};
static arc arcs_5_0[3] = {
{5, 1},
{16, 1},
{17, 2},
};
static arc arcs_5_1[1] = {
{0, 1},
};
static arc arcs_5_2[1] = {
{7, 3},
};
static arc arcs_5_3[1] = {
{18, 1},
};
static state states_5[4] = {
{3, arcs_5_0},
{1, arcs_5_1},
{1, arcs_5_2},
{1, arcs_5_3},
};
static dfa dfas[6] = {
{256, "MSTART", 0, 2, states_0,
"\070\000\000"},
{257, "RULE", 0, 5, states_1,
"\040\000\000"},
{258, "RHS", 0, 2, states_2,
"\040\010\003"},
{259, "ALT", 0, 2, states_3,
"\040\010\003"},
{260, "ITEM", 0, 5, states_4,
"\040\010\003"},
{261, "ATOM", 0, 4, states_5,
"\040\000\003"},
};
static label labels[19] = {
{0, "EMPTY"},
{256, 0},
{257, 0},
{4, 0},
{0, 0},
{1, 0},
{11, 0},
{258, 0},
{259, 0},
{18, 0},
{260, 0},
{9, 0},
{10, 0},
{261, 0},
{16, 0},
{14, 0},
{3, 0},
{7, 0},
{8, 0},
};
static grammar _PyParser_Grammar = {
6,
dfas,
{19, labels},
256
};
grammar *
meta_grammar(void)
{
return &_PyParser_Grammar;
}
grammar *
Py_meta_grammar(void)
{
return meta_grammar();
}

View File

@ -99,10 +99,8 @@ PyParser_ParseStringObject(const char *s, PyObject *filename,
tok->type_comments = 1;
}
#ifndef PGEN
Py_INCREF(err_ret->filename);
tok->filename = err_ret->filename;
#endif
return parsetok(tok, g, start, err_ret, flags);
}
@ -113,7 +111,6 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str,
{
node *n;
PyObject *filename = NULL;
#ifndef PGEN
if (filename_str != NULL) {
filename = PyUnicode_DecodeFSDefault(filename_str);
if (filename == NULL) {
@ -121,11 +118,8 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str,
return NULL;
}
}
#endif
n = PyParser_ParseStringObject(s, filename, g, start, err_ret, flags);
#ifndef PGEN
Py_XDECREF(filename);
#endif
return n;
}
@ -169,10 +163,8 @@ PyParser_ParseFileObject(FILE *fp, PyObject *filename,
if (*flags & PyPARSE_TYPE_COMMENTS) {
tok->type_comments = 1;
}
#ifndef PGEN
Py_INCREF(err_ret->filename);
tok->filename = err_ret->filename;
#endif
return parsetok(tok, g, start, err_ret, flags);
}
@ -184,7 +176,6 @@ PyParser_ParseFileFlagsEx(FILE *fp, const char *filename,
{
node *n;
PyObject *fileobj = NULL;
#ifndef PGEN
if (filename != NULL) {
fileobj = PyUnicode_DecodeFSDefault(filename);
if (fileobj == NULL) {
@ -192,12 +183,9 @@ PyParser_ParseFileFlagsEx(FILE *fp, const char *filename,
return NULL;
}
}
#endif
n = PyParser_ParseFileObject(fp, fileobj, enc, g,
start, ps1, ps2, err_ret, flags);
#ifndef PGEN
Py_XDECREF(fileobj);
#endif
return n;
}
@ -371,7 +359,6 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
}
}
#ifndef PGEN
/* Check that the source for a single input statement really
is a single statement by looking at what is left in the
buffer after parsing. Trailing whitespace and comments
@ -399,7 +386,6 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
c = *++cur;
}
}
#endif
}
else
n = NULL;
@ -470,7 +456,6 @@ initerr(perrdetail *err_ret, PyObject *filename)
err_ret->text = NULL;
err_ret->token = -1;
err_ret->expected = -1;
#ifndef PGEN
if (filename) {
Py_INCREF(filename);
err_ret->filename = filename;
@ -482,6 +467,5 @@ initerr(perrdetail *err_ret, PyObject *filename)
return -1;
}
}
#endif
return 0;
}

View File

@ -1,2 +0,0 @@
#define PGEN
#include "parsetok.c"

View File

@ -1,724 +0,0 @@
/* Parser generator */
/* For a description, see the comments at end of this file */
#include "Python.h"
#include "pgenheaders.h"
#include "token.h"
#include "node.h"
#include "grammar.h"
#include "metagrammar.h"
#include "pgen.h"
extern int Py_DebugFlag;
extern int Py_IgnoreEnvironmentFlag; /* needed by Py_GETENV */
/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
typedef struct _nfaarc {
int ar_label;
int ar_arrow;
} nfaarc;
typedef struct _nfastate {
int st_narcs;
nfaarc *st_arc;
} nfastate;
typedef struct _nfa {
int nf_type;
char *nf_name;
int nf_nstates;
nfastate *nf_state;
int nf_start, nf_finish;
} nfa;
/* Forward */
static void compile_rhs(labellist *ll,
nfa *nf, node *n, int *pa, int *pb);
static void compile_alt(labellist *ll,
nfa *nf, node *n, int *pa, int *pb);
static void compile_item(labellist *ll,
nfa *nf, node *n, int *pa, int *pb);
static void compile_atom(labellist *ll,
nfa *nf, node *n, int *pa, int *pb);
static int
addnfastate(nfa *nf)
{
nfastate *st;
nf->nf_state = (nfastate *)PyObject_REALLOC(nf->nf_state,
sizeof(nfastate) * (nf->nf_nstates + 1));
if (nf->nf_state == NULL)
Py_FatalError("out of mem");
st = &nf->nf_state[nf->nf_nstates++];
st->st_narcs = 0;
st->st_arc = NULL;
return st - nf->nf_state;
}
static void
addnfaarc(nfa *nf, int from, int to, int lbl)
{
nfastate *st;
nfaarc *ar;
st = &nf->nf_state[from];
st->st_arc = (nfaarc *)PyObject_REALLOC(st->st_arc,
sizeof(nfaarc) * (st->st_narcs + 1));
if (st->st_arc == NULL)
Py_FatalError("out of mem");
ar = &st->st_arc[st->st_narcs++];
ar->ar_label = lbl;
ar->ar_arrow = to;
}
static nfa *
newnfa(char *name)
{
nfa *nf;
static int type = NT_OFFSET; /* All types will be disjunct */
nf = (nfa *)PyObject_MALLOC(sizeof(nfa));
if (nf == NULL)
Py_FatalError("no mem for new nfa");
nf->nf_type = type++;
nf->nf_name = name; /* XXX strdup(name) ??? */
nf->nf_nstates = 0;
nf->nf_state = NULL;
nf->nf_start = nf->nf_finish = -1;
return nf;
}
typedef struct _nfagrammar {
int gr_nnfas;
nfa **gr_nfa;
labellist gr_ll;
} nfagrammar;
/* Forward */
static void compile_rule(nfagrammar *gr, node *n);
static nfagrammar *
newnfagrammar(void)
{
nfagrammar *gr;
gr = (nfagrammar *)PyObject_MALLOC(sizeof(nfagrammar));
if (gr == NULL)
Py_FatalError("no mem for new nfa grammar");
gr->gr_nnfas = 0;
gr->gr_nfa = NULL;
gr->gr_ll.ll_nlabels = 0;
gr->gr_ll.ll_label = NULL;
addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
return gr;
}
static void
freenfagrammar(nfagrammar *gr)
{
for (int i = 0; i < gr->gr_nnfas; i++) {
PyObject_FREE(gr->gr_nfa[i]->nf_state);
}
PyObject_FREE(gr->gr_nfa);
PyObject_FREE(gr);
}
static nfa *
addnfa(nfagrammar *gr, char *name)
{
nfa *nf;
nf = newnfa(name);
gr->gr_nfa = (nfa **)PyObject_REALLOC(gr->gr_nfa,
sizeof(nfa*) * (gr->gr_nnfas + 1));
if (gr->gr_nfa == NULL)
Py_FatalError("out of mem");
gr->gr_nfa[gr->gr_nnfas++] = nf;
addlabel(&gr->gr_ll, NAME, nf->nf_name);
return nf;
}
#ifdef Py_DEBUG
static const char REQNFMT[] = "metacompile: less than %d children\n";
#define REQN(i, count) do { \
if (i < count) { \
fprintf(stderr, REQNFMT, count); \
Py_FatalError("REQN"); \
} \
} while (0)
#else
#define REQN(i, count) /* empty */
#endif
static nfagrammar *
metacompile(node *n)
{
nfagrammar *gr;
int i;
if (Py_DebugFlag)
printf("Compiling (meta-) parse tree into NFA grammar\n");
gr = newnfagrammar();
REQ(n, MSTART);
i = n->n_nchildren - 1; /* Last child is ENDMARKER */
n = n->n_child;
for (; --i >= 0; n++) {
if (n->n_type != NEWLINE)
compile_rule(gr, n);
}
return gr;
}
static void
compile_rule(nfagrammar *gr, node *n)
{
nfa *nf;
REQ(n, RULE);
REQN(n->n_nchildren, 4);
n = n->n_child;
REQ(n, NAME);
nf = addnfa(gr, n->n_str);
n++;
REQ(n, COLON);
n++;
REQ(n, RHS);
compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
n++;
REQ(n, NEWLINE);
}
static void
compile_rhs(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
{
int i;
int a, b;
REQ(n, RHS);
i = n->n_nchildren;
REQN(i, 1);
n = n->n_child;
REQ(n, ALT);
compile_alt(ll, nf, n, pa, pb);
if (--i <= 0)
return;
n++;
a = *pa;
b = *pb;
*pa = addnfastate(nf);
*pb = addnfastate(nf);
addnfaarc(nf, *pa, a, EMPTY);
addnfaarc(nf, b, *pb, EMPTY);
for (; --i >= 0; n++) {
REQ(n, VBAR);
REQN(i, 1);
--i;
n++;
REQ(n, ALT);
compile_alt(ll, nf, n, &a, &b);
addnfaarc(nf, *pa, a, EMPTY);
addnfaarc(nf, b, *pb, EMPTY);
}
}
static void
compile_alt(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
{
int i;
int a, b;
REQ(n, ALT);
i = n->n_nchildren;
REQN(i, 1);
n = n->n_child;
REQ(n, ITEM);
compile_item(ll, nf, n, pa, pb);
--i;
n++;
for (; --i >= 0; n++) {
REQ(n, ITEM);
compile_item(ll, nf, n, &a, &b);
addnfaarc(nf, *pb, a, EMPTY);
*pb = b;
}
}
static void
compile_item(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
{
int i;
int a, b;
REQ(n, ITEM);
i = n->n_nchildren;
REQN(i, 1);
n = n->n_child;
if (n->n_type == LSQB) {
REQN(i, 3);
n++;
REQ(n, RHS);
*pa = addnfastate(nf);
*pb = addnfastate(nf);
addnfaarc(nf, *pa, *pb, EMPTY);
compile_rhs(ll, nf, n, &a, &b);
addnfaarc(nf, *pa, a, EMPTY);
addnfaarc(nf, b, *pb, EMPTY);
REQN(i, 1);
n++;
REQ(n, RSQB);
}
else {
compile_atom(ll, nf, n, pa, pb);
if (--i <= 0)
return;
n++;
addnfaarc(nf, *pb, *pa, EMPTY);
if (n->n_type == STAR)
*pb = *pa;
else
REQ(n, PLUS);
}
}
static void
compile_atom(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
{
int i;
REQ(n, ATOM);
i = n->n_nchildren;
(void)i; /* Don't warn about set but unused */
REQN(i, 1);
n = n->n_child;
if (n->n_type == LPAR) {
REQN(i, 3);
n++;
REQ(n, RHS);
compile_rhs(ll, nf, n, pa, pb);
n++;
REQ(n, RPAR);
}
else if (n->n_type == NAME || n->n_type == STRING) {
*pa = addnfastate(nf);
*pb = addnfastate(nf);
addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
}
else
REQ(n, NAME);
}
static void
dumpstate(labellist *ll, nfa *nf, int istate)
{
nfastate *st;
int i;
nfaarc *ar;
printf("%c%2d%c",
istate == nf->nf_start ? '*' : ' ',
istate,
istate == nf->nf_finish ? '.' : ' ');
st = &nf->nf_state[istate];
ar = st->st_arc;
for (i = 0; i < st->st_narcs; i++) {
if (i > 0)
printf("\n ");
printf("-> %2d %s", ar->ar_arrow,
PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label]));
ar++;
}
printf("\n");
}
static void
dumpnfa(labellist *ll, nfa *nf)
{
int i;
printf("NFA '%s' has %d states; start %d, finish %d\n",
nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
for (i = 0; i < nf->nf_nstates; i++)
dumpstate(ll, nf, i);
}
/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
static void
addclosure(bitset ss, nfa *nf, int istate)
{
if (addbit(ss, istate)) {
nfastate *st = &nf->nf_state[istate];
nfaarc *ar = st->st_arc;
int i;
for (i = st->st_narcs; --i >= 0; ) {
if (ar->ar_label == EMPTY)
addclosure(ss, nf, ar->ar_arrow);
ar++;
}
}
}
typedef struct _ss_arc {
bitset sa_bitset;
int sa_arrow;
int sa_label;
} ss_arc;
typedef struct _ss_state {
bitset ss_ss;
int ss_narcs;
struct _ss_arc *ss_arc;
int ss_deleted;
int ss_finish;
int ss_rename;
} ss_state;
typedef struct _ss_dfa {
int sd_nstates;
ss_state *sd_state;
} ss_dfa;
/* Forward */
static void printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
labellist *ll, const char *msg);
static void simplify(int xx_nstates, ss_state *xx_state);
static void convert(dfa *d, int xx_nstates, ss_state *xx_state);
static void
makedfa(nfagrammar *gr, nfa *nf, dfa *d)
{
int nbits = nf->nf_nstates;
bitset ss;
int xx_nstates;
ss_state *xx_state, *yy;
ss_arc *zz;
int istate, jstate, iarc, jarc, ibit;
nfastate *st;
nfaarc *ar;
ss = newbitset(nbits);
addclosure(ss, nf, nf->nf_start);
xx_state = (ss_state *)PyObject_MALLOC(sizeof(ss_state));
if (xx_state == NULL)
Py_FatalError("no mem for xx_state in makedfa");
xx_nstates = 1;
yy = &xx_state[0];
yy->ss_ss = ss;
yy->ss_narcs = 0;
yy->ss_arc = NULL;
yy->ss_deleted = 0;
yy->ss_finish = testbit(ss, nf->nf_finish);
if (yy->ss_finish)
printf("Error: nonterminal '%s' may produce empty.\n",
nf->nf_name);
/* This algorithm is from a book written before
the invention of structured programming... */
/* For each unmarked state... */
for (istate = 0; istate < xx_nstates; ++istate) {
size_t size;
yy = &xx_state[istate];
ss = yy->ss_ss;
/* For all its states... */
for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
if (!testbit(ss, ibit))
continue;
st = &nf->nf_state[ibit];
/* For all non-empty arcs from this state... */
for (iarc = 0; iarc < st->st_narcs; iarc++) {
ar = &st->st_arc[iarc];
if (ar->ar_label == EMPTY)
continue;
/* Look up in list of arcs from this state */
for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
zz = &yy->ss_arc[jarc];
if (ar->ar_label == zz->sa_label)
goto found;
}
/* Add new arc for this state */
size = sizeof(ss_arc) * (yy->ss_narcs + 1);
yy->ss_arc = (ss_arc *)PyObject_REALLOC(
yy->ss_arc, size);
if (yy->ss_arc == NULL)
Py_FatalError("out of mem");
zz = &yy->ss_arc[yy->ss_narcs++];
zz->sa_label = ar->ar_label;
zz->sa_bitset = newbitset(nbits);
zz->sa_arrow = -1;
found: ;
/* Add destination */
addclosure(zz->sa_bitset, nf, ar->ar_arrow);
}
}
/* Now look up all the arrow states */
for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
zz = &xx_state[istate].ss_arc[jarc];
for (jstate = 0; jstate < xx_nstates; jstate++) {
if (samebitset(zz->sa_bitset,
xx_state[jstate].ss_ss, nbits)) {
zz->sa_arrow = jstate;
goto done;
}
}
size = sizeof(ss_state) * (xx_nstates + 1);
xx_state = (ss_state *)PyObject_REALLOC(xx_state,
size);
if (xx_state == NULL)
Py_FatalError("out of mem");
zz->sa_arrow = xx_nstates;
yy = &xx_state[xx_nstates++];
yy->ss_ss = zz->sa_bitset;
yy->ss_narcs = 0;
yy->ss_arc = NULL;
yy->ss_deleted = 0;
yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
done: ;
}
}
if (Py_DebugFlag)
printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
"before minimizing");
simplify(xx_nstates, xx_state);
if (Py_DebugFlag)
printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
"after minimizing");
convert(d, xx_nstates, xx_state);
for (int i = 0; i < xx_nstates; i++) {
for (int j = 0; j < xx_state[i].ss_narcs; j++)
delbitset(xx_state[i].ss_arc[j].sa_bitset);
PyObject_FREE(xx_state[i].ss_arc);
}
PyObject_FREE(xx_state);
}
static void
printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
labellist *ll, const char *msg)
{
int i, ibit, iarc;
ss_state *yy;
ss_arc *zz;
printf("Subset DFA %s\n", msg);
for (i = 0; i < xx_nstates; i++) {
yy = &xx_state[i];
if (yy->ss_deleted)
continue;
printf(" Subset %d", i);
if (yy->ss_finish)
printf(" (finish)");
printf(" { ");
for (ibit = 0; ibit < nbits; ibit++) {
if (testbit(yy->ss_ss, ibit))
printf("%d ", ibit);
}
printf("}\n");
for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
zz = &yy->ss_arc[iarc];
printf(" Arc to state %d, label %s\n",
zz->sa_arrow,
PyGrammar_LabelRepr(
&ll->ll_label[zz->sa_label]));
}
}
}
/* PART THREE -- SIMPLIFY DFA */
/* Simplify the DFA by repeatedly eliminating states that are
equivalent to another oner. This is NOT Algorithm 3.3 from
[Aho&Ullman 77]. It does not always finds the minimal DFA,
but it does usually make a much smaller one... (For an example
of sub-optimal behavior, try S: x a b+ | y a b+.)
*/
static int
samestate(ss_state *s1, ss_state *s2)
{
int i;
if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
return 0;
for (i = 0; i < s1->ss_narcs; i++) {
if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
return 0;
}
return 1;
}
static void
renamestates(int xx_nstates, ss_state *xx_state, int from, int to)
{
int i, j;
if (Py_DebugFlag)
printf("Rename state %d to %d.\n", from, to);
for (i = 0; i < xx_nstates; i++) {
if (xx_state[i].ss_deleted)
continue;
for (j = 0; j < xx_state[i].ss_narcs; j++) {
if (xx_state[i].ss_arc[j].sa_arrow == from)
xx_state[i].ss_arc[j].sa_arrow = to;
}
}
}
static void
simplify(int xx_nstates, ss_state *xx_state)
{
int changes;
int i, j;
do {
changes = 0;
for (i = 1; i < xx_nstates; i++) {
if (xx_state[i].ss_deleted)
continue;
for (j = 0; j < i; j++) {
if (xx_state[j].ss_deleted)
continue;
if (samestate(&xx_state[i], &xx_state[j])) {
xx_state[i].ss_deleted++;
renamestates(xx_nstates, xx_state,
i, j);
changes++;
break;
}
}
}
} while (changes);
}
/* PART FOUR -- GENERATE PARSING TABLES */
/* Convert the DFA into a grammar that can be used by our parser */
static void
convert(dfa *d, int xx_nstates, ss_state *xx_state)
{
int i, j;
ss_state *yy;
ss_arc *zz;
for (i = 0; i < xx_nstates; i++) {
yy = &xx_state[i];
if (yy->ss_deleted)
continue;
yy->ss_rename = addstate(d);
}
for (i = 0; i < xx_nstates; i++) {
yy = &xx_state[i];
if (yy->ss_deleted)
continue;
for (j = 0; j < yy->ss_narcs; j++) {
zz = &yy->ss_arc[j];
addarc(d, yy->ss_rename,
xx_state[zz->sa_arrow].ss_rename,
zz->sa_label);
}
if (yy->ss_finish)
addarc(d, yy->ss_rename, yy->ss_rename, 0);
}
d->d_initial = 0;
}
/* PART FIVE -- GLUE IT ALL TOGETHER */
static grammar *
maketables(nfagrammar *gr)
{
int i;
nfa *nf;
dfa *d;
grammar *g;
if (gr->gr_nnfas == 0)
return NULL;
g = newgrammar(gr->gr_nfa[0]->nf_type);
/* XXX first rule must be start rule */
g->g_ll = gr->gr_ll;
for (i = 0; i < gr->gr_nnfas; i++) {
nf = gr->gr_nfa[i];
if (Py_DebugFlag) {
printf("Dump of NFA for '%s' ...\n", nf->nf_name);
dumpnfa(&gr->gr_ll, nf);
printf("Making DFA for '%s' ...\n", nf->nf_name);
}
d = adddfa(g, nf->nf_type, nf->nf_name);
makedfa(gr, gr->gr_nfa[i], d);
}
return g;
}
grammar *
pgen(node *n)
{
nfagrammar *gr;
grammar *g;
gr = metacompile(n);
g = maketables(gr);
translatelabels(g);
addfirstsets(g);
freenfagrammar(gr);
return g;
}
grammar *
Py_pgen(node *n)
{
return pgen(n);
}
/*
Description
-----------
Input is a grammar in extended BNF (using * for repetition, + for
at-least-once repetition, [] for optional parts, | for alternatives and
() for grouping). This has already been parsed and turned into a parse
tree.
Each rule is considered as a regular expression in its own right.
It is turned into a Non-deterministic Finite Automaton (NFA), which
is then turned into a Deterministic Finite Automaton (DFA), which is then
optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
or similar compiler books (this technique is more often used for lexical
analyzers).
The DFA's are used by the parser as parsing tables in a special way
that's probably unique. Before they are usable, the FIRST sets of all
non-terminals are computed.
Reference
---------
[Aho&Ullman 77]
Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
(first edition)
*/

0
Parser/pgen/__init__.py Normal file
View File

34
Parser/pgen/__main__.py Normal file
View File

@ -0,0 +1,34 @@
import argparse
from .pgen import ParserGenerator
def main():
parser = argparse.ArgumentParser(description="Parser generator main program.")
parser.add_argument(
"grammar", type=str, help="The file with the grammar definition in EBNF format"
)
parser.add_argument(
"tokens", type=str, help="The file with the token definitions"
)
parser.add_argument(
"graminit_h",
type=argparse.FileType('w'),
help="The path to write the grammar's non-terminals as #defines",
)
parser.add_argument(
"graminit_c",
type=argparse.FileType('w'),
help="The path to write the grammar as initialized data",
)
parser.add_argument("--verbose", "-v", action="count")
args = parser.parse_args()
p = ParserGenerator(args.grammar, args.tokens, verbose=args.verbose)
grammar = p.make_grammar()
grammar.produce_graminit_h(args.graminit_h.write)
grammar.produce_graminit_c(args.graminit_c.write)
if __name__ == "__main__":
main()

160
Parser/pgen/grammar.py Normal file
View File

@ -0,0 +1,160 @@
import collections
class Grammar:
"""Pgen parsing tables conversion class.
Once initialized, this class supplies the grammar tables for the
parsing engine implemented by parse.py. The parsing engine
accesses the instance variables directly. The class here does not
provide initialization of the tables; several subclasses exist to
do this (see the conv and pgen modules).
The load() method reads the tables from a pickle file, which is
much faster than the other ways offered by subclasses. The pickle
file is written by calling dump() (after loading the grammar
tables using a subclass). The report() method prints a readable
representation of the tables to stdout, for debugging.
The instance variables are as follows:
symbol2number -- a dict mapping symbol names to numbers. Symbol
numbers are always 256 or higher, to distinguish
them from token numbers, which are between 0 and
255 (inclusive).
number2symbol -- a dict mapping numbers to symbol names;
these two are each other's inverse.
states -- a list of DFAs, where each DFA is a list of
states, each state is a list of arcs, and each
arc is a (i, j) pair where i is a label and j is
a state number. The DFA number is the index into
this list. (This name is slightly confusing.)
Final states are represented by a special arc of
the form (0, j) where j is its own state number.
dfas -- a dict mapping symbol numbers to (DFA, first)
pairs, where DFA is an item from the states list
above, and first is a set of tokens that can
begin this grammar rule (represented by a dict
whose values are always 1).
labels -- a list of (x, y) pairs where x is either a token
number or a symbol number, and y is either None
or a string; the strings are keywords. The label
number is the index in this list; label numbers
are used to mark state transitions (arcs) in the
DFAs.
start -- the number of the grammar's start symbol.
keywords -- a dict mapping keyword strings to arc labels.
tokens -- a dict mapping token numbers to arc labels.
"""
def __init__(self):
self.symbol2number = collections.OrderedDict()
self.number2symbol = collections.OrderedDict()
self.states = []
self.dfas = collections.OrderedDict()
self.labels = [(0, "EMPTY")]
self.keywords = collections.OrderedDict()
self.tokens = collections.OrderedDict()
self.symbol2label = collections.OrderedDict()
self.start = 256
def produce_graminit_h(self, writer):
writer("/* Generated by Parser/pgen */\n\n")
for number, symbol in self.number2symbol.items():
writer("#define {} {}\n".format(symbol, number))
def produce_graminit_c(self, writer):
writer("/* Generated by Parser/pgen */\n\n")
writer('#include "pgenheaders.h"\n')
writer('#include "grammar.h"\n')
writer("grammar _PyParser_Grammar;\n")
self.print_dfas(writer)
self.print_labels(writer)
writer("grammar _PyParser_Grammar = {\n")
writer(" {n_dfas},\n".format(n_dfas=len(self.dfas)))
writer(" dfas,\n")
writer(" {{{n_labels}, labels}},\n".format(n_labels=len(self.labels)))
writer(" {start_number}\n".format(start_number=self.start))
writer("};\n")
def print_labels(self, writer):
writer(
"static label labels[{n_labels}] = {{\n".format(n_labels=len(self.labels))
)
for label, name in self.labels:
if name is None:
writer(" {{{label}, 0}},\n".format(label=label))
else:
writer(
' {{{label}, "{label_name}"}},\n'.format(
label=label, label_name=name
)
)
writer("};\n")
def print_dfas(self, writer):
self.print_states(writer)
writer("static dfa dfas[{}] = {{\n".format(len(self.dfas)))
for dfaindex, dfa_elem in enumerate(self.dfas.items()):
symbol, (dfa, first_sets) = dfa_elem
writer(
' {{{dfa_symbol}, "{symbol_name}", '.format(
dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]
)
+ "0, {n_states}, states_{dfa_index},\n".format(
n_states=len(dfa), dfa_index=dfaindex
)
)
writer(' "')
k = [name for label, name in self.labels if label in first_sets]
bitset = bytearray((len(self.labels) >> 3) + 1)
for token in first_sets:
bitset[token >> 3] |= 1 << (token & 7)
for byte in bitset:
writer("\\%03o" % (byte & 0xFF))
writer('"},\n')
writer("};\n")
def print_states(self, write):
for dfaindex, dfa in enumerate(self.states):
self.print_arcs(write, dfaindex, dfa)
write(
"static state states_{dfa_index}[{n_states}] = {{\n".format(
dfa_index=dfaindex, n_states=len(dfa)
)
)
for stateindex, state in enumerate(dfa):
narcs = len(state)
write(
" {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format(
n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex
)
)
write("};\n")
def print_arcs(self, write, dfaindex, states):
for stateindex, state in enumerate(states):
narcs = len(state)
write(
"static arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format(
dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs
)
)
for a, b in state:
write(
" {{{from_label}, {to_state}}},\n".format(
from_label=a, to_state=b
)
)
write("};\n")

408
Parser/pgen/pgen.py Normal file
View File

@ -0,0 +1,408 @@
import collections
import tokenize # from stdlib
from . import grammar, token
class ParserGenerator(object):
def __init__(self, grammar_file, token_file, stream=None, verbose=False):
close_stream = None
if stream is None:
stream = open(grammar_file)
close_stream = stream.close
with open(token_file) as tok_file:
token_lines = tok_file.readlines()
self.tokens = dict(token.generate_tokens(token_lines))
self.opmap = dict(token.generate_opmap(token_lines))
# Manually add <> so it does not collide with !=
self.opmap['<>'] = "NOTEQUAL"
self.verbose = verbose
self.filename = grammar_file
self.stream = stream
self.generator = tokenize.generate_tokens(stream.readline)
self.gettoken() # Initialize lookahead
self.dfas, self.startsymbol = self.parse()
if close_stream is not None:
close_stream()
self.first = {} # map from symbol name to set of tokens
self.addfirstsets()
def make_grammar(self):
c = grammar.Grammar()
names = list(self.dfas.keys())
names.remove(self.startsymbol)
names.insert(0, self.startsymbol)
for name in names:
i = 256 + len(c.symbol2number)
c.symbol2number[name] = i
c.number2symbol[i] = name
for name in names:
self.make_label(c, name)
dfa = self.dfas[name]
states = []
for state in dfa:
arcs = []
for label, next in sorted(state.arcs.items()):
arcs.append((self.make_label(c, label), dfa.index(next)))
if state.isfinal:
arcs.append((0, dfa.index(state)))
states.append(arcs)
c.states.append(states)
c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
c.start = c.symbol2number[self.startsymbol]
if self.verbose:
print("")
print("Grammar summary")
print("===============")
print("- {n_labels} labels".format(n_labels=len(c.labels)))
print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
print(
"- Start symbol: {start_symbol}".format(
start_symbol=c.number2symbol[c.start]
)
)
return c
def make_first(self, c, name):
rawfirst = self.first[name]
first = set()
for label in sorted(rawfirst):
ilabel = self.make_label(c, label)
##assert ilabel not in first # XXX failed on <> ... !=
first.add(ilabel)
return first
def make_label(self, c, label):
# XXX Maybe this should be a method on a subclass of converter?
ilabel = len(c.labels)
if label[0].isalpha():
# Either a symbol name or a named token
if label in c.symbol2number:
# A symbol name (a non-terminal)
if label in c.symbol2label:
return c.symbol2label[label]
else:
c.labels.append((c.symbol2number[label], None))
c.symbol2label[label] = ilabel
return ilabel
else:
# A named token (NAME, NUMBER, STRING)
itoken = self.tokens.get(label, None)
assert isinstance(itoken, int), label
assert itoken in self.tokens.values(), label
if itoken in c.tokens:
return c.tokens[itoken]
else:
c.labels.append((itoken, None))
c.tokens[itoken] = ilabel
return ilabel
else:
# Either a keyword or an operator
assert label[0] in ('"', "'"), label
value = eval(label)
if value[0].isalpha():
# A keyword
if value in c.keywords:
return c.keywords[value]
else:
c.labels.append((self.tokens["NAME"], value))
c.keywords[value] = ilabel
return ilabel
else:
# An operator (any non-numeric token)
tok_name = self.opmap[value] # Fails if unknown token
itoken = self.tokens[tok_name]
if itoken in c.tokens:
return c.tokens[itoken]
else:
c.labels.append((itoken, None))
c.tokens[itoken] = ilabel
return ilabel
def addfirstsets(self):
names = list(self.dfas.keys())
for name in names:
if name not in self.first:
self.calcfirst(name)
if self.verbose:
print("First set for {dfa_name}".format(dfa_name=name))
for item in self.first[name]:
print(" - {terminal}".format(terminal=item))
def calcfirst(self, name):
dfa = self.dfas[name]
self.first[name] = None # dummy to detect left recursion
state = dfa[0]
totalset = set()
overlapcheck = {}
for label, next in state.arcs.items():
if label in self.dfas:
if label in self.first:
fset = self.first[label]
if fset is None:
raise ValueError("recursion for rule %r" % name)
else:
self.calcfirst(label)
fset = self.first[label]
totalset.update(fset)
overlapcheck[label] = fset
else:
totalset.add(label)
overlapcheck[label] = {label}
inverse = {}
for label, itsfirst in overlapcheck.items():
for symbol in itsfirst:
if symbol in inverse:
raise ValueError("rule %s is ambiguous; %s is in the"
" first sets of %s as well as %s" %
(name, symbol, label, inverse[symbol]))
inverse[symbol] = label
self.first[name] = totalset
def parse(self):
dfas = collections.OrderedDict()
startsymbol = None
# MSTART: (NEWLINE | RULE)* ENDMARKER
while self.type != tokenize.ENDMARKER:
while self.type == tokenize.NEWLINE:
self.gettoken()
# RULE: NAME ':' RHS NEWLINE
name = self.expect(tokenize.NAME)
if self.verbose:
print("Processing rule {dfa_name}".format(dfa_name=name))
self.expect(tokenize.OP, ":")
a, z = self.parse_rhs()
self.expect(tokenize.NEWLINE)
if self.verbose:
self.dump_nfa(name, a, z)
dfa = self.make_dfa(a, z)
if self.verbose:
self.dump_dfa(name, dfa)
oldlen = len(dfa)
self.simplify_dfa(dfa)
newlen = len(dfa)
dfas[name] = dfa
#print name, oldlen, newlen
if startsymbol is None:
startsymbol = name
return dfas, startsymbol
def make_dfa(self, start, finish):
# To turn an NFA into a DFA, we define the states of the DFA
# to correspond to *sets* of states of the NFA. Then do some
# state reduction. Let's represent sets as dicts with 1 for
# values.
assert isinstance(start, NFAState)
assert isinstance(finish, NFAState)
def closure(state):
base = set()
addclosure(state, base)
return base
def addclosure(state, base):
assert isinstance(state, NFAState)
if state in base:
return
base.add(state)
for label, next in state.arcs:
if label is None:
addclosure(next, base)
states = [DFAState(closure(start), finish)]
for state in states: # NB states grows while we're iterating
arcs = {}
for nfastate in state.nfaset:
for label, next in nfastate.arcs:
if label is not None:
addclosure(next, arcs.setdefault(label, set()))
for label, nfaset in sorted(arcs.items()):
for st in states:
if st.nfaset == nfaset:
break
else:
st = DFAState(nfaset, finish)
states.append(st)
state.addarc(st, label)
return states # List of DFAState instances; first one is start
def dump_nfa(self, name, start, finish):
print("Dump of NFA for", name)
todo = [start]
for i, state in enumerate(todo):
print(" State", i, state is finish and "(final)" or "")
for label, next in state.arcs:
if next in todo:
j = todo.index(next)
else:
j = len(todo)
todo.append(next)
if label is None:
print(" -> %d" % j)
else:
print(" %s -> %d" % (label, j))
def dump_dfa(self, name, dfa):
print("Dump of DFA for", name)
for i, state in enumerate(dfa):
print(" State", i, state.isfinal and "(final)" or "")
for label, next in sorted(state.arcs.items()):
print(" %s -> %d" % (label, dfa.index(next)))
def simplify_dfa(self, dfa):
# This is not theoretically optimal, but works well enough.
# Algorithm: repeatedly look for two states that have the same
# set of arcs (same labels pointing to the same nodes) and
# unify them, until things stop changing.
# dfa is a list of DFAState instances
changes = True
while changes:
changes = False
for i, state_i in enumerate(dfa):
for j in range(i+1, len(dfa)):
state_j = dfa[j]
if state_i == state_j:
#print " unify", i, j
del dfa[j]
for state in dfa:
state.unifystate(state_j, state_i)
changes = True
break
def parse_rhs(self):
# RHS: ALT ('|' ALT)*
a, z = self.parse_alt()
if self.value != "|":
return a, z
else:
aa = NFAState()
zz = NFAState()
aa.addarc(a)
z.addarc(zz)
while self.value == "|":
self.gettoken()
a, z = self.parse_alt()
aa.addarc(a)
z.addarc(zz)
return aa, zz
def parse_alt(self):
# ALT: ITEM+
a, b = self.parse_item()
while (self.value in ("(", "[") or
self.type in (tokenize.NAME, tokenize.STRING)):
c, d = self.parse_item()
b.addarc(c)
b = d
return a, b
def parse_item(self):
# ITEM: '[' RHS ']' | ATOM ['+' | '*']
if self.value == "[":
self.gettoken()
a, z = self.parse_rhs()
self.expect(tokenize.OP, "]")
a.addarc(z)
return a, z
else:
a, z = self.parse_atom()
value = self.value
if value not in ("+", "*"):
return a, z
self.gettoken()
z.addarc(a)
if value == "+":
return a, z
else:
return a, a
def parse_atom(self):
# ATOM: '(' RHS ')' | NAME | STRING
if self.value == "(":
self.gettoken()
a, z = self.parse_rhs()
self.expect(tokenize.OP, ")")
return a, z
elif self.type in (tokenize.NAME, tokenize.STRING):
a = NFAState()
z = NFAState()
a.addarc(z, self.value)
self.gettoken()
return a, z
else:
self.raise_error("expected (...) or NAME or STRING, got %s/%s",
self.type, self.value)
def expect(self, type, value=None):
if self.type != type or (value is not None and self.value != value):
self.raise_error("expected %s/%s, got %s/%s",
type, value, self.type, self.value)
value = self.value
self.gettoken()
return value
def gettoken(self):
tup = next(self.generator)
while tup[0] in (tokenize.COMMENT, tokenize.NL):
tup = next(self.generator)
self.type, self.value, self.begin, self.end, self.line = tup
# print(getattr(tokenize, 'tok_name')[self.type], repr(self.value))
def raise_error(self, msg, *args):
if args:
try:
msg = msg % args
except:
msg = " ".join([msg] + list(map(str, args)))
raise SyntaxError(msg, (self.filename, self.end[0],
self.end[1], self.line))
class NFAState(object):
def __init__(self):
self.arcs = [] # list of (label, NFAState) pairs
def addarc(self, next, label=None):
assert label is None or isinstance(label, str)
assert isinstance(next, NFAState)
self.arcs.append((label, next))
class DFAState(object):
def __init__(self, nfaset, final):
assert isinstance(nfaset, set)
assert isinstance(next(iter(nfaset)), NFAState)
assert isinstance(final, NFAState)
self.nfaset = nfaset
self.isfinal = final in nfaset
self.arcs = {} # map from label to DFAState
def addarc(self, next, label):
assert isinstance(label, str)
assert label not in self.arcs
assert isinstance(next, DFAState)
self.arcs[label] = next
def unifystate(self, old, new):
for label, next in self.arcs.items():
if next is old:
self.arcs[label] = new
def __eq__(self, other):
# Equality test -- ignore the nfaset instance variable
assert isinstance(other, DFAState)
if self.isfinal != other.isfinal:
return False
# Can't just return self.arcs == other.arcs, because that
# would invoke this method recursively, with cycles...
if len(self.arcs) != len(other.arcs):
return False
for label, next in self.arcs.items():
if next is not other.arcs.get(label):
return False
return True
__hash__ = None # For Py3 compatibility.

40
Parser/pgen/token.py Normal file
View File

@ -0,0 +1,40 @@
import itertools
def generate_tokens(tokens):
numbers = itertools.count(0)
for line in tokens:
line = line.strip()
if not line:
continue
if line.strip().startswith('#'):
continue
name = line.split()[0]
yield (name, next(numbers))
yield ('N_TOKENS', next(numbers))
yield ('NT_OFFSET', 256)
def generate_opmap(tokens):
for line in tokens:
line = line.strip()
if not line:
continue
if line.strip().startswith('#'):
continue
pieces = line.split()
if len(pieces) != 2:
continue
name, op = pieces
yield (op.strip("'"), name)
# Yield independently <>. This is needed so it does not collide
# with the token generation in "generate_tokens" because if this
# symbol is included in Grammar/Tokens, it will collide with !=
# as it has the same name (NOTEQUAL).
yield ('<>', 'NOTEQUAL')

View File

@ -1,188 +0,0 @@
/* Parser generator main program */
/* This expects a filename containing the grammar as argv[1] (UNIX)
or asks the console for such a file name (THINK C).
It writes its output on two files in the current directory:
- "graminit.c" gets the grammar as a bunch of initialized data
- "graminit.h" gets the grammar's non-terminals as #defines.
Error messages and status info during the generation process are
written to stdout, or sometimes to stderr. */
/* XXX TO DO:
- check for duplicate definitions of names (instead of fatal err)
*/
#define PGEN
#include "Python.h"
#include "pycore_pymem.h"
#include "pycore_pystate.h"
#include "pgenheaders.h"
#include "grammar.h"
#include "node.h"
#include "parsetok.h"
#include "pgen.h"
int Py_DebugFlag = 0;
int Py_VerboseFlag = 0;
int Py_IgnoreEnvironmentFlag = 0;
_PyRuntimeState _PyRuntime = _PyRuntimeState_INIT;
/* Forward */
grammar *getgrammar(const char *filename);
void
Py_Exit(int sts)
{
exit(sts);
}
/* Needed by obmalloc.c */
int PyGILState_Check(void)
{ return 1; }
void _PyMem_DumpTraceback(int fd, const void *ptr)
{}
int
main(int argc, char **argv)
{
grammar *g;
FILE *fp;
char *filename, *graminit_h, *graminit_c;
if (argc != 4) {
fprintf(stderr,
"usage: %s grammar graminit.h graminit.c\n", argv[0]);
Py_Exit(2);
}
filename = argv[1];
graminit_h = argv[2];
graminit_c = argv[3];
g = getgrammar(filename);
fp = fopen(graminit_c, "w");
if (fp == NULL) {
perror(graminit_c);
Py_Exit(1);
}
if (Py_DebugFlag)
printf("Writing %s ...\n", graminit_c);
printgrammar(g, fp);
fclose(fp);
fp = fopen(graminit_h, "w");
if (fp == NULL) {
perror(graminit_h);
Py_Exit(1);
}
if (Py_DebugFlag)
printf("Writing %s ...\n", graminit_h);
printnonterminals(g, fp);
fclose(fp);
freegrammar(g);
Py_Exit(0);
return 0; /* Make gcc -Wall happy */
}
grammar *
getgrammar(const char *filename)
{
FILE *fp;
node *n;
grammar *g0, *g;
perrdetail err;
fp = fopen(filename, "r");
if (fp == NULL) {
perror(filename);
Py_Exit(1);
}
g0 = meta_grammar();
n = PyParser_ParseFile(fp, filename, g0, g0->g_start,
(char *)NULL, (char *)NULL, &err);
fclose(fp);
if (n == NULL) {
fprintf(stderr, "Parsing error %d, line %d.\n",
err.error, err.lineno);
if (err.text != NULL) {
size_t len;
int i;
fprintf(stderr, "%s", err.text);
len = strlen(err.text);
if (len == 0 || err.text[len-1] != '\n')
fprintf(stderr, "\n");
for (i = 0; i < err.offset; i++) {
if (err.text[i] == '\t')
putc('\t', stderr);
else
putc(' ', stderr);
}
fprintf(stderr, "^\n");
PyObject_FREE(err.text);
}
Py_Exit(1);
}
g = pgen(n);
PyNode_Free(n);
if (g == NULL) {
printf("Bad grammar.\n");
Py_Exit(1);
}
return g;
}
/* Can't happen in pgen */
PyObject*
PyErr_Occurred()
{
return 0;
}
void
Py_FatalError(const char *msg)
{
fprintf(stderr, "pgen: FATAL ERROR: %s\n", msg);
Py_Exit(1);
}
/* No-nonsense my_readline() for tokenizer.c */
char *
PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
{
size_t n = 1000;
char *p = (char *)PyMem_MALLOC(n);
char *q;
if (p == NULL)
return NULL;
fprintf(stderr, "%s", prompt);
q = fgets(p, n, sys_stdin);
if (q == NULL) {
*p = '\0';
return p;
}
n = strlen(p);
if (n > 0 && p[n-1] != '\n')
p[n-1] = '\n';
return (char *)PyMem_REALLOC(p, n+1);
}
/* No-nonsense fgets */
char *
Py_UniversalNewlineFgets(char *buf, int n, FILE *stream, PyObject *fobj)
{
return fgets(buf, n, stream);
}
#include <stdarg.h>
void
PySys_WriteStderr(const char *format, ...)
{
va_list va;
va_start(va, format);
vfprintf(stderr, format, va);
va_end(va);
}

View File

@ -1,120 +0,0 @@
/* Print a bunch of C initializers that represent a grammar */
#define PGEN
#include "pgenheaders.h"
#include "grammar.h"
/* Forward */
static void printarcs(int, dfa *, FILE *);
static void printstates(grammar *, FILE *);
static void printdfas(grammar *, FILE *);
static void printlabels(grammar *, FILE *);
void
printgrammar(grammar *g, FILE *fp)
{
fprintf(fp, "/* Generated by Parser/pgen */\n\n");
fprintf(fp, "#include \"pgenheaders.h\"\n");
fprintf(fp, "#include \"grammar.h\"\n");
fprintf(fp, "grammar _PyParser_Grammar;\n");
printdfas(g, fp);
printlabels(g, fp);
fprintf(fp, "grammar _PyParser_Grammar = {\n");
fprintf(fp, " %d,\n", g->g_ndfas);
fprintf(fp, " dfas,\n");
fprintf(fp, " {%d, labels},\n", g->g_ll.ll_nlabels);
fprintf(fp, " %d\n", g->g_start);
fprintf(fp, "};\n");
}
void
printnonterminals(grammar *g, FILE *fp)
{
dfa *d;
int i;
fprintf(fp, "/* Generated by Parser/pgen */\n\n");
d = g->g_dfa;
for (i = g->g_ndfas; --i >= 0; d++)
fprintf(fp, "#define %s %d\n", d->d_name, d->d_type);
}
static void
printarcs(int i, dfa *d, FILE *fp)
{
arc *a;
state *s;
int j, k;
s = d->d_state;
for (j = 0; j < d->d_nstates; j++, s++) {
fprintf(fp, "static arc arcs_%d_%d[%d] = {\n",
i, j, s->s_narcs);
a = s->s_arc;
for (k = 0; k < s->s_narcs; k++, a++)
fprintf(fp, " {%d, %d},\n", a->a_lbl, a->a_arrow);
fprintf(fp, "};\n");
}
}
static void
printstates(grammar *g, FILE *fp)
{
state *s;
dfa *d;
int i, j;
d = g->g_dfa;
for (i = 0; i < g->g_ndfas; i++, d++) {
printarcs(i, d, fp);
fprintf(fp, "static state states_%d[%d] = {\n",
i, d->d_nstates);
s = d->d_state;
for (j = 0; j < d->d_nstates; j++, s++)
fprintf(fp, " {%d, arcs_%d_%d},\n",
s->s_narcs, i, j);
fprintf(fp, "};\n");
}
}
static void
printdfas(grammar *g, FILE *fp)
{
dfa *d;
int i, j, n;
printstates(g, fp);
fprintf(fp, "static dfa dfas[%d] = {\n", g->g_ndfas);
d = g->g_dfa;
for (i = 0; i < g->g_ndfas; i++, d++) {
fprintf(fp, " {%d, \"%s\", %d, %d, states_%d,\n",
d->d_type, d->d_name, d->d_initial, d->d_nstates, i);
fprintf(fp, " \"");
n = NBYTES(g->g_ll.ll_nlabels);
for (j = 0; j < n; j++)
fprintf(fp, "\\%03o", d->d_first[j] & 0xff);
fprintf(fp, "\"},\n");
}
fprintf(fp, "};\n");
}
static void
printlabels(grammar *g, FILE *fp)
{
label *l;
int i;
fprintf(fp, "static label labels[%d] = {\n", g->g_ll.ll_nlabels);
l = g->g_ll.ll_label;
for (i = g->g_ll.ll_nlabels; --i >= 0; l++) {
if (l->lb_str == NULL)
fprintf(fp, " {%d, 0},\n", l->lb_type);
else
fprintf(fp, " {%d, \"%s\"},\n",
l->lb_type, l->lb_str);
}
fprintf(fp, "};\n");
}

View File

@ -10,13 +10,11 @@
#include "tokenizer.h"
#include "errcode.h"
#ifndef PGEN
#include "unicodeobject.h"
#include "bytesobject.h"
#include "fileobject.h"
#include "codecs.h"
#include "abstract.h"
#endif /* PGEN */
/* Alternate tab spacing */
#define ALTTABSIZE 1
@ -81,11 +79,9 @@ tok_new(void)
tok->enc = NULL;
tok->encoding = NULL;
tok->cont_line = 0;
#ifndef PGEN
tok->filename = NULL;
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
#endif
tok->type_comments = 0;
return tok;
@ -104,28 +100,6 @@ new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
return result;
}
#ifdef PGEN
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
return fgets(s, size, tok->fp);
}
static int
decoding_feof(struct tok_state *tok)
{
return feof(tok->fp);
}
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
return new_string(str, strlen(str), tok);
}
#else /* PGEN */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
@ -551,7 +525,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
return error_ret(tok);
}
}
#ifndef PGEN
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (line && !tok->encoding) {
@ -574,7 +547,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
badchar, tok->filename, tok->lineno + 1);
return error_ret(tok);
}
#endif
return line;
}
@ -738,8 +710,6 @@ decode_str(const char *input, int single, struct tok_state *tok)
return str;
}
#endif /* PGEN */
/* Set up tokenizer for string */
struct tok_state *
@ -765,9 +735,7 @@ PyTokenizer_FromUTF8(const char *str, int exec_input)
struct tok_state *tok = tok_new();
if (tok == NULL)
return NULL;
#ifndef PGEN
tok->input = str = translate_newlines(str, exec_input, tok);
#endif
if (str == NULL) {
PyTokenizer_Free(tok);
return NULL;
@ -828,11 +796,9 @@ PyTokenizer_Free(struct tok_state *tok)
{
if (tok->encoding != NULL)
PyMem_FREE(tok->encoding);
#ifndef PGEN
Py_XDECREF(tok->decoding_readline);
Py_XDECREF(tok->decoding_buffer);
Py_XDECREF(tok->filename);
#endif
if (tok->fp != NULL && tok->buf != NULL)
PyMem_FREE(tok->buf);
if (tok->input)
@ -871,7 +837,6 @@ tok_nextc(struct tok_state *tok)
}
if (tok->prompt != NULL) {
char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
#ifndef PGEN
if (newtok != NULL) {
char *translated = translate_newlines(newtok, 0, tok);
PyMem_FREE(newtok);
@ -900,7 +865,6 @@ tok_nextc(struct tok_state *tok)
strcpy(newtok, buf);
Py_DECREF(u);
}
#endif
if (tok->nextprompt != NULL)
tok->prompt = tok->nextprompt;
if (newtok == NULL)
@ -1056,7 +1020,6 @@ tok_backup(struct tok_state *tok, int c)
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
#ifndef PGEN
va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
va_start(vargs, format);
@ -1069,9 +1032,6 @@ syntaxerror(struct tok_state *tok, const char *format, ...)
tok->lineno,
(int)(tok->cur - tok->line_start));
tok->done = E_ERROR;
#else
tok->done = E_TOKEN;
#endif
return ERRORTOKEN;
}
@ -1083,9 +1043,6 @@ indenterror(struct tok_state *tok)
return ERRORTOKEN;
}
#ifdef PGEN
#define verify_identifier(tok) 1
#else
/* Verify that the identifier follows PEP 3131.
All identifier strings are guaranteed to be "ready" unicode objects.
*/
@ -1112,7 +1069,6 @@ verify_identifier(struct tok_state *tok)
tok->done = E_IDENTIFIER;
return result;
}
#endif
static int
tok_decimal_tail(struct tok_state *tok)
@ -1667,25 +1623,20 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
case '(':
case '[':
case '{':
#ifndef PGEN
if (tok->level >= MAXLEVEL) {
return syntaxerror(tok, "too many nested parentheses");
}
tok->parenstack[tok->level] = c;
tok->parenlinenostack[tok->level] = tok->lineno;
#endif
tok->level++;
break;
case ')':
case ']':
case '}':
#ifndef PGEN
if (!tok->level) {
return syntaxerror(tok, "unmatched '%c'", c);
}
#endif
tok->level--;
#ifndef PGEN
int opening = tok->parenstack[tok->level];
if (!((opening == '(' && c == ')') ||
(opening == '[' && c == ']') ||
@ -1704,7 +1655,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
c, opening);
}
}
#endif
break;
}
@ -1742,11 +1692,7 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
FILE *fp;
char *p_start =NULL , *p_end =NULL , *encoding = NULL;
#ifndef PGEN
fd = _Py_dup(fd);
#else
fd = dup(fd);
#endif
if (fd < 0) {
return NULL;
}
@ -1760,7 +1706,6 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
fclose(fp);
return NULL;
}
#ifndef PGEN
if (filename != NULL) {
Py_INCREF(filename);
tok->filename = filename;
@ -1773,7 +1718,6 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
return encoding;
}
}
#endif
while (tok->lineno < 2 && tok->done == E_OK) {
PyTokenizer_Get(tok, &p_start, &p_end);
}

View File

@ -42,15 +42,9 @@ struct tok_state {
expression (cf. issue 16806) */
int level; /* () [] {} Parentheses nesting level */
/* Used to allow free continuations inside them */
#ifndef PGEN
char parenstack[MAXLEVEL];
int parenlinenostack[MAXLEVEL];
/* pgen doesn't have access to Python codecs, it cannot decode the input
filename. The bytes filename might be kept, but it is only used by
indenterror() and it is not really needed: pgen only compiles one file
(Grammar/Grammar). */
PyObject *filename;
#endif
/* Stuff for checking on different tab sizes */
int altindstack[MAXINDENT]; /* Stack of alternate indents */
/* Stuff for PEP 0263 */
@ -63,10 +57,8 @@ struct tok_state {
const char* multi_line_start; /* pointer to start of first line of
a single line or multi line string
expression (cf. issue 16806) */
#ifndef PGEN
PyObject *decoding_readline; /* open(...).readline */
PyObject *decoding_buffer;
#endif
const char* enc; /* Encoding for the current str. */
const char* str;
const char* input; /* Tokenizer's newline translated copy of the string. */

View File

@ -1,2 +0,0 @@
#define PGEN
#include "tokenizer.c"

File diff suppressed because it is too large Load Diff