mirror of https://github.com/python/cpython
690 lines
15 KiB
C
690 lines
15 KiB
C
/*
|
||
XXX support range parameter on search
|
||
XXX support mstop parameter on search
|
||
*/
|
||
|
||
|
||
/* Regular expression objects */
|
||
/* This uses Tatu Ylonen's copyleft-free reimplementation of
|
||
GNU regular expressions */
|
||
|
||
#include "Python.h"
|
||
|
||
#include <ctype.h>
|
||
|
||
#include "regexpr.h"
|
||
|
||
static PyObject *RegexError; /* Exception */
|
||
|
||
typedef struct {
|
||
PyObject_HEAD
|
||
struct re_pattern_buffer re_patbuf; /* The compiled expression */
|
||
struct re_registers re_regs; /* The registers from the last match */
|
||
char re_fastmap[256]; /* Storage for fastmap */
|
||
PyObject *re_translate; /* String object for translate table */
|
||
PyObject *re_lastok; /* String object last matched/searched */
|
||
PyObject *re_groupindex; /* Group name to index dictionary */
|
||
PyObject *re_givenpat; /* Pattern with symbolic groups */
|
||
PyObject *re_realpat; /* Pattern without symbolic groups */
|
||
} regexobject;
|
||
|
||
/* Regex object methods */
|
||
|
||
static void
|
||
reg_dealloc(regexobject *re)
|
||
{
|
||
if (re->re_patbuf.buffer)
|
||
free(re->re_patbuf.buffer);
|
||
Py_XDECREF(re->re_translate);
|
||
Py_XDECREF(re->re_lastok);
|
||
Py_XDECREF(re->re_groupindex);
|
||
Py_XDECREF(re->re_givenpat);
|
||
Py_XDECREF(re->re_realpat);
|
||
PyObject_Del(re);
|
||
}
|
||
|
||
static PyObject *
|
||
makeresult(struct re_registers *regs)
|
||
{
|
||
PyObject *v;
|
||
int i;
|
||
static PyObject *filler = NULL;
|
||
|
||
if (filler == NULL) {
|
||
filler = Py_BuildValue("(ii)", -1, -1);
|
||
if (filler == NULL)
|
||
return NULL;
|
||
}
|
||
v = PyTuple_New(RE_NREGS);
|
||
if (v == NULL)
|
||
return NULL;
|
||
|
||
for (i = 0; i < RE_NREGS; i++) {
|
||
int lo = regs->start[i];
|
||
int hi = regs->end[i];
|
||
PyObject *w;
|
||
if (lo == -1 && hi == -1) {
|
||
w = filler;
|
||
Py_INCREF(w);
|
||
}
|
||
else
|
||
w = Py_BuildValue("(ii)", lo, hi);
|
||
if (w == NULL || PyTuple_SetItem(v, i, w) < 0) {
|
||
Py_DECREF(v);
|
||
return NULL;
|
||
}
|
||
}
|
||
return v;
|
||
}
|
||
|
||
static PyObject *
|
||
regobj_match(regexobject *re, PyObject *args)
|
||
{
|
||
PyObject *argstring;
|
||
char *buffer;
|
||
int size;
|
||
int offset = 0;
|
||
int result;
|
||
|
||
if (!PyArg_ParseTuple(args, "O|i:match", &argstring, &offset))
|
||
return NULL;
|
||
if (!PyArg_Parse(argstring, "t#", &buffer, &size))
|
||
return NULL;
|
||
|
||
if (offset < 0 || offset > size) {
|
||
PyErr_SetString(RegexError, "match offset out of range");
|
||
return NULL;
|
||
}
|
||
Py_XDECREF(re->re_lastok);
|
||
re->re_lastok = NULL;
|
||
result = _Py_re_match(&re->re_patbuf, (unsigned char *)buffer, size, offset,
|
||
&re->re_regs);
|
||
if (result < -1) {
|
||
/* Serious failure of some sort; if re_match didn't
|
||
set an exception, raise a generic error */
|
||
if (!PyErr_Occurred())
|
||
PyErr_SetString(RegexError, "match failure");
|
||
return NULL;
|
||
}
|
||
if (result >= 0) {
|
||
Py_INCREF(argstring);
|
||
re->re_lastok = argstring;
|
||
}
|
||
return PyInt_FromLong((long)result); /* Length of the match or -1 */
|
||
}
|
||
|
||
static PyObject *
|
||
regobj_search(regexobject *re, PyObject *args)
|
||
{
|
||
PyObject *argstring;
|
||
char *buffer;
|
||
int size;
|
||
int offset = 0;
|
||
int range;
|
||
int result;
|
||
|
||
if (!PyArg_ParseTuple(args, "O|i:search", &argstring, &offset))
|
||
return NULL;
|
||
if (!PyArg_Parse(argstring, "t#:search", &buffer, &size))
|
||
return NULL;
|
||
|
||
if (offset < 0 || offset > size) {
|
||
PyErr_SetString(RegexError, "search offset out of range");
|
||
return NULL;
|
||
}
|
||
/* NB: In Emacs 18.57, the documentation for re_search[_2] and
|
||
the implementation don't match: the documentation states that
|
||
|range| positions are tried, while the code tries |range|+1
|
||
positions. It seems more productive to believe the code! */
|
||
range = size - offset;
|
||
Py_XDECREF(re->re_lastok);
|
||
re->re_lastok = NULL;
|
||
result = _Py_re_search(&re->re_patbuf, (unsigned char *)buffer, size, offset, range,
|
||
&re->re_regs);
|
||
if (result < -1) {
|
||
/* Serious failure of some sort; if re_match didn't
|
||
set an exception, raise a generic error */
|
||
if (!PyErr_Occurred())
|
||
PyErr_SetString(RegexError, "match failure");
|
||
return NULL;
|
||
}
|
||
if (result >= 0) {
|
||
Py_INCREF(argstring);
|
||
re->re_lastok = argstring;
|
||
}
|
||
return PyInt_FromLong((long)result); /* Position of the match or -1 */
|
||
}
|
||
|
||
/* get the group from the regex where index can be a string (group name) or
|
||
an integer index [0 .. 99]
|
||
*/
|
||
static PyObject*
|
||
group_from_index(regexobject *re, PyObject *index)
|
||
{
|
||
int i, a, b;
|
||
char *v;
|
||
|
||
if (PyString_Check(index))
|
||
if (re->re_groupindex == NULL ||
|
||
!(index = PyDict_GetItem(re->re_groupindex, index)))
|
||
{
|
||
PyErr_SetString(RegexError,
|
||
"group() group name doesn't exist");
|
||
return NULL;
|
||
}
|
||
|
||
i = PyInt_AsLong(index);
|
||
if (i == -1 && PyErr_Occurred())
|
||
return NULL;
|
||
|
||
if (i < 0 || i >= RE_NREGS) {
|
||
PyErr_SetString(RegexError, "group() index out of range");
|
||
return NULL;
|
||
}
|
||
if (re->re_lastok == NULL) {
|
||
PyErr_SetString(RegexError,
|
||
"group() only valid after successful match/search");
|
||
return NULL;
|
||
}
|
||
a = re->re_regs.start[i];
|
||
b = re->re_regs.end[i];
|
||
if (a < 0 || b < 0) {
|
||
Py_INCREF(Py_None);
|
||
return Py_None;
|
||
}
|
||
|
||
if (!(v = PyString_AsString(re->re_lastok)))
|
||
return NULL;
|
||
|
||
return PyString_FromStringAndSize(v+a, b-a);
|
||
}
|
||
|
||
|
||
static PyObject *
|
||
regobj_group(regexobject *re, PyObject *args)
|
||
{
|
||
int n = PyTuple_Size(args);
|
||
int i;
|
||
PyObject *res = NULL;
|
||
|
||
if (n < 0)
|
||
return NULL;
|
||
if (n == 0) {
|
||
PyErr_SetString(PyExc_TypeError, "not enough arguments");
|
||
return NULL;
|
||
}
|
||
if (n == 1) {
|
||
/* return value is a single string */
|
||
PyObject *index = PyTuple_GetItem(args, 0);
|
||
if (!index)
|
||
return NULL;
|
||
|
||
return group_from_index(re, index);
|
||
}
|
||
|
||
/* return value is a tuple */
|
||
if (!(res = PyTuple_New(n)))
|
||
return NULL;
|
||
|
||
for (i = 0; i < n; i++) {
|
||
PyObject *index = PyTuple_GetItem(args, i);
|
||
PyObject *group = NULL;
|
||
|
||
if (!index)
|
||
goto finally;
|
||
if (!(group = group_from_index(re, index)))
|
||
goto finally;
|
||
if (PyTuple_SetItem(res, i, group) < 0)
|
||
goto finally;
|
||
}
|
||
return res;
|
||
|
||
finally:
|
||
Py_DECREF(res);
|
||
return NULL;
|
||
}
|
||
|
||
|
||
static struct PyMethodDef reg_methods[] = {
|
||
{"match", (PyCFunction)regobj_match, METH_VARARGS},
|
||
{"search", (PyCFunction)regobj_search, METH_VARARGS},
|
||
{"group", (PyCFunction)regobj_group, METH_VARARGS},
|
||
{NULL, NULL} /* sentinel */
|
||
};
|
||
|
||
|
||
|
||
static char* members[] = {
|
||
"last", "regs", "translate",
|
||
"groupindex", "realpat", "givenpat",
|
||
NULL
|
||
};
|
||
|
||
|
||
static PyObject *
|
||
regobj_getattr(regexobject *re, char *name)
|
||
{
|
||
if (strcmp(name, "regs") == 0) {
|
||
if (re->re_lastok == NULL) {
|
||
Py_INCREF(Py_None);
|
||
return Py_None;
|
||
}
|
||
return makeresult(&re->re_regs);
|
||
}
|
||
if (strcmp(name, "last") == 0) {
|
||
if (re->re_lastok == NULL) {
|
||
Py_INCREF(Py_None);
|
||
return Py_None;
|
||
}
|
||
Py_INCREF(re->re_lastok);
|
||
return re->re_lastok;
|
||
}
|
||
if (strcmp(name, "translate") == 0) {
|
||
if (re->re_translate == NULL) {
|
||
Py_INCREF(Py_None);
|
||
return Py_None;
|
||
}
|
||
Py_INCREF(re->re_translate);
|
||
return re->re_translate;
|
||
}
|
||
if (strcmp(name, "groupindex") == 0) {
|
||
if (re->re_groupindex == NULL) {
|
||
Py_INCREF(Py_None);
|
||
return Py_None;
|
||
}
|
||
Py_INCREF(re->re_groupindex);
|
||
return re->re_groupindex;
|
||
}
|
||
if (strcmp(name, "realpat") == 0) {
|
||
if (re->re_realpat == NULL) {
|
||
Py_INCREF(Py_None);
|
||
return Py_None;
|
||
}
|
||
Py_INCREF(re->re_realpat);
|
||
return re->re_realpat;
|
||
}
|
||
if (strcmp(name, "givenpat") == 0) {
|
||
if (re->re_givenpat == NULL) {
|
||
Py_INCREF(Py_None);
|
||
return Py_None;
|
||
}
|
||
Py_INCREF(re->re_givenpat);
|
||
return re->re_givenpat;
|
||
}
|
||
if (strcmp(name, "__members__") == 0) {
|
||
int i = 0;
|
||
PyObject *list = NULL;
|
||
|
||
/* okay, so it's unlikely this list will change that often.
|
||
still, it's easier to change it in just one place.
|
||
*/
|
||
while (members[i])
|
||
i++;
|
||
if (!(list = PyList_New(i)))
|
||
return NULL;
|
||
|
||
i = 0;
|
||
while (members[i]) {
|
||
PyObject* v = PyString_FromString(members[i]);
|
||
if (!v || PyList_SetItem(list, i, v) < 0) {
|
||
Py_DECREF(list);
|
||
return NULL;
|
||
}
|
||
i++;
|
||
}
|
||
return list;
|
||
}
|
||
return Py_FindMethod(reg_methods, (PyObject *)re, name);
|
||
}
|
||
|
||
static PyTypeObject Regextype = {
|
||
PyObject_HEAD_INIT(NULL)
|
||
0, /*ob_size*/
|
||
"regex.regex", /*tp_name*/
|
||
sizeof(regexobject), /*tp_size*/
|
||
0, /*tp_itemsize*/
|
||
/* methods */
|
||
(destructor)reg_dealloc, /*tp_dealloc*/
|
||
0, /*tp_print*/
|
||
(getattrfunc)regobj_getattr, /*tp_getattr*/
|
||
0, /*tp_setattr*/
|
||
0, /*tp_compare*/
|
||
0, /*tp_repr*/
|
||
};
|
||
|
||
/* reference counting invariants:
|
||
pattern: borrowed
|
||
translate: borrowed
|
||
givenpat: borrowed
|
||
groupindex: transferred
|
||
*/
|
||
static PyObject *
|
||
newregexobject(PyObject *pattern, PyObject *translate, PyObject *givenpat, PyObject *groupindex)
|
||
{
|
||
regexobject *re;
|
||
char *pat;
|
||
int size;
|
||
|
||
if (!PyArg_Parse(pattern, "t#", &pat, &size))
|
||
return NULL;
|
||
|
||
if (translate != NULL && PyString_Size(translate) != 256) {
|
||
PyErr_SetString(RegexError,
|
||
"translation table must be 256 bytes");
|
||
return NULL;
|
||
}
|
||
re = PyObject_New(regexobject, &Regextype);
|
||
if (re != NULL) {
|
||
char *error;
|
||
re->re_patbuf.buffer = NULL;
|
||
re->re_patbuf.allocated = 0;
|
||
re->re_patbuf.fastmap = (unsigned char *)re->re_fastmap;
|
||
if (translate) {
|
||
re->re_patbuf.translate = (unsigned char *)PyString_AsString(translate);
|
||
if (!re->re_patbuf.translate)
|
||
goto finally;
|
||
Py_INCREF(translate);
|
||
}
|
||
else
|
||
re->re_patbuf.translate = NULL;
|
||
re->re_translate = translate;
|
||
re->re_lastok = NULL;
|
||
re->re_groupindex = groupindex;
|
||
Py_INCREF(pattern);
|
||
re->re_realpat = pattern;
|
||
Py_INCREF(givenpat);
|
||
re->re_givenpat = givenpat;
|
||
error = _Py_re_compile_pattern((unsigned char *)pat, size, &re->re_patbuf);
|
||
if (error != NULL) {
|
||
PyErr_SetString(RegexError, error);
|
||
goto finally;
|
||
}
|
||
}
|
||
return (PyObject *)re;
|
||
finally:
|
||
Py_DECREF(re);
|
||
return NULL;
|
||
}
|
||
|
||
static PyObject *
|
||
regex_compile(PyObject *self, PyObject *args)
|
||
{
|
||
PyObject *pat = NULL;
|
||
PyObject *tran = NULL;
|
||
|
||
if (!PyArg_ParseTuple(args, "S|S:compile", &pat, &tran))
|
||
return NULL;
|
||
return newregexobject(pat, tran, pat, NULL);
|
||
}
|
||
|
||
static PyObject *
|
||
symcomp(PyObject *pattern, PyObject *gdict)
|
||
{
|
||
char *opat, *oend, *o, *n, *g, *v;
|
||
int group_count = 0;
|
||
int sz;
|
||
int escaped = 0;
|
||
char name_buf[128];
|
||
PyObject *npattern;
|
||
int require_escape = re_syntax & RE_NO_BK_PARENS ? 0 : 1;
|
||
|
||
if (!(opat = PyString_AsString(pattern)))
|
||
return NULL;
|
||
|
||
if ((sz = PyString_Size(pattern)) < 0)
|
||
return NULL;
|
||
|
||
oend = opat + sz;
|
||
o = opat;
|
||
|
||
if (oend == opat) {
|
||
Py_INCREF(pattern);
|
||
return pattern;
|
||
}
|
||
|
||
if (!(npattern = PyString_FromStringAndSize((char*)NULL, sz)) ||
|
||
!(n = PyString_AsString(npattern)))
|
||
return NULL;
|
||
|
||
while (o < oend) {
|
||
if (*o == '(' && escaped == require_escape) {
|
||
char *backtrack;
|
||
escaped = 0;
|
||
++group_count;
|
||
*n++ = *o;
|
||
if (++o >= oend || *o != '<')
|
||
continue;
|
||
/* *o == '<' */
|
||
if (o+1 < oend && *(o+1) == '>')
|
||
continue;
|
||
backtrack = o;
|
||
g = name_buf;
|
||
for (++o; o < oend;) {
|
||
if (*o == '>') {
|
||
PyObject *group_name = NULL;
|
||
PyObject *group_index = NULL;
|
||
*g++ = '\0';
|
||
group_name = PyString_FromString(name_buf);
|
||
group_index = PyInt_FromLong(group_count);
|
||
if (group_name == NULL ||
|
||
group_index == NULL ||
|
||
PyDict_SetItem(gdict, group_name,
|
||
group_index) != 0)
|
||
{
|
||
Py_XDECREF(group_name);
|
||
Py_XDECREF(group_index);
|
||
Py_XDECREF(npattern);
|
||
return NULL;
|
||
}
|
||
Py_DECREF(group_name);
|
||
Py_DECREF(group_index);
|
||
++o; /* eat the '>' */
|
||
break;
|
||
}
|
||
if (!isalnum(Py_CHARMASK(*o)) && *o != '_') {
|
||
o = backtrack;
|
||
break;
|
||
}
|
||
*g++ = *o++;
|
||
}
|
||
}
|
||
else if (*o == '[' && !escaped) {
|
||
*n++ = *o;
|
||
++o; /* eat the char following '[' */
|
||
*n++ = *o;
|
||
while (o < oend && *o != ']') {
|
||
++o;
|
||
*n++ = *o;
|
||
}
|
||
if (o < oend)
|
||
++o;
|
||
}
|
||
else if (*o == '\\') {
|
||
escaped = 1;
|
||
*n++ = *o;
|
||
++o;
|
||
}
|
||
else {
|
||
escaped = 0;
|
||
*n++ = *o;
|
||
++o;
|
||
}
|
||
}
|
||
|
||
if (!(v = PyString_AsString(npattern))) {
|
||
Py_DECREF(npattern);
|
||
return NULL;
|
||
}
|
||
/* _PyString_Resize() decrements npattern on failure */
|
||
_PyString_Resize(&npattern, n - v);
|
||
return npattern;
|
||
|
||
}
|
||
|
||
static PyObject *
|
||
regex_symcomp(PyObject *self, PyObject *args)
|
||
{
|
||
PyObject *pattern;
|
||
PyObject *tran = NULL;
|
||
PyObject *gdict = NULL;
|
||
PyObject *npattern;
|
||
PyObject *retval = NULL;
|
||
|
||
if (!PyArg_ParseTuple(args, "S|S:symcomp", &pattern, &tran))
|
||
return NULL;
|
||
|
||
gdict = PyDict_New();
|
||
if (gdict == NULL || (npattern = symcomp(pattern, gdict)) == NULL) {
|
||
Py_DECREF(gdict);
|
||
Py_DECREF(pattern);
|
||
return NULL;
|
||
}
|
||
retval = newregexobject(npattern, tran, pattern, gdict);
|
||
Py_DECREF(npattern);
|
||
return retval;
|
||
}
|
||
|
||
|
||
static PyObject *cache_pat;
|
||
static PyObject *cache_prog;
|
||
|
||
static int
|
||
update_cache(PyObject *pat)
|
||
{
|
||
PyObject *tuple = Py_BuildValue("(O)", pat);
|
||
int status = 0;
|
||
|
||
if (!tuple)
|
||
return -1;
|
||
|
||
if (pat != cache_pat) {
|
||
Py_XDECREF(cache_pat);
|
||
cache_pat = NULL;
|
||
Py_XDECREF(cache_prog);
|
||
cache_prog = regex_compile((PyObject *)NULL, tuple);
|
||
if (cache_prog == NULL) {
|
||
status = -1;
|
||
goto finally;
|
||
}
|
||
cache_pat = pat;
|
||
Py_INCREF(cache_pat);
|
||
}
|
||
finally:
|
||
Py_DECREF(tuple);
|
||
return status;
|
||
}
|
||
|
||
static PyObject *
|
||
regex_match(PyObject *self, PyObject *args)
|
||
{
|
||
PyObject *pat, *string;
|
||
PyObject *tuple, *v;
|
||
|
||
if (!PyArg_ParseTuple(args, "SS:match", &pat, &string))
|
||
return NULL;
|
||
if (update_cache(pat) < 0)
|
||
return NULL;
|
||
|
||
if (!(tuple = Py_BuildValue("(S)", string)))
|
||
return NULL;
|
||
v = regobj_match((regexobject *)cache_prog, tuple);
|
||
Py_DECREF(tuple);
|
||
return v;
|
||
}
|
||
|
||
static PyObject *
|
||
regex_search(PyObject *self, PyObject *args)
|
||
{
|
||
PyObject *pat, *string;
|
||
PyObject *tuple, *v;
|
||
|
||
if (!PyArg_ParseTuple(args, "SS:search", &pat, &string))
|
||
return NULL;
|
||
if (update_cache(pat) < 0)
|
||
return NULL;
|
||
|
||
if (!(tuple = Py_BuildValue("(S)", string)))
|
||
return NULL;
|
||
v = regobj_search((regexobject *)cache_prog, tuple);
|
||
Py_DECREF(tuple);
|
||
return v;
|
||
}
|
||
|
||
static PyObject *
|
||
regex_set_syntax(PyObject *self, PyObject *args)
|
||
{
|
||
int syntax;
|
||
if (!PyArg_ParseTuple(args, "i:set_syntax", &syntax))
|
||
return NULL;
|
||
syntax = re_set_syntax(syntax);
|
||
/* wipe the global pattern cache */
|
||
Py_XDECREF(cache_pat);
|
||
cache_pat = NULL;
|
||
Py_XDECREF(cache_prog);
|
||
cache_prog = NULL;
|
||
return PyInt_FromLong((long)syntax);
|
||
}
|
||
|
||
static PyObject *
|
||
regex_get_syntax(PyObject *self)
|
||
{
|
||
return PyInt_FromLong((long)re_syntax);
|
||
}
|
||
|
||
|
||
static struct PyMethodDef regex_global_methods[] = {
|
||
{"compile", regex_compile, METH_VARARGS},
|
||
{"symcomp", regex_symcomp, METH_VARARGS},
|
||
{"match", regex_match, METH_VARARGS},
|
||
{"search", regex_search, METH_VARARGS},
|
||
{"set_syntax", regex_set_syntax, METH_VARARGS},
|
||
{"get_syntax", (PyCFunction)regex_get_syntax, METH_NOARGS},
|
||
{NULL, NULL} /* sentinel */
|
||
};
|
||
|
||
DL_EXPORT(void)
|
||
initregex(void)
|
||
{
|
||
PyObject *m, *d, *v;
|
||
int i;
|
||
char *s;
|
||
|
||
/* Initialize object type */
|
||
Regextype.ob_type = &PyType_Type;
|
||
|
||
m = Py_InitModule("regex", regex_global_methods);
|
||
d = PyModule_GetDict(m);
|
||
|
||
if (PyErr_Warn(PyExc_DeprecationWarning,
|
||
"the regex module is deprecated; "
|
||
"please use the re module") < 0)
|
||
return;
|
||
|
||
/* Initialize regex.error exception */
|
||
v = RegexError = PyErr_NewException("regex.error", NULL, NULL);
|
||
if (v == NULL || PyDict_SetItemString(d, "error", v) != 0)
|
||
goto finally;
|
||
|
||
/* Initialize regex.casefold constant */
|
||
if (!(v = PyString_FromStringAndSize((char *)NULL, 256)))
|
||
goto finally;
|
||
|
||
if (!(s = PyString_AsString(v)))
|
||
goto finally;
|
||
|
||
for (i = 0; i < 256; i++) {
|
||
if (isupper(i))
|
||
s[i] = tolower(i);
|
||
else
|
||
s[i] = i;
|
||
}
|
||
if (PyDict_SetItemString(d, "casefold", v) < 0)
|
||
goto finally;
|
||
Py_DECREF(v);
|
||
|
||
if (!PyErr_Occurred())
|
||
return;
|
||
finally:
|
||
/* Nothing */ ;
|
||
}
|