From c357325663150599e3cf5e0ee03270d20ea2bc58 Mon Sep 17 00:00:00 2001 From: Barry Warsaw Date: Fri, 20 Dec 1996 21:56:07 +0000 Subject: [PATCH] Several changes. Test program to follow. - Where optional arguments were being used, converted to PyArg_ParseTuple() style instead of nested PyArg_Parse() style. - Check for and handle many potential error conditions that were never being tested. - Internal reg_* functions renamed to regobj_* (makes it easier to figure out which are global regex functions and which are for regex objects). - reg_group (now regobj_group) was quite extensively reworked. It no longer recurses to do its job (by factoring core functionality into a separate function that knows about string and integer indexes). - Some minor formatting fixes. - regex_set_syntax() now invalidates the cache. Without this change (in the example below), the second search would produce different output depending on whether the first search was performed or not (since performing the first search would cache the compiled object with RE_SYNTAX_EMACS, causing the second test to unexpectedly fail). 
regex.search('(a+)|(b+)', 'cdb') prev = regex.set_syntax(RE_SYNTAX_AWK) regex.search('(a+)|(b+)', 'cdb') --- Modules/regexmodule.c | 429 ++++++++++++++++++++++++++---------------- 1 file changed, 267 insertions(+), 162 deletions(-) diff --git a/Modules/regexmodule.c b/Modules/regexmodule.c index c9f9781fe93..981a6e96980 100644 --- a/Modules/regexmodule.c +++ b/Modules/regexmodule.c @@ -80,6 +80,7 @@ makeresult(regs) PyObject *v; int i; static PyObject *filler = NULL; + if (filler == NULL) { filler = Py_BuildValue("(ii)", -1, -1); if (filler == NULL) @@ -88,6 +89,7 @@ makeresult(regs) v = PyTuple_New(RE_NREGS); if (v == NULL) return NULL; + for (i = 0; i < RE_NREGS; i++) { int lo = regs->start[i]; int hi = regs->end[i]; @@ -98,35 +100,27 @@ makeresult(regs) } else w = Py_BuildValue("(ii)", lo, hi); - if (w == NULL) { - Py_XDECREF(v); + if (w == NULL || PyTuple_SetItem(v, i, w) < 0) { + Py_DECREF(v); return NULL; } - PyTuple_SetItem(v, i, w); } return v; } static PyObject * -reg_match(re, args) +regobj_match(re, args) regexobject *re; PyObject *args; { - PyObject *argstring; char *buffer; int size; - int offset; + int offset = 0; int result; - if (PyArg_Parse(args, "S", &argstring)) { - offset = 0; - } - else { - PyErr_Clear(); - if (!PyArg_Parse(args, "(Si)", &argstring, &offset)) - return NULL; - } - buffer = PyString_AsString(argstring); - size = PyString_Size(argstring); + + if (!PyArg_ParseTuple(args, "s#|i", &buffer, &size, &offset)) + return NULL; + if (offset < 0 || offset > size) { PyErr_SetString(RegexError, "match offset out of range"); return NULL; @@ -140,34 +134,28 @@ reg_match(re, args) return NULL; } if (result >= 0) { - Py_INCREF(argstring); - re->re_lastok = argstring; + PyObject* str = PyString_FromStringAndSize(buffer, size); + if (!str) + return NULL; + re->re_lastok = str; } return PyInt_FromLong((long)result); /* Length of the match or -1 */ } static PyObject * -reg_search(re, args) +regobj_search(re, args) regexobject *re; PyObject *args; { - 
PyObject *argstring; char *buffer; int size; - int offset; + int offset = 0; int range; int result; - if (PyArg_Parse(args, "S", &argstring)) { - offset = 0; - } - else { - PyErr_Clear(); - if (!PyArg_Parse(args, "(Si)", &argstring, &offset)) - return NULL; - } - buffer = PyString_AsString(argstring); - size = PyString_Size(argstring); + if (!PyArg_ParseTuple(args, "s#|i", &buffer, &size, &offset)) + return NULL; + if (offset < 0 || offset > size) { PyErr_SetString(RegexError, "search offset out of range"); return NULL; @@ -187,58 +175,45 @@ reg_search(re, args) return NULL; } if (result >= 0) { - Py_INCREF(argstring); - re->re_lastok = argstring; + PyObject* str = PyString_FromStringAndSize(buffer, size); + if (!str) + return NULL; + re->re_lastok = str; } return PyInt_FromLong((long)result); /* Position of the match or -1 */ } -static PyObject * -reg_group(re, args) +/* get the group from the regex where index can be a string (group name) or + an integer index [0 .. 99] + */ +static PyObject* +group_from_index(re, index) regexobject *re; - PyObject *args; + PyObject *index; { int i, a, b; - if (args != NULL && PyTuple_Check(args)) { - int n = PyTuple_Size(args); - PyObject *res = PyTuple_New(n); - if (res == NULL) + char *v; + + if (PyString_Check(index)) + if (re->re_groupindex == NULL || + !(index = PyDict_GetItem(re->re_groupindex, index))) + { + PyErr_SetString(RegexError, + "group() group name doesn't exist"); return NULL; - for (i = 0; i < n; i++) { - PyObject *v = reg_group(re, PyTuple_GetItem(args, i)); - if (v == NULL) { - Py_DECREF(res); - return NULL; - } - PyTuple_SetItem(res, i, v); } - return res; - } - if (!PyArg_Parse(args, "i", &i)) { - PyObject *n; - PyErr_Clear(); - if (!PyArg_Parse(args, "S", &n)) - return NULL; - else { - PyObject *index; - if (re->re_groupindex == NULL) - index = NULL; - else - index = PyDict_GetItem(re->re_groupindex, n); - if (index == NULL) { - PyErr_SetString(RegexError, "group() group name doesn't exist"); - return 
NULL; - } - i = PyInt_AsLong(index); - } - } + + i = PyInt_AsLong(index); + if (i == -1 && PyErr_Occurred()) + return NULL; + if (i < 0 || i >= RE_NREGS) { PyErr_SetString(RegexError, "group() index out of range"); return NULL; } if (re->re_lastok == NULL) { PyErr_SetString(RegexError, - "group() only valid after successful match/search"); + "group() only valid after successful match/search"); return NULL; } a = re->re_regs.start[i]; @@ -247,18 +222,79 @@ reg_group(re, args) Py_INCREF(Py_None); return Py_None; } - return PyString_FromStringAndSize(PyString_AsString(re->re_lastok)+a, b-a); + + if (!(v = PyString_AsString(re->re_lastok))) + return NULL; + + return PyString_FromStringAndSize(v+a, b-a); } + +static PyObject * +regobj_group(re, args) + regexobject *re; + PyObject *args; +{ + int n = PyTuple_Size(args); + int i; + PyObject *res = NULL; + + if (n < 0) + return NULL; + if (n == 0) { + PyErr_SetString(PyExc_TypeError, "not enough arguments"); + return NULL; + } + if (n == 1) { + /* return value is a single string */ + PyObject *index = PyTuple_GetItem(args, 0); + if (!index) + return NULL; + + return group_from_index(re, index); + } + + /* return value is a tuple */ + if (!(res = PyTuple_New(n))) + return NULL; + + for (i = 0; i < n; i++) { + PyObject *index = PyTuple_GetItem(args, i); + PyObject *group = NULL; + + if (!index) + goto finally; + if (!(group = group_from_index(re, index))) + goto finally; + if (PyTuple_SetItem(res, i, group) < 0) + goto finally; + } + return res; + + finally: + Py_DECREF(res); + return NULL; +} + + static struct PyMethodDef reg_methods[] = { - {"match", (PyCFunction)reg_match}, - {"search", (PyCFunction)reg_search}, - {"group", (PyCFunction)reg_group}, + {"match", (PyCFunction)regobj_match, 1}, + {"search", (PyCFunction)regobj_search, 1}, + {"group", (PyCFunction)regobj_group, 1}, {NULL, NULL} /* sentinel */ }; + + +static char* members[] = { + "last", "regs", "translate", + "groupindex", "realpat", "givenpat", + NULL +}; + + 
static PyObject * -reg_getattr(re, name) +regobj_getattr(re, name) regexobject *re; char *name; { @@ -310,18 +346,25 @@ reg_getattr(re, name) return re->re_givenpat; } if (strcmp(name, "__members__") == 0) { - PyObject *list = PyList_New(6); - if (list) { - PyList_SetItem(list, 0, PyString_FromString("last")); - PyList_SetItem(list, 1, PyString_FromString("regs")); - PyList_SetItem(list, 2, PyString_FromString("translate")); - PyList_SetItem(list, 3, PyString_FromString("groupindex")); - PyList_SetItem(list, 4, PyString_FromString("realpat")); - PyList_SetItem(list, 5, PyString_FromString("givenpat")); - if (PyErr_Occurred()) { + int i = 0; + PyObject *list = NULL; + + /* okay, so it's unlikely this list will change that often. + still, it's easier to change it in just one place. + */ + while (members[i]) + i++; + if (!(list = PyList_New(i))) + return NULL; + + i = 0; + while (members[i]) { + PyObject* v = PyString_FromString(members[i]); + if (!v || PyList_SetItem(list, i, v) < 0) { Py_DECREF(list); - list = NULL; + return NULL; } + i++; } return list; } @@ -330,19 +373,25 @@ reg_getattr(re, name) static PyTypeObject Regextype = { PyObject_HEAD_INIT(&PyType_Type) - 0, /*ob_size*/ - "regex", /*tp_name*/ - sizeof(regexobject), /*tp_size*/ - 0, /*tp_itemsize*/ + 0, /*ob_size*/ + "regex", /*tp_name*/ + sizeof(regexobject), /*tp_size*/ + 0, /*tp_itemsize*/ /* methods */ - (destructor)reg_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ - (getattrfunc)reg_getattr, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_compare*/ - 0, /*tp_repr*/ + (destructor)reg_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + (getattrfunc)regobj_getattr, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ }; +/* reference counting invariants: + pattern: borrowed + translate: borrowed + givenpat: borrowed + groupindex: transferred +*/ static PyObject * newregexobject(pattern, translate, givenpat, groupindex) PyObject *pattern; @@ -351,12 +400,15 @@ newregexobject(pattern, translate, 
givenpat, groupindex) PyObject *groupindex; { regexobject *re; - char *pat = PyString_AsString(pattern); - int size = PyString_Size(pattern); + char *pat; + int size; + if (!PyArg_Parse(pattern, "s#", &pat, &size)) + return NULL; + if (translate != NULL && PyString_Size(translate) != 256) { PyErr_SetString(RegexError, - "translation table must be 256 bytes"); + "translation table must be 256 bytes"); return NULL; } re = PyObject_NEW(regexobject, &Regextype); @@ -365,11 +417,14 @@ newregexobject(pattern, translate, givenpat, groupindex) re->re_patbuf.buffer = NULL; re->re_patbuf.allocated = 0; re->re_patbuf.fastmap = re->re_fastmap; - if (translate) + if (translate) { re->re_patbuf.translate = PyString_AsString(translate); + if (!re->re_patbuf.translate) + goto finally; + Py_INCREF(translate); + } else re->re_patbuf.translate = NULL; - Py_XINCREF(translate); re->re_translate = translate; re->re_lastok = NULL; re->re_groupindex = groupindex; @@ -380,11 +435,13 @@ newregexobject(pattern, translate, givenpat, groupindex) error = re_compile_pattern(pat, size, &re->re_patbuf); if (error != NULL) { PyErr_SetString(RegexError, error); - Py_DECREF(re); - re = NULL; + goto finally; } } return (PyObject *)re; + finally: + Py_DECREF(re); + return NULL; } static PyObject * @@ -394,11 +451,9 @@ regex_compile(self, args) { PyObject *pat = NULL; PyObject *tran = NULL; - if (!PyArg_Parse(args, "S", &pat)) { - PyErr_Clear(); - if (!PyArg_Parse(args, "(SS)", &pat, &tran)) - return NULL; - } + + if (!PyArg_ParseTuple(args, "S|S", &pat, &tran)) + return NULL; return newregexobject(pat, tran, pat, NULL); } @@ -407,26 +462,31 @@ symcomp(pattern, gdict) PyObject *pattern; PyObject *gdict; { - char *opat = PyString_AsString(pattern); - char *oend = opat + PyString_Size(pattern); + char *opat, *oend, *o, *n, *g, *v; int group_count = 0; + int sz; int escaped = 0; - char *o = opat; - char *n; char name_buf[128]; - char *g; PyObject *npattern; int require_escape = re_syntax & RE_NO_BK_PARENS 
? 0 : 1; + if (!(opat = PyString_AsString(pattern))) + return NULL; + + if ((sz = PyString_Size(pattern)) < 0) + return NULL; + + oend = opat + sz; + o = opat; + if (oend == opat) { Py_INCREF(pattern); return pattern; } - npattern = PyString_FromStringAndSize((char*)NULL, PyString_Size(pattern)); - if (npattern == NULL) + if (!(npattern = PyString_FromStringAndSize((char*)NULL, sz)) || + !(n = PyString_AsString(npattern))) return NULL; - n = PyString_AsString(npattern); while (o < oend) { if (*o == '(' && escaped == require_escape) { @@ -443,20 +503,23 @@ symcomp(pattern, gdict) g = name_buf; for (++o; o < oend;) { if (*o == '>') { - PyObject *group_name = NULL; - PyObject *group_index = NULL; - *g++ = '\0'; - group_name = PyString_FromString(name_buf); - group_index = PyInt_FromLong(group_count); - if (group_name == NULL || group_index == NULL - || PyDict_SetItem(gdict, group_name, group_index) != 0) { - Py_XDECREF(group_name); - Py_XDECREF(group_index); - Py_XDECREF(npattern); - return NULL; - } - ++o; /* eat the '>' */ - break; + PyObject *group_name = NULL; + PyObject *group_index = NULL; + *g++ = '\0'; + group_name = PyString_FromString(name_buf); + group_index = PyInt_FromLong(group_count); + if (group_name == NULL || + group_index == NULL || + PyDict_SetItem(gdict, group_name, + group_index) != 0) + { + Py_XDECREF(group_name); + Py_XDECREF(group_index); + Py_XDECREF(npattern); + return NULL; + } + ++o; /* eat the '>' */ + break; } if (!isalnum(Py_CHARMASK(*o)) && *o != '_') { o = backtrack; @@ -467,7 +530,7 @@ symcomp(pattern, gdict) } else if (*o == '[' && !escaped) { *n++ = *o; - ++o; /* eat the char following '[' */ + ++o; /* eat the char following '[' */ *n++ = *o; while (o < oend && *o != ']') { ++o; @@ -488,7 +551,12 @@ symcomp(pattern, gdict) } } - if (_PyString_Resize(&npattern, n - PyString_AsString(npattern)) == 0) + if (!(v = PyString_AsString(npattern))) { + Py_DECREF(npattern); + return NULL; + } + /* _PyString_Resize() decrements npattern on 
failure */ + if (_PyString_Resize(&npattern, n - v) == 0) return npattern; else { return NULL; @@ -505,14 +573,12 @@ regex_symcomp(self, args) PyObject *tran = NULL; PyObject *gdict = NULL; PyObject *npattern; - if (!PyArg_Parse(args, "S", &pattern)) { - PyErr_Clear(); - if (!PyArg_Parse(args, "(SS)", &pattern, &tran)) - return NULL; - } + + if (!PyArg_ParseTuple(args, "S|S", &pattern, &tran)) + return NULL; + gdict = PyDict_New(); - if (gdict == NULL - || (npattern = symcomp(pattern, gdict)) == NULL) { + if (gdict == NULL || (npattern = symcomp(pattern, gdict)) == NULL) { Py_DECREF(gdict); Py_DECREF(pattern); return NULL; @@ -528,17 +594,27 @@ static int update_cache(pat) PyObject *pat; { + PyObject *tuple = Py_BuildValue("(O)", pat); + int status = 0; + + if (!tuple) + return -1; + if (pat != cache_pat) { Py_XDECREF(cache_pat); cache_pat = NULL; Py_XDECREF(cache_prog); - cache_prog = regex_compile((PyObject *)NULL, pat); - if (cache_prog == NULL) - return -1; + cache_prog = regex_compile((PyObject *)NULL, tuple); + if (cache_prog == NULL) { + status = -1; + goto finally; + } cache_pat = pat; Py_INCREF(cache_pat); } - return 0; + finally: + Py_DECREF(tuple); + return status; } static PyObject * @@ -547,11 +623,18 @@ regex_match(self, args) PyObject *args; { PyObject *pat, *string; + PyObject *tuple, *v; + if (!PyArg_Parse(args, "(SS)", &pat, &string)) return NULL; if (update_cache(pat) < 0) return NULL; - return reg_match((regexobject *)cache_prog, string); + + if (!(tuple = Py_BuildValue("(S)", string))) + return NULL; + v = regobj_match((regexobject *)cache_prog, tuple); + Py_DECREF(tuple); + return v; } static PyObject * @@ -560,58 +643,80 @@ regex_search(self, args) PyObject *args; { PyObject *pat, *string; + PyObject *tuple, *v; + if (!PyArg_Parse(args, "(SS)", &pat, &string)) return NULL; if (update_cache(pat) < 0) return NULL; - return reg_search((regexobject *)cache_prog, string); + + if (!(tuple = Py_BuildValue("(S)", string))) + return NULL; + v = 
regobj_search((regexobject *)cache_prog, tuple); + Py_DECREF(tuple); + return v; } static PyObject * regex_set_syntax(self, args) - PyObject *self, *args; + PyObject *self; + PyObject *args; { int syntax; if (!PyArg_Parse(args, "i", &syntax)) return NULL; syntax = re_set_syntax(syntax); + /* wipe the global pattern cache */ + Py_XDECREF(cache_pat); + cache_pat = NULL; + Py_XDECREF(cache_prog); + cache_prog = NULL; return PyInt_FromLong((long)syntax); } static struct PyMethodDef regex_global_methods[] = { - {"compile", regex_compile, 0}, - {"symcomp", regex_symcomp, 0}, + {"compile", regex_compile, 1}, + {"symcomp", regex_symcomp, 1}, {"match", regex_match, 0}, {"search", regex_search, 0}, {"set_syntax", regex_set_syntax, 0}, - {NULL, NULL} /* sentinel */ + {NULL, NULL} /* sentinel */ }; void initregex() { PyObject *m, *d, *v; + int i; + char *s; m = Py_InitModule("regex", regex_global_methods); d = PyModule_GetDict(m); /* Initialize regex.error exception */ - RegexError = PyString_FromString("regex.error"); - if (RegexError == NULL || PyDict_SetItemString(d, "error", RegexError) != 0) - Py_FatalError("can't define regex.error"); - + v = RegexError = PyString_FromString("regex.error"); + if (v == NULL || PyDict_SetItemString(d, "error", v) != 0) + goto finally; + /* Initialize regex.casefold constant */ - v = PyString_FromStringAndSize((char *)NULL, 256); - if (v != NULL) { - int i; - char *s = PyString_AsString(v); - for (i = 0; i < 256; i++) { - if (isupper(i)) - s[i] = tolower(i); - else - s[i] = i; - } - PyDict_SetItemString(d, "casefold", v); - Py_DECREF(v); + if (!(v = PyString_FromStringAndSize((char *)NULL, 256))) + goto finally; + + if (!(s = PyString_AsString(v))) + goto finally; + + for (i = 0; i < 256; i++) { + if (isupper(i)) + s[i] = tolower(i); + else + s[i] = i; } + if (PyDict_SetItemString(d, "casefold", v) < 0) + goto finally; + Py_DECREF(v); + + if (!PyErr_Occurred()) + return; + finally: + Py_FatalError("can't initialize regex module"); }