AMK's latest

This commit is contained in:
Guido van Rossum 1998-04-03 21:13:31 +00:00
parent 104be4a4a7
commit 042ff9eb3a
4 changed files with 168 additions and 90 deletions

View File

@ -3,7 +3,7 @@
*************************************************/
#define PCRE_VERSION "1.04 22-Dec-1997"
#define PCRE_VERSION "1.07 16-Feb-1998"
/* This is a library of functions to support regular expressions whose syntax
@ -12,7 +12,7 @@ the file Tech.Notes for some information on the internals.
Written by: Philip Hazel <ph10@cam.ac.uk>
Copyright (c) 1997 University of Cambridge
Copyright (c) 1998 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
@ -192,6 +192,7 @@ enum {
OP_CRMINRANGE,
OP_CLASS, /* Match a character class */
OP_NEGCLASS, /* Match a character class, specified negatively */
OP_CLASS_L, /* Match a character class */
OP_REF, /* Match a back reference */

View File

@ -2,7 +2,7 @@
* Perl-Compatible Regular Expressions *
*************************************************/
/* Copyright (c) 1997 University of Cambridge */
/* Copyright (c) 1998 University of Cambridge */
#ifndef _PCRE_H
#define _PCRE_H
@ -17,6 +17,12 @@ it is needed here for malloc. */
#include <sys/types.h>
#include <stdlib.h>
/* Allow for C++ users */
#ifdef __cplusplus
extern "C" {
#endif
/* Options */
#define PCRE_CASELESS 0x0001
@ -68,4 +74,8 @@ extern int pcre_info(const pcre *, int *, int *);
extern pcre_extra *pcre_study(const pcre *, int, const char **);
extern const char *pcre_version(void);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* End of pcre.h */

View File

@ -72,7 +72,7 @@ staticforward PyTypeObject Pcre_Type;
#define NOT_WORD_BOUNDARY 6
#define BEGINNING_OF_BUFFER 7
#define END_OF_BUFFER 8
#define STRING 9
static PcreObject *
newPcreObject(arg)
@ -191,49 +191,20 @@ PyPcre_compile(self, args)
{
PcreObject *rv;
PyObject *dictionary;
char *pattern, *newpattern;
char *pattern;
const char *error;
int num_zeros, i, j;
int patternlen, options, erroroffset;
if (!PyArg_ParseTuple(args, "s#iO!", &pattern, &patternlen, &options,
int options, erroroffset;
if (!PyArg_ParseTuple(args, "siO!", &pattern, &options,
&PyDict_Type, &dictionary))
return NULL;
rv = newPcreObject(args);
if ( rv == NULL )
return NULL;
/* PCRE doesn't like having null bytes in its pattern, so we have to replace
any zeros in the string with the characters '\000'. This increases the size
of the string by 3*num_zeros, plus 1 byte for the terminating \0. */
num_zeros=1; /* Start at 1; this will give 3 extra bytes of leeway */
for(i=0; i<patternlen; i++) {
if (pattern[i]==0) num_zeros++;
}
newpattern=malloc(patternlen + num_zeros*3 + 4);
if (newpattern==NULL) {
PyErr_SetString(PyExc_MemoryError, "can't allocate memory for new pattern");
return NULL;
}
for (i=j=0; i<patternlen; i++, j++)
{
if (pattern[i]!=0) newpattern[j]=pattern[i];
else {
newpattern[j++] ='\\';
newpattern[j++] = '0';
newpattern[j++] = '0';
newpattern[j ] = '0';
}
}
/* Keep purify happy; for pcre, one null byte is enough! */
newpattern[j++]='\0';
newpattern[j++]='\0';
newpattern[j++]='\0';
newpattern[j]='\0';
rv->regex = pcre_compile((char*)newpattern, options,
rv->regex = pcre_compile((char*)pattern, options,
&error, &erroroffset, dictionary);
free(newpattern);
if (rv->regex==NULL)
{
PyMem_DEL(rv);
@ -312,6 +283,10 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr)
*indexptr=index;
return Py_BuildValue("c", (char)8);
break;
case('\\'):
*indexptr=index;
return Py_BuildValue("c", '\\');
break;
case('x'):
{
@ -348,6 +323,8 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr)
case('g'):
{
int end, i;
int group_num = 0, is_number=0;
if (pattern_len<=index)
{
PyErr_SetString(ErrorObject, "unfinished symbolic reference");
@ -374,16 +351,22 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr)
PyErr_SetString(ErrorObject, "zero-length symbolic reference");
return NULL;
}
if (!(pcre_ctypes[pattern[index]] & ctype_word) /* First char. not alphanumeric */
|| (pcre_ctypes[pattern[index]] & ctype_digit) ) /* First char. a digit */
if ((pcre_ctypes[pattern[index]] & ctype_digit)) /* First char. a digit */
{
/* XXX should include the text of the reference */
PyErr_SetString(ErrorObject, "first character of symbolic reference not a letter or _");
return NULL;
is_number = 1;
group_num = pattern[index] - '0';
}
for(i=index+1; i<end; i++)
{
if (is_number &&
!(pcre_ctypes[pattern[i]] & ctype_digit) )
{
/* XXX should include the text of the reference */
PyErr_SetString(ErrorObject, "illegal non-digit character in \\g<...> starting with digit");
return NULL;
}
else {group_num = group_num * 10 + pattern[i] - '0';}
if (!(pcre_ctypes[pattern[i]] & ctype_word) )
{
/* XXX should include the text of the reference */
@ -394,6 +377,9 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr)
*typeptr = MEMORY_REFERENCE;
*indexptr = end+1;
/* If it's a number, return the integer value of the group */
if (is_number) return Py_BuildValue("i", group_num);
/* Otherwise, return a string containing the group name */
return Py_BuildValue("s#", pattern+index, end-index);
}
break;
@ -478,8 +464,11 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr)
break;
default:
/* It's some unknown escape like \s, so return a string containing
\s */
*typeptr = STRING;
*indexptr = index;
return Py_BuildValue("c", c);
return Py_BuildValue("s#", pattern+index-2, 2);
break;
}
}
@ -571,6 +560,12 @@ PyPcre_expand(self, args)
Py_DECREF(result);
}
break;
case(STRING):
{
PyList_Append(results, value);
total_len += PyString_Size(value);
break;
}
default:
Py_DECREF(results);
PyErr_SetString(ErrorObject,

View File

@ -211,7 +211,7 @@ the file Tech.Notes for some information on the internals.
Written by: Philip Hazel <ph10@cam.ac.uk>
Copyright (c) 1997 University of Cambridge
Copyright (c) 1998 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
@ -409,6 +409,7 @@ do
according to the repeat count. */
case OP_CLASS:
case OP_NEGCLASS:
{
tcode++;
for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
@ -547,7 +548,7 @@ the file Tech.Notes for some information on the internals.
Written by: Philip Hazel <ph10@cam.ac.uk>
Copyright (c) 1997 University of Cambridge
Copyright (c) 1998 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
@ -586,18 +587,26 @@ the external pcre header. */
#ifndef Py_eval_input
/* For Python 1.4, graminit.h has to be explicitly included */
#define Py_eval_input eval_input
#endif /* FOR_PYTHON */
/* Allow compilation as C++ source code, should anybody want to do that. */
#ifdef __cplusplus
#define class pcre_class
#endif
/* Min and max values for the common repeats; for the maxima, 0 => infinity */
static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
/* Text forms of OP_ values and things, for debugging */
/* Text forms of OP_ values and things, for debugging (not all used) */
#ifdef DEBUG
static const char *OP_names[] = {
@ -610,7 +619,7 @@ static const char *OP_names[] = {
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{",
"class", "classL", "Ref",
"class", "negclass", "classL", "Ref",
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
"Brazero", "Braminzero", "Bra"
};
@ -621,7 +630,7 @@ are simple data values; negative values are for special things like \d and so
on. Zero means further processing is needed (for things like \x), or the escape
is invalid. */
static short int escapes[] = {
static const short int escapes[] = {
0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
'@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
@ -636,8 +645,9 @@ static short int escapes[] = {
/* Definition to allow mutual recursion */
static BOOL compile_regex(int, int *, uschar **, const uschar **,
const char **, PyObject *);
static BOOL
compile_regex(int, int *, uschar **, const uschar **, const char **,
PyObject *);
/* Structure for passing "static" information around between the functions
doing the matching, so that they are thread-safe. */
@ -866,12 +876,13 @@ do {
/* Check a class or a back reference for a zero minimum */
case OP_CLASS:
case OP_NEGCLASS:
case OP_REF:
case OP_CLASS_L:
switch(*cc)
{
case (OP_REF): cc += 2; break;
case (OP_CLASS): cc += 1+32; break;
case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
case (OP_CLASS_L): cc += 1+1+32; break;
}
@ -1017,15 +1028,17 @@ else
{
/* PYTHON: Try to compute an octal value for a character */
for(c=0, i=0; c!=-1 && ptr[i]!=0 && i<3; i++)
for(c=0, i=0; ptr[i]!=0 && i<3; i++)
{
if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
c = c * 8 + ptr[i]-'0';
else
c = -1; /* Non-octal character */
break; /* Non-octal character--break out of the loop */
}
/* Aha! There were 3 octal digits, so it must be a character */
if (c != -1 && i == 3)
/* It's a character if there were exactly 3 octal digits, or if
we're inside a character class and there was at least one
octal digit. */
if ( (i == 3) || (isclass && i!=0) )
{
ptr += i-1;
break;
@ -1278,11 +1291,14 @@ for (;; ptr++)
class_flag = NULL;
}
/* If the first character is '^', set the negation flag */
/* If the first character is '^', set the negation flag, and use a
different opcode. This only matters if caseless matching is specified at
runtime. */
if ((c = *(++ptr)) == '^')
{
negate_class = TRUE;
if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
c = *(++ptr);
}
else negate_class = FALSE;
@ -1648,7 +1664,8 @@ for (;; ptr++)
/* If previous was a character class or a back reference, we put the repeat
stuff after it. */
else if (*previous == OP_CLASS || *previous==OP_CLASS_L || *previous == OP_REF)
else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
*previous==OP_CLASS_L || *previous == OP_REF)
{
if (repeat_min == 0 && repeat_max == -1)
*code++ = OP_CRSTAR + repeat_type;
@ -2003,7 +2020,7 @@ for (;; ptr++)
the next state. */
previous[1] = length;
ptr--;
if (length < 255) ptr--;
break;
}
} /* end of big loop */
@ -2832,6 +2849,7 @@ while (code < code_end)
goto CLASS_REF_REPEAT;
case OP_CLASS:
case OP_NEGCLASS:
case OP_CLASS_L:
{
int i, min, max;
@ -2840,11 +2858,14 @@ while (code < code_end)
{
code++;
printf("Locflag = %i ", *code++);
printf(" [");
}
else
code++;
printf(" [");
{
if (*code++ == OP_CLASS) printf(" [");
else printf(" ^[");
}
for (i = 0; i < 256; i++)
{
@ -3601,10 +3622,14 @@ for (;;)
item to see if there is repeat information following. Then obey similar
code to character type repeats - written out again for speed. If caseless
matching was set at runtime but not at compile time, we have to check both
versions of a character. */
versions of a character, and we have to behave differently for positive and
negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
treated differently. */
case OP_CLASS:
case OP_NEGCLASS:
{
BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
const uschar *data = ecode + 1; /* Save for matching */
ecode += 33; /* Advance past the item */
@ -3633,15 +3658,8 @@ for (;;)
break;
default: /* No repeat follows */
if (eptr >= md->end_subject) FAIL;
c = *eptr++;
if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
if (md->runtime_caseless)
{
c = pcre_fcc[c];
if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
}
FAIL;
min = max = 1;
break;
}
/* First, ensure the minimum number of matches are present. */
@ -3650,13 +3668,31 @@ for (;;)
{
if (eptr >= md->end_subject) FAIL;
c = *eptr++;
if ((data[c/8] & (1 << (c&7))) != 0) continue;
if (md->runtime_caseless)
/* Either not runtime caseless, or it was a positive class. For
runtime caseless, continue if either case is in the map. */
if (!nasty_case)
{
c = pcre_fcc[c];
if ((data[c/8] & (1 << (c&7))) != 0) continue;
if (md->runtime_caseless)
{
c = pcre_fcc[c];
if ((data[c/8] & (1 << (c&7))) != 0) continue;
}
}
FAIL;
/* Runtime caseless and it was a negative class. Continue only if
both cases are in the map. */
else
{
if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
c = pcre_fcc[c];
if ((data[c/8] & (1 << (c&7))) != 0) continue;
}
FAIL;
}
/* If max == min we can continue with the main loop without the
@ -3674,12 +3710,30 @@ for (;;)
if (match(eptr, ecode, offset_top, md)) SUCCEED;
if (i >= max || eptr >= md->end_subject) FAIL;
c = *eptr++;
if ((data[c/8] & (1 << (c&7))) != 0) continue;
if (md->runtime_caseless)
/* Either not runtime caseless, or it was a positive class. For
runtime caseless, continue if either case is in the map. */
if (!nasty_case)
{
c = pcre_fcc[c];
if ((data[c/8] & (1 << (c&7))) != 0) continue;
if (md->runtime_caseless)
{
c = pcre_fcc[c];
if ((data[c/8] & (1 << (c&7))) != 0) continue;
}
}
/* Runtime caseless and it was a negative class. Continue only if
both cases are in the map. */
else
{
if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
c = pcre_fcc[c];
if ((data[c/8] & (1 << (c&7))) != 0) continue;
}
FAIL;
}
/* Control never gets here */
@ -3694,12 +3748,30 @@ for (;;)
{
if (eptr >= md->end_subject) break;
c = *eptr;
if ((data[c/8] & (1 << (c&7))) != 0) continue;
if (md->runtime_caseless)
/* Either not runtime caseless, or it was a positive class. For
runtime caseless, continue if either case is in the map. */
if (!nasty_case)
{
if ((data[c/8] & (1 << (c&7))) != 0) continue;
if (md->runtime_caseless)
{
c = pcre_fcc[c];
if ((data[c/8] & (1 << (c&7))) != 0) continue;
}
}
/* Runtime caseless and it was a negative class. Continue only if
both cases are in the map. */
else
{
if ((data[c/8] & (1 << (c&7))) == 0) break;
c = pcre_fcc[c];
if ((data[c/8] & (1 << (c&7))) != 0) continue;
}
break;
}
@ -4430,17 +4502,17 @@ pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
/* The "volatile" directives are to make gcc -Wall stop complaining
that these variables can be clobbered by the longjmp. Hopefully
they won't cost too much performance. */
int resetcount, ocount;
int first_char = -1;
volatile int resetcount, ocount;
volatile int first_char = -1;
match_data match_block;
const uschar *start_bits = NULL;
const uschar *start_match = (const uschar *)subject + start_pos;
const uschar *end_subject;
const real_pcre *re = (const real_pcre *)external_re;
const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
BOOL using_temporary_offsets = FALSE;
BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
BOOL startline = (re->options & PCRE_STARTLINE) != 0;
volatile BOOL using_temporary_offsets = FALSE;
volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
@ -4480,7 +4552,7 @@ ocount = offsetcount & (-2);
if (re->top_backref > 0 && re->top_backref >= ocount/2)
{
ocount = re->top_backref * 2 + 2;
match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
using_temporary_offsets = TRUE;
DPRINTF(("Got memory to hold back references\n"));
@ -4639,10 +4711,10 @@ do
free_stack(&match_block);
return rc;
} /* End of (if setjmp(match_block.error_env)...) */
free_stack(&match_block);
/* Return an error code; pcremodule.c will preserve the exception */
if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
free_stack(&match_block);
}
while (!anchored &&
match_block.errorcode == PCRE_ERROR_NOMATCH &&