/* string_format.h -- implementation of string.format(). It uses the Objects/stringlib conventions, so that it can be compiled for both unicode and string objects. */ /* Defines for more efficiently reallocating the string buffer */ #define INITIAL_SIZE_INCREMENT 100 #define SIZE_MULTIPLIER 2 #define MAX_SIZE_INCREMENT 3200 /************************************************************************/ /*********** Global data structures and forward declarations *********/ /************************************************************************/ /* A SubString consists of the characters between two string or unicode pointers. */ typedef struct { STRINGLIB_CHAR *ptr; STRINGLIB_CHAR *end; } SubString; /* forward declaration for recursion */ static PyObject * build_string(SubString *input, PyObject *args, PyObject *kwargs, int *recursion_level); /************************************************************************/ /************************** Utility functions ************************/ /************************************************************************/ /* fill in a SubString from a pointer and length */ Py_LOCAL_INLINE(void) SubString_init(SubString *str, STRINGLIB_CHAR *p, Py_ssize_t len) { str->ptr = p; if (p == NULL) str->end = NULL; else str->end = str->ptr + len; } Py_LOCAL_INLINE(PyObject *) SubString_new_object(SubString *str) { return STRINGLIB_NEW(str->ptr, str->end - str->ptr); } /************************************************************************/ /*********** Error handling and exception generation **************/ /************************************************************************/ /* Most of our errors are value errors, because to Python, the format string is a "value". Also, it's convenient to return a NULL when we are erroring out. XXX: need better error handling, per PEP 3101. 
*/ static void * SetError(const char *s) { /* PyErr_Format always returns NULL */ return PyErr_Format(PyExc_ValueError, "%s in format string", s); } /* check_input returns True if we still have characters left in the input string. XXX: make this function go away when better error handling is implemented. */ Py_LOCAL_INLINE(int) check_input(SubString *input) { if (input->ptr < input->end) return 1; PyErr_SetString(PyExc_ValueError, "unterminated replacement field"); return 0; } /************************************************************************/ /*********** Output string management functions ****************/ /************************************************************************/ typedef struct { STRINGLIB_CHAR *ptr; STRINGLIB_CHAR *end; PyObject *obj; Py_ssize_t size_increment; } OutputString; /* initialize an OutputString object, reserving size characters */ static int output_initialize(OutputString *output, Py_ssize_t size) { output->obj = STRINGLIB_NEW(NULL, size); if (output->obj == NULL) return 0; output->ptr = STRINGLIB_STR(output->obj); output->end = STRINGLIB_LEN(output->obj) + output->ptr; output->size_increment = INITIAL_SIZE_INCREMENT; return 1; } /* output_extend reallocates the output string buffer. It returns a status: 0 for a failed reallocation, 1 for success. */ static int output_extend(OutputString *output, Py_ssize_t count) { STRINGLIB_CHAR *startptr = STRINGLIB_STR(output->obj); Py_ssize_t curlen = output->ptr - startptr; Py_ssize_t maxlen = curlen + count + output->size_increment; if (STRINGLIB_RESIZE(&output->obj, maxlen) < 0) return 0; startptr = STRINGLIB_STR(output->obj); output->ptr = startptr + curlen; output->end = startptr + maxlen; if (output->size_increment < MAX_SIZE_INCREMENT) output->size_increment *= SIZE_MULTIPLIER; return 1; } /* output_data dumps characters into our output string buffer. In some cases, it has to reallocate the string. It returns a status: 0 for a failed reallocation, 1 for success. 
*/ static int output_data(OutputString *output, const STRINGLIB_CHAR *s, Py_ssize_t count) { if ((count > output->end - output->ptr) && !output_extend(output, count)) return 0; memcpy(output->ptr, s, count * sizeof(STRINGLIB_CHAR)); output->ptr += count; return 1; } /************************************************************************/ /*********** Format string parsing -- integers and identifiers *********/ /************************************************************************/ /* end_identifier returns true if a character marks the end of an identifier string. Although the PEP specifies that identifiers are numbers or valid Python identifiers, we just let getattr/getitem handle that, so the implementation is more flexible than the PEP would indicate. */ Py_LOCAL_INLINE(int) end_identifier(STRINGLIB_CHAR c) { switch (c) { case '.': case '[': case ']': return 1; default: return 0; } } /* get_integer consumes 0 or more decimal digit characters from an input string, updates *result with the corresponding positive integer, and returns the number of digits consumed. returns -1 on error. */ static int get_integer(STRINGLIB_CHAR **ptr, STRINGLIB_CHAR *end, Py_ssize_t *result) { Py_ssize_t accumulator, digitval, oldaccumulator; int numdigits; accumulator = numdigits = 0; for (;;(*ptr)++, numdigits++) { if (*ptr >= end) break; digitval = STRINGLIB_TODECIMAL(**ptr); if (digitval < 0) break; /* This trick was copied from old Unicode format code. It's cute, but would really suck on an old machine with a slow divide implementation. Fortunately, in the normal case we do not expect too many digits. */ oldaccumulator = accumulator; accumulator *= 10; if ((accumulator+10)/10 != oldaccumulator+1) { PyErr_Format(PyExc_ValueError, "Too many decimal digits in format string"); return -1; } accumulator += digitval; } *result = accumulator; return numdigits; } /* get_identifier is a bit of a misnomer. It returns a value for use with getattr or getindex. 
This value will a string/unicode object. The input cannot be zero length. Continues until end of input, or end_identifier() returns true. */ static PyObject * get_identifier(SubString *input) { STRINGLIB_CHAR *start; for (start = input->ptr; input->ptr < input->end && !end_identifier(*input->ptr); input->ptr++) ; return STRINGLIB_NEW(start, input->ptr - start); /* We might want to add code here to check for invalid Python identifiers. All identifiers are eventually passed to getattr or getitem, so there is a check when used. However, we might want to remove (or not) the ability to have strings like "a/b" or " ab" or "-1" (which is not parsed as a number). For now, this is left as an exercise for the first disgruntled user... if (XXX -- need check function) { Py_DECREF(result); PyErr_SetString(PyExc_ValueError, "Invalid embedded Python identifier"); return NULL; } */ } /************************************************************************/ /******** Functions to get field objects and specification strings ******/ /************************************************************************/ /* get_field_and_spec is the main function in this section. It parses the format string well enough to return a field object to render along with a field specification string. */ /* look up key in our keyword arguments */ static PyObject * key_lookup(PyObject *kwargs, PyObject *key) { PyObject *result; if (kwargs && (result = PyDict_GetItem(kwargs, key)) != NULL) { Py_INCREF(result); return result; } return NULL; } /* get_field_object returns the object inside {}, before the format_spec. It handles getindex and getattr lookups and consumes the entire input string. 
*/ static PyObject * get_field_object(SubString *input, PyObject *args, PyObject *kwargs) { PyObject *myobj, *subobj, *newobj; STRINGLIB_CHAR c; Py_ssize_t index; int isindex, isnumeric, isargument; index = isnumeric = 0; /* Just to shut up the compiler warnings */ myobj = args; Py_INCREF(myobj); for (isindex=1, isargument=1;;) { if (!check_input(input)) break; if (!isindex) { if ((subobj = get_identifier(input)) == NULL) break; newobj = PyObject_GetAttr(myobj, subobj); Py_DECREF(subobj); } else { isnumeric = (STRINGLIB_ISDECIMAL(*input->ptr)); if (isnumeric) /* XXX: add error checking */ get_integer(&input->ptr, input->end, &index); if (isnumeric && PySequence_Check(myobj)) newobj = PySequence_GetItem(myobj, index); else { /* XXX -- do we need PyLong_FromLongLong? Using ssizet, not int... */ subobj = isnumeric ? PyInt_FromLong(index) : get_identifier(input); if (subobj == NULL) break; if (isargument) { newobj = key_lookup(kwargs, subobj); } else { newobj = PyObject_GetItem(myobj, subobj); } Py_DECREF(subobj); } } Py_DECREF(myobj); myobj = newobj; if (myobj == NULL) break; if (!isargument && isindex) if ((!check_input(input)) || (*(input->ptr++) != ']')) { SetError("Expected ]"); break; } /* if at the end of input, return with myobj */ if (input->ptr >= input->end) return myobj; c = *input->ptr; input->ptr++; isargument = 0; isindex = (c == '['); if (!isindex && (c != '.')) { SetError("Expected ., [, :, !, or }"); break; } } if ((myobj == NULL) && isargument) { /* XXX: include more useful error information, like which * keyword not found or which index missing */ PyErr_Clear(); return SetError(isnumeric ? 
"Not enough positional arguments" : "Keyword argument not found"); } Py_XDECREF(myobj); return NULL; } /************************************************************************/ /***************** Field rendering functions **************************/ /************************************************************************/ /* render_field() is the main function in this section. It takes the field object and field specification string generated by get_field_and_spec, and renders the field into the output string. format() does the actual calling of the objects __format__ method. */ /* returns fieldobj.__format__(format_spec) */ static PyObject * format(PyObject *fieldobj, SubString *format_spec) { static PyObject *format_str = NULL; PyObject *meth; PyObject *spec = NULL; PyObject *result = NULL; /* Initialize cached value */ if (format_str == NULL) { /* Initialize static variable needed by _PyType_Lookup */ format_str = PyUnicode_FromString("__format__"); if (format_str == NULL) return NULL; } /* Make sure the type is initialized. float gets initialized late */ if (Py_Type(fieldobj)->tp_dict == NULL) if (PyType_Ready(Py_Type(fieldobj)) < 0) return NULL; /* we need to create an object out of the pointers we have */ spec = SubString_new_object(format_spec); if (spec == NULL) goto done; /* Find the (unbound!) __format__ method (a borrowed reference) */ meth = _PyType_Lookup(Py_Type(fieldobj), format_str); if (meth == NULL) { PyErr_Format(PyExc_TypeError, "Type %.100s doesn't define __format__", Py_Type(fieldobj)->tp_name); goto done; } /* And call it, binding it to the value */ result = PyObject_CallFunctionObjArgs(meth, fieldobj, spec, NULL); if (result == NULL) goto done; if (!STRINGLIB_CHECK(result)) { PyErr_SetString(PyExc_TypeError, "__format__ method did not return " STRINGLIB_TYPE_NAME); Py_DECREF(result); result = NULL; goto done; } done: Py_XDECREF(spec); return result; } /* render_field calls fieldobj.__format__(format_spec) method, and appends to the output. 
*/ static int render_field(PyObject *fieldobj, SubString *format_spec, OutputString *output) { int ok = 0; PyObject *result = format(fieldobj, format_spec); if (result == NULL) goto done; ok = output_data(output, STRINGLIB_STR(result), STRINGLIB_LEN(result)); done: Py_XDECREF(result); return ok; } static int parse_field(SubString *str, SubString *field_name, SubString *format_spec, STRINGLIB_CHAR *conversion) { STRINGLIB_CHAR c = 0; /* initialize these, as they may be empty */ *conversion = '\0'; SubString_init(format_spec, NULL, 0); /* search for the field name. it's terminated by the end of the string, or a ':' or '!' */ field_name->ptr = str->ptr; while (str->ptr < str->end) { switch (c = *(str->ptr++)) { case ':': case '!': break; default: continue; } break; } if (c == '!' || c == ':') { /* we have a format specifier and/or a conversion */ /* don't include the last character */ field_name->end = str->ptr-1; /* the format specifier is the rest of the string */ format_spec->ptr = str->ptr; format_spec->end = str->end; /* see if there's a conversion specifier */ if (c == '!') { /* there must be another character present */ if (format_spec->ptr >= format_spec->end) { PyErr_SetString(PyExc_ValueError, "end of format while looking for conversion " "specifier"); return 0; } *conversion = *(format_spec->ptr++); /* if there is another character, it must be a colon */ if (format_spec->ptr < format_spec->end) { c = *(format_spec->ptr++); if (c != ':') { PyErr_SetString(PyExc_ValueError, "expected ':' after format specifier"); return 0; } } } return 1; } else { /* end of string, there's no format_spec or conversion */ field_name->end = str->ptr; return 1; } } /************************************************************************/ /******* Output string allocation and escape-to-markup processing ******/ /************************************************************************/ /* MarkupIterator breaks the string into pieces of either literal text, or things inside {} 
that need to be marked up. it is designed to make it easy to wrap a Python iterator around it, for use with the Formatter class */ typedef struct { SubString str; int in_markup; } MarkupIterator; static int MarkupIterator_init(MarkupIterator *self, STRINGLIB_CHAR *ptr, Py_ssize_t len) { SubString_init(&self->str, ptr, len); self->in_markup = 0; return 1; } /* returns 0 on error, 1 on non-error termination, and 2 if it got a string (or something to be expanded) */ static int MarkupIterator_next(MarkupIterator *self, int *is_markup, SubString *literal, SubString *field_name, SubString *format_spec, STRINGLIB_CHAR *conversion, int *format_spec_needs_expanding) { int at_end; STRINGLIB_CHAR c = 0; STRINGLIB_CHAR *start; int count; Py_ssize_t len; *format_spec_needs_expanding = 0; /* no more input, end of iterator */ if (self->str.ptr >= self->str.end) return 1; *is_markup = self->in_markup; start = self->str.ptr; if (self->in_markup) { /* prepare for next iteration */ self->in_markup = 0; /* this is markup, find the end of the string by counting nested braces. note that this prohibits escaped braces, so that format_specs cannot have braces in them. */ count = 1; /* we know we can't have a zero length string, so don't worry about that case */ while (self->str.ptr < self->str.end) { switch (c = *(self->str.ptr++)) { case '{': /* the format spec needs to be recursively expanded. this is an optimization, and not strictly needed */ *format_spec_needs_expanding = 1; count++; break; case '}': count--; if (count <= 0) { /* we're done. 
parse and get out */ literal->ptr = start; literal->end = self->str.ptr-1; if (parse_field(literal, field_name, format_spec, conversion) == 0) return 0; /* success */ return 2; } break; } } /* end of string while searching for matching '}' */ PyErr_SetString(PyExc_ValueError, "unmatched '{' in format"); return 0; } else { /* literal text, read until the end of string, an escaped { or }, or an unescaped { */ while (self->str.ptr < self->str.end) { switch (c = *(self->str.ptr++)) { case '{': case '}': self->in_markup = 1; break; default: continue; } break; } at_end = self->str.ptr >= self->str.end; len = self->str.ptr - start; if ((c == '}') && (at_end || (c != *self->str.ptr))) { SetError("Single } encountered"); return 0; } if (at_end && c == '{') { SetError("Single { encountered"); return 0; } if (!at_end) { if (c == *self->str.ptr) { /* escaped } or {, skip it in the input */ self->str.ptr++; self->in_markup = 0; } else len--; } /* this is just plain text, return it */ literal->ptr = start; literal->end = start + len; return 2; } } /* do the !r or !s conversion on obj */ static PyObject * do_conversion(PyObject *obj, STRINGLIB_CHAR conversion) { /* XXX in pre-3.0, do we need to convert this to unicode, since it might have returned a string? */ switch (conversion) { case 'r': return PyObject_Repr(obj); case 's': return PyObject_Unicode(obj); default: PyErr_Format(PyExc_ValueError, "Unknown converion specifier %c", conversion); return NULL; } } /* given: {field_name!conversion:format_spec} compute the result and write it to output. format_spec_needs_expanding is an optimization. if it's false, just output the string directly, otherwise recursively expand the format_spec string. 
*/
/*
   Returns 1 on success, 0 on failure (an exception is expected to have
   been set by whichever step failed).
*/
static int
output_markup(SubString *field_name, SubString *format_spec,
              int format_spec_needs_expanding, STRINGLIB_CHAR conversion,
              OutputString *output, PyObject *args, PyObject *kwargs,
              int *recursion_level)
{
    PyObject *value;                 /* the object to be rendered */
    PyObject *expanded = NULL;       /* the expanded format_spec, if any */
    SubString expanded_spec;
    SubString *spec = format_spec;   /* the spec handed to render_field */
    int ok = 0;

    /* convert field_name to an object */
    value = get_field_object(field_name, args, kwargs);
    if (value == NULL)
        return 0;

    if (conversion != '\0') {
        PyObject *converted = do_conversion(value, conversion);
        if (converted == NULL)
            goto done;

        /* the converted object replaces the original one */
        Py_DECREF(value);
        value = converted;
    }

    /* if needed, recursively compute the format_spec */
    if (format_spec_needs_expanding) {
        expanded = build_string(format_spec, args, kwargs, recursion_level);
        if (expanded == NULL)
            goto done;

        /* expanded_spec points into the buffer of expanded, so expanded
           must be kept around until after the call to render_field */
        SubString_init(&expanded_spec,
                       STRINGLIB_STR(expanded), STRINGLIB_LEN(expanded));
        spec = &expanded_spec;
    }

    ok = (render_field(value, spec, output) != 0);

done:
    Py_XDECREF(value);
    Py_XDECREF(expanded);
    return ok;
}

/*
    do_markup is the top-level loop for the format() function.  It
    searches through the format string for escapes to markup codes, and
    calls other functions to move non-markup text to the output,
    and to perform the markup to the output.
*/
/*
    Returns 0 on error, otherwise the final status of
    MarkupIterator_next (1 once the input is exhausted).
*/
static int
do_markup(SubString *input, PyObject *args, PyObject *kwargs,
          OutputString *output, int *recursion_level)
{
    MarkupIterator iter;
    int is_markup;
    int format_spec_needs_expanding;
    int status;
    SubString piece;
    SubString field_name;
    SubString format_spec;
    STRINGLIB_CHAR conversion;

    MarkupIterator_init(&iter, input->ptr, input->end - input->ptr);

    for (;;) {
        int ok;

        status = MarkupIterator_next(&iter, &is_markup, &piece,
                                     &field_name, &format_spec,
                                     &conversion,
                                     &format_spec_needs_expanding);
        /* anything other than 2 means "finished" or "failed" */
        if (status != 2)
            return status;

        if (is_markup)
            ok = output_markup(&field_name, &format_spec,
                               format_spec_needs_expanding, conversion,
                               output, args, kwargs, recursion_level);
        else
            ok = output_data(output, piece.ptr, piece.end - piece.ptr);

        if (!ok)
            return 0;
    }
}

/*
    build_string allocates the output string and then
    calls do_markup to do the heavy lifting.

    *recursion_level is decremented on entry and restored on exit; a
    ValueError is raised if it drops below zero.  Returns the formatted
    string object, or NULL on error.
*/
static PyObject *
build_string(SubString *input, PyObject *args, PyObject *kwargs,
             int *recursion_level)
{
    OutputString output;
    PyObject *built = NULL;
    Py_ssize_t final_len;

    output.obj = NULL; /* needed so cleanup code always works */

    /* check the recursion level */
    (*recursion_level)--;
    if (*recursion_level < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "Max string recursion exceeded");
    }
    /* initial size is the length of the format string, plus the size
       increment.  seems like a reasonable default */
    else if (output_initialize(&output,
                               input->end - input->ptr +
                               INITIAL_SIZE_INCREMENT)
             && do_markup(input, args, kwargs, &output, recursion_level)) {
        /* shrink the buffer to what was actually written */
        final_len = output.ptr - STRINGLIB_STR(output.obj);
        if (STRINGLIB_RESIZE(&output.obj, final_len) >= 0) {
            /* transfer ownership to the return value */
            built = output.obj;
            output.obj = NULL;
        }
    }

    (*recursion_level)++;
    Py_XDECREF(output.obj);
    return built;
}

/************************************************************************/
/*********** main routine ***********************************************/
/************************************************************************/

/* this is the main entry point: format self using args and kwargs */
static PyObject *
do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
{
    SubString template_str;

    /* PEP 3101 says only 2 levels, so that
       "{0:{1}}".format('abc', 's')            # works
       "{0:{1:{2}}}".format('abc', 's', '')    # fails
    */
    int levels_left = 2;

    SubString_init(&template_str, STRINGLIB_STR(self), STRINGLIB_LEN(self));
    return build_string(&template_str, args, kwargs, &levels_left);
}