[Old patch that hadn't been checked in.]

get_starttag_text():  New method.
        Return the text of the most recently parsed start tag, from
        the '<' to the '>' or '/'.  Not really useful for structure
        processing, but requested for Web-related use.  May also be
        useful for being able to re-generate the input from the parse
        events, but there's no equivalent for end tags.

attrfind:  Be a little more forgiving of unquoted attribute values.
This commit is contained in:
Fred Drake 2000-06-29 18:50:59 +00:00
parent 8094611eb8
commit b46696c0ed
1 changed file with 11 additions and 2 deletions

View File

@ -37,7 +37,7 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9]*')
attrfind = re.compile(
'[%s]*([a-zA-Z_][-.a-zA-Z_0-9]*)' % string.whitespace
+ ('([%s]*=[%s]*' % (string.whitespace, string.whitespace))
+ r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!\(\)_#=~]*))?')
+ r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!&$\(\)_#=~]*))?')
# SGML parser base class -- find tags and call handler functions.
@ -207,9 +207,15 @@ class SGMLParser:
self.handle_pi(rawdata[i+2: j])
j = match.end(0)
return j-i
# Text of the most recently parsed start tag.  Defaults to None at class
# level; parse_starttag resets it to None on entry and stores the tag
# text once it has been recognized.
__starttag_text = None
# Return the stored start-tag text, or None when nothing has been
# recorded.  Per the commit notes the text runs from the '<' to the
# '>' or '/'; there is no equivalent accessor for end tags.
def get_starttag_text(self):
return self.__starttag_text
# Internal -- handle starttag, return length or -1 if not terminated
def parse_starttag(self, i):
self.__starttag_text = None
start_pos = i
rawdata = self.rawdata
if shorttagopen.match(rawdata, i):
# SGML shorthand: <tag/data/ == <tag>data</tag>
@ -220,9 +226,11 @@ class SGMLParser:
if not match:
return -1
tag, data = match.group(1, 2)
self.__starttag_text = '<%s/' % tag
tag = string.lower(tag)
self.finish_shorttag(tag, data)
k = match.end(0)
self.finish_shorttag(tag, data)
self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
return k
# XXX The following should skip matching quotes (' or ")
match = endbracket.search(rawdata, i+1)
@ -255,6 +263,7 @@ class SGMLParser:
k = match.end(0)
if rawdata[j] == '>':
j = j+1
self.__starttag_text = rawdata[start_pos:j]
self.finish_starttag(tag, attrs)
return j