[Old patch that hadn't been checked in.]
get_starttag_text(): New method. Return the text of the most recently parsed start tag, from the '<' to the '>' or '/'. Not really useful for structure processing, but requested for Web-related use. May also be useful for being able to re-generate the input from the parse events, but there's no equivalent for end tags.

attrfind: Be a little more forgiving of unquoted attribute values.
This commit is contained in:
parent
8094611eb8
commit
b46696c0ed
|
@ -37,7 +37,7 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9]*')
|
|||
attrfind = re.compile(
|
||||
'[%s]*([a-zA-Z_][-.a-zA-Z_0-9]*)' % string.whitespace
|
||||
+ ('([%s]*=[%s]*' % (string.whitespace, string.whitespace))
|
||||
+ r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!\(\)_#=~]*))?')
|
||||
+ r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!&$\(\)_#=~]*))?')
|
||||
|
||||
|
||||
# SGML parser base class -- find tags and call handler functions.
|
||||
|
@ -207,9 +207,15 @@ class SGMLParser:
|
|||
self.handle_pi(rawdata[i+2: j])
|
||||
j = match.end(0)
|
||||
return j-i
|
||||
|
||||
__starttag_text = None
|
||||
def get_starttag_text(self):
|
||||
return self.__starttag_text
|
||||
|
||||
# Internal -- handle starttag, return length or -1 if not terminated
|
||||
def parse_starttag(self, i):
|
||||
self.__starttag_text = None
|
||||
start_pos = i
|
||||
rawdata = self.rawdata
|
||||
if shorttagopen.match(rawdata, i):
|
||||
# SGML shorthand: <tag/data/ == <tag>data</tag>
|
||||
|
@ -220,9 +226,11 @@ class SGMLParser:
|
|||
if not match:
|
||||
return -1
|
||||
tag, data = match.group(1, 2)
|
||||
self.__starttag_text = '<%s/' % tag
|
||||
tag = string.lower(tag)
|
||||
self.finish_shorttag(tag, data)
|
||||
k = match.end(0)
|
||||
self.finish_shorttag(tag, data)
|
||||
self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
|
||||
return k
|
||||
# XXX The following should skip matching quotes (' or ")
|
||||
match = endbracket.search(rawdata, i+1)
|
||||
|
@ -255,6 +263,7 @@ class SGMLParser:
|
|||
k = match.end(0)
|
||||
if rawdata[j] == '>':
|
||||
j = j+1
|
||||
self.__starttag_text = rawdata[start_pos:j]
|
||||
self.finish_starttag(tag, attrs)
|
||||
return j
|
||||
|
||||
|
|
Loading…
Reference in New Issue