diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 3020d119811..3ab57c23071 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -29,12 +29,7 @@ starttagopen = re.compile('<[>a-zA-Z]') shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') piclose = re.compile('>') -starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*(' - r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]' - r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?' - r')*\s*/?\s*(?=[<>])') -endtag = re.compile(r'])') +endbracket = re.compile('[<>]') tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') attrfind = re.compile( r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' @@ -254,10 +249,14 @@ class SGMLParser(markupbase.ParserBase): self.finish_shorttag(tag, data) self.__starttag_text = rawdata[start_pos:match.end(1) + 1] return k - match = starttag.match(rawdata, i) + # XXX The following should skip matching quotes (' or ") + # As a shortcut way to exit, this isn't so bad, but shouldn't + # be used to locate the actual end of the start tag since the + # < or > characters may be embedded in an attribute value. + match = endbracket.search(rawdata, i+1) if not match: return -1 - j = match.end(0) + j = match.start(0) # Now parse the data between i+1 and j into a tag and attrs attrs = [] if rawdata[i:i+2] == '<>': @@ -306,10 +305,10 @@ class SGMLParser(markupbase.ParserBase): # Internal -- parse endtag def parse_endtag(self, i): rawdata = self.rawdata - match = endtag.match(rawdata, i) + match = endbracket.search(rawdata, i+1) if not match: return -1 - j = match.end(0) + j = match.start(0) tag = rawdata[i+2:j].strip().lower() if rawdata[j] == '>': j = j+1 diff --git a/Lib/test/sgml_input.html b/Lib/test/sgml_input.html new file mode 100644 index 00000000000..f4d2e6cc805 --- /dev/null +++ b/Lib/test/sgml_input.html @@ -0,0 +1,212 @@ + + + + + + + + +
+ + + + + +
+
+ + + + + +
+ + +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
  MetalCristalDeuterioEnergía  
160.6363.40639.230-80/3.965
+
+
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Flotas (max. 9)
Num.MisiónCantidadComienzoSalidaObjetivoLlegadaOrden
1 + Espionaje + (F) + 3[2:250:6]Wed Aug 9 18:00:02[2:242:5]Wed Aug 9 18:01:02 +
+ + +
+
2 + Espionaje + (V) + 3[2:250:6]Wed Aug 9 17:59:55[2:242:1]Wed Aug 9 18:01:55 +
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Nueva misión: elegir naves
NavesDisponibles--
Nave pequeña de carga10máx
Nave grande de carga19máx
Crucero6máx
Reciclador1máx
Sonda de espionaje139máx
Ninguna naveTodas las naves
+ +

+
+ + diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py index 28a21a466bb..b6986360ac1 100644 --- a/Lib/test/test_sgmllib.py +++ b/Lib/test/test_sgmllib.py @@ -286,21 +286,6 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ('codepoint', 'convert', 42), ]) - def test_attr_values_quoted_markup(self): - """Multi-line and markup in attribute values""" - self.check_events("""text""", - [("starttag", "a", [("title", "foo\n
bar")]), - ("data", "text"), - ("endtag", "a")]) - self.check_events("""text""", - [("starttag", "a", [("title", "less < than")]), - ("data", "text"), - ("endtag", "a")]) - self.check_events("""text""", - [("starttag", "a", [("title", "greater > than")]), - ("data", "text"), - ("endtag", "a")]) - def test_attr_funky_names(self): self.check_events("""""", [ ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), @@ -376,6 +361,19 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ('decl', 'DOCTYPE doc []'), ]) + def test_read_chunks(self): + # SF bug #1541697, this caused sgml parser to hang + # Just verify this code doesn't cause a hang. + CHUNK = 1024 # increasing this to 8212 makes the problem go away + + f = open(test_support.findfile('sgml_input.html')) + fp = sgmllib.SGMLParser() + while 1: + data = f.read(CHUNK) + fp.feed(data) + if len(data) != CHUNK: + break + # XXX These tests have been disabled by prefixing their names with # an underscore. The first two exercise outstanding bugs in the # sgmllib module, and the third exhibits questionable behavior diff --git a/Misc/NEWS b/Misc/NEWS index b2f48f81e83..1feda610a34 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -49,6 +49,8 @@ Core and builtins Library ------- +- Reverted patch #1504333 because it introduced an infinite loop. + - Patch #1553314: Fix the inspect.py slowdown that was hurting IPython & SAGE by adding smarter caching in inspect.getmodule().