ðÕYf?Eã@sdZddlZddlZddlZddlmZdgZejdƒZejdƒZ ejdƒZ ejdƒZ ejd ƒZ ejd ƒZ ejd ƒZejd ƒZejd ƒZejdejƒZejd ƒZejdƒZGdd„dejƒZdS)zA parser for HTML and XHTML.éN)ÚunescapeÚ HTMLParserz[&<]z &[a-zA-Z#]z%&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]z)&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]z <[a-zA-Z]ú>z--\s*>z$([a-zA-Z][^ />]*)(?:\s|/(?!>))*z]((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*aF <[a-zA-Z][^\t\n\r\f />\x00]* # tag name (?:[\s/]* # optional whitespace before attribute name (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) (?:\s*,)* # possibly followed by a comma )?(?:\s|/(?!>))* )* )? \s* # trailing whitespace z#c@sWeZdZdZd:Zdddd„Zdd „Zd d „Zd d „ZdZ dd„Z dd„Z dd„Z dd„Z dd„Zddd„Zdd„Zdd„Zd d!„Zd"d#„Zd$d%„Zd&d'„Zd(d)„Zd*d+„Zd,d-„Zd.d/„Zd0d1„Zd2d3„Zd4d5„Zd6d7„Zd8d9„ZdS);raEFind tags and other markup and call handler functions. Usage: p = HTMLParser() p.feed(data) ... p.close() Start tags are handled by calling self.handle_starttag() or self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data may be split up in arbitrary chunks). If convert_charrefs is True the character references are converted automatically to the corresponding Unicode character (and self.handle_data() is no longer split in chunks), otherwise they are passed by calling self.handle_entityref() or self.handle_charref() with the string containing respectively the named or numeric reference as the argument. ÚscriptÚstyleÚconvert_charrefsTcCs||_|jƒdS)zÆInitialize and reset this instance. If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters. N)rÚreset)Úselfr©r ú0/opt/alt/python35/lib64/python3.5/html/parser.pyÚ__init__Ws zHTMLParser.__init__cCs8d|_d|_t|_d|_tjj|ƒdS)z1Reset this instance. Loses all unprocessed data.Úz???N)ÚrawdataÚlasttagÚinteresting_normalÚ interestingÚ cdata_elemÚ _markupbaseÚ ParserBaser)r r r r r`s     zHTMLParser.resetcCs!|j||_|jdƒdS)z‘Feed data to the parser. Call this as often as you want, with as little or as much text as you want (may include '\n'). rN)rÚgoahead)r Údatar r r ÚfeedhszHTMLParser.feedcCs|jdƒdS)zHandle any buffered data.éN)r)r r r r ÚcloseqszHTMLParser.closeNcCs|jS)z)Return full source of start tag: '<...>'.)Ú_HTMLParser__starttag_text)r r r r Úget_starttag_textwszHTMLParser.get_starttag_textcCs2|jƒ|_tjd|jtjƒ|_dS)Nz )ÚlowerrÚreÚcompileÚIr)r Úelemr r r Úset_cdata_mode{szHTMLParser.set_cdata_modecCst|_d|_dS)N)rrr)r r r r Úclear_cdata_modes zHTMLParser.clear_cdata_modec Cs©|j}d}t|ƒ}xü||kr|jr¬|j r¬|jd|ƒ}|dkræ|jdt||dƒƒ}|dkr£tjdƒj ||ƒ r£P|}n:|j j ||ƒ}|rÖ|j ƒ}n|jràP|}||kr<|jr%|j r%|j t |||…ƒƒn|j |||…ƒ|j||ƒ}||kr[P|j}|d|ƒrtj||ƒr—|j|ƒ} n¯|d|ƒr¸|j|ƒ} nŽ|d|ƒrÙ|j|ƒ} nm|d|ƒrú|j|ƒ} nL|d |ƒr|j|ƒ} n+|d |krE|j dƒ|d } nP| dkrþ|sYP|jd |d ƒ} | dkrª|jd|d ƒ} | dkr´|d } n | d 7} |jrç|j rç|j t ||| …ƒƒn|j ||| …ƒ|j|| ƒ}q|d |ƒrétj||ƒ}|rž|jƒd d…} |j| ƒ|jƒ} |d| d ƒs†| d } |j|| ƒ}qqd||d…krå|j |||d …ƒ|j||d ƒ}Pq|d|ƒrtj||ƒ}|rj|jd ƒ} |j| ƒ|jƒ} |d| d ƒsU| d } |j|| ƒ}qtj||ƒ}|rÜ|rØ|jƒ||d…krØ|jƒ} | |krÂ|} |j||d ƒ}Pq|d |kr|j dƒ|j||d ƒ}qPqqW|r’||kr’|j r’|jri|j ri|j t |||…ƒƒn|j |||…ƒ|j||ƒ}||d…|_dS)Nrú<ú&é"z[\s;]zÚ r(r(r()rrP)rÚcheck_for_whole_start_tagrÚtagfind_tolerantr2r;r9rrÚattrfind_tolerantrÚappendÚstripZgetposÚcountr)r+r/ÚendswithÚhandle_startendtagÚhandle_starttagÚCDATA_CONTENT_ELEMENTSr!)r r?ÚendposrÚattrsr2rBÚtagÚmZattrnameÚrestZ attrvaluer;ÚlinenoÚoffsetr r r r3-sP     00    zHTMLParser.parse_starttagcCsñ|j}tj||ƒ}|rá|jƒ}|||d…}|dkrU|dS|dkr©|jd|ƒr{|dS|jd|ƒr‘d S||kr¡|S|dS|dkr¹d S|dkrÉd S||krÙ|S|dStdƒ‚dS) Nrrú/z/>r&r z6abcdefghijklmnopqrstuvwxyz=/ABCDEFGHIJKLMNOPQRSTUVWXYZzwe should not get here!r(r(r()rÚlocatestarttagend_tolerantr2r;r0ÚAssertionError)r r?rr_rAÚnextr r r rR`s.        z$HTMLParser.check_for_whole_start_tagcCss|j}tj||dƒ}|s)dS|jƒ}tj||ƒ}|s|jdk rw|j|||…ƒ|Stj||dƒ}|sÂ|||d…dkrµ|dS|j |ƒS|j dƒj ƒ}|j d|jƒƒ}|j |ƒ|dS|j dƒj ƒ}|jdk rR||jkrR|j|||…ƒ|S|j |j ƒƒ|jƒ|S)Nrr&rEzrr()rÚ endendtagr-r;Ú endtagfindr2rr/rSrHr9rr*Ú handle_endtagr")r r?rr2rIZ namematchZtagnamer r r r r4‚s6     zHTMLParser.parse_endtagcCs!|j||ƒ|j|ƒdS)N)rZri)r r^r]r r r rYªszHTMLParser.handle_startendtagcCsdS)Nr )r r^r]r r r rZ¯szHTMLParser.handle_starttagcCsdS)Nr )r r^r r r ri³szHTMLParser.handle_endtagcCsdS)Nr )r rCr r r r:·szHTMLParser.handle_charrefcCsdS)Nr )r rCr r r r=»szHTMLParser.handle_entityrefcCsdS)Nr )r rr r r r/¿szHTMLParser.handle_datacCsdS)Nr )r rr r r rJÃszHTMLParser.handle_commentcCsdS)Nr )r Zdeclr r r rGÇszHTMLParser.handle_declcCsdS)Nr )r rr r r rMËszHTMLParser.handle_picCsdS)Nr )r rr r r Ú unknown_declÎszHTMLParser.unknown_declcCs tjdtddƒt|ƒS)NzZThe unescape method is deprecated and will be removed in 3.5, use html.unescape() instead.Ú stacklevelr&)ÚwarningsÚwarnÚDeprecationWarningr)r Úsr r r rÒs  zHTMLParser.unescape)rr)Ú__name__Ú __module__Ú __qualname__Ú__doc__r[r rrrrrr!r"rr7rHr6r3rRr4rYrZrir:r=r/rJrGrMrjrr r r r r?s8       z  3 " (          )rsrrlrZhtmlrÚ__all__rrr>r<r8r1rLZ commentcloserSrTÚVERBOSErdrgrhrrr r r r Ús(