a ´iÌRã@sÞdZddlZddlZddlmZdgZe d¡Ze d¡Ze d¡Z e d¡Z e d ¡Z e d ¡Z e d ¡Z e d ¡Ze d ¡Ze d¡Ze dej¡Ze dej¡Ze dej¡Ze d ¡Ze d¡ZGdd„dejƒZdS)zA parser for HTML and XHTML.éN)ÚunescapeÚ HTMLParserz[&<]z &[a-zA-Z#]z%&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]z)&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]z <[a-zA-Z]z z--!?>z-?>z0([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*a{ ( (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name ) ([\t\n\r\f ]*=[\t\n\r\f ]* # value indicator ('[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\t\n\r\f ]* # bare value ) )? (?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space a [a-zA-Z][^\t\n\r\f />]* # tag name [\t\n\r\f /]* # optional whitespace before attribute name (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name (?:[\t\n\r\f ]*=[\t\n\r\f ]* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\t\n\r\f ]* # bare value ) )? [\t\n\r\f /]* # possibly followed by a space )* >? aF <[a-zA-Z][^\t\n\r\f />\x00]* # tag name (?:[\s/]* # optional whitespace before attribute name (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) \s* # possibly followed by a space )?(?:\s|/(?!>))* )* )? \s* # trailing whitespace z#c@seZdZdZdZdZdddœdd„Zd d „Zd d „Zd d„Z dZ dd„Z ddœdd„Z dd„Z d>dd„Zdd„Zdd„Zd?dd„Zd@d d!„Zd"d#„Zd$d%„Zd&d'„Zd(d)„Zd*d+„Zd,d-„Zd.d/„Zd0d1„Zd2d3„Zd4d5„Zd6d7„Zd8d9„Zd:d;„Zd'.)Ú_HTMLParser__starttag_textrr r r Úget_starttag_textszHTMLParser.get_starttag_text©Ú escapablecCsp| ¡|_||_|jdkr(t d¡|_nD|rP|jsPt d|jtjtjB¡|_nt d|jtjtjB¡|_dS)NÚ plaintextz\Zz&|])z])) ÚlowerrrÚreÚcompilerrÚ IGNORECASEÚASCII)r Úelemr"r r r Úset_cdata_mode¡s     ÿ  ÿzHTMLParser.set_cdata_modecCst|_d|_d|_dS)NT)rrrrrr r r Úclear_cdata_mode­szHTMLParser.clear_cdata_modecCs ||_dS)aEnable or disable support of the CDATA sections. If enabled, "<[CDATA[" starts a CDATA section which ends with "]]>". If disabled, "<[CDATA[" starts a bogus comments which ends with ">". This method is not called by default. Its purpose is to be called in custom handle_starttag() and handle_endtag() methods, with value that depends on the adjusted current node. See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state for details. N)r)r Úflagr r r Ú_set_support_cdata²s zHTMLParser._set_support_cdatac CsB|j}d}t|ƒ}||krÚ|jrv|jsv| d|¡}|dkr | dt||dƒ¡}|dkrpt d¡  ||¡spqÚ|}n*|j   ||¡}|r’|  ¡}n|jrœqÚ|}||krÞ|jrÌ|j rÌ|  t|||…ƒ¡n|  |||…¡| ||¡}||kröqÚ|j}|d|ƒrrr<r9r;rrzdS|rX| ||d|…¡|dS)Nr5)r4r1rcrr<r)rr?rN)r r[rfrÚposr r r rbszHTMLParser.parse_bogus_commentcCsd|j}|||d…dks"Jdƒ‚t ||d¡}|s:dS| ¡}| ||d|…¡| ¡}|S)Nr5r3zunexpected call to parse_pi()r<)rÚpicloserBrCrRrW©r r[rrGr]r r r rK‹szHTMLParser.parse_picCs(d|_| |¡}|dkr|S|j}|||…|_g}t ||d¡}|sPJdƒ‚| ¡}| d¡ ¡|_}||kr.t  ||¡}|sŠq.| ddd¡\} } } | s¨d} n\| dd…dkrÌ| dd…ksøn| dd…dkrô| dd…krnn | dd…} | rt | ƒ} |  |  ¡| f¡| ¡}ql|||…  ¡} | d vr¬|  ¡\} }d |jvrˆ| |j d ¡} t|jƒ|j d ¡}n|t|jƒ}| |||…¡|S|  d ¡rÆ| ||¡n^| ||¡||jvsú|jrð|d ksú|d kr |j|ddn||jvr$|j|dd|S)Nrrz#unexpected call to parse_starttag()r5r9ú'r<ú")rú/>Ú rlZnoscriptr#Fr!T)rÚcheck_for_whole_start_tagrÚtagfind_tolerantrGrWrUr$rÚattrfind_tolerantrÚappendÚstripZgetposÚcountr>r@rDrOÚhandle_startendtagÚhandle_starttagÚCDATA_CONTENT_ELEMENTSrr*ÚRCDATA_CONTENT_ELEMENTS)r r[ÚendposrÚattrsrGr^ÚtagÚmÚattrnameÚrestZ attrvaluerWÚlinenoÚoffsetr r r rH—sj    & ÿ ÿ       ÿ   ÿÿþ zHTMLParser.parse_starttagcCs>|j}t ||d¡}|sJ‚| ¡}||ddkr:dS|S)Nrrr<)rÚ locatetagendrGrWrir r r rnÐsz$HTMLParser.check_for_whole_start_tagcCsà|j}|||d…dks"Jdƒ‚| d|d¡dkr:dSt ||¡sp||d|d…dkrf|dS| |¡St ||d¡}|sˆJ‚| ¡}||ddkr¤dSt ||d¡}|s¼J‚| d¡  ¡}|  |¡|  ¡|S) Nr5r1zunexpected call to parse_endtagrrr<r9r) rr?rMrGrbr€rWrorUr$Ú handle_endtagr+)r r[rrGr]rzr r r rIÚs&   zHTMLParser.parse_endtagcCs| ||¡| |¡dS©N)rur©r rzryr r r rtøs zHTMLParser.handle_startendtagcCsdSr‚r rƒr r r ruýszHTMLParser.handle_starttagcCsdSr‚r )r rzr r r rszHTMLParser.handle_endtagcCsdSr‚r ©r r`r r r rVszHTMLParser.handle_charrefcCsdSr‚r r„r r r rY szHTMLParser.handle_entityrefcCsdSr‚r rr r r rD szHTMLParser.handle_datacCsdSr‚r rr r r rNszHTMLParser.handle_commentcCsdSr‚r )r Zdeclr r r rQszHTMLParser.handle_declcCsdSr‚r rr r r rRszHTMLParser.handle_picCsdSr‚r rr r r rPszHTMLParser.unknown_decl)T)T)r)!Ú__name__Ú __module__Ú __qualname__Ú__doc__rvrwr r rrrr r*r+r-rrLrJrbrKrHrnrIrtrurrVrYrDrNrQrRrPr r r r rZs>     "   9 )rˆr%rZhtmlrÚ__all__r&rrZrXrTrFrMrhrdreroÚVERBOSErpr€Zlocatestarttagend_tolerantZ endendtagZ endtagfindrrr r r r Ús4            õ  óò