a !q ^a@sdZddlZddlZddlZddlmZddlm Z ddl m Z m Z m Z mZmZddlmZddlmZmZddl mZdd lmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#d Z$Gd d d ej%j&Z'de(e(e e)e(dddZ*dS)z>html2text: Turn HTML into equivalent Markdown-structured text.N)wrap)DictListOptionalTupleUnion)config) AnchorElement ListElement) OutCallback) dumb_css_parser element_style escape_mdescape_md_sectiongoogle_fixed_width_fontgoogle_has_heightgoogle_list_stylegoogle_text_emphasishnlist_numbering_startpad_tables_in_textskipwrap unifiable_n)ircseZdZddejfeeeeddfdd Z eddfdd Z eedd d Z edd d d Z edddZ eddddZeddddZeeeeeefddddZeddddZeeeefeedddZeeeefeeefddd d!Zeeeeefedd"d#d$Zddd%d&Zddd'd(Zddd)d*Zd=eeeeefdd,d-d.Zd>eedd/d0d1Zeed2d3d4Zeedd5d6Z eeefed7d8d9Z!eed:d;d<Z"Z#S)? HTML2TextN)outbaseurl bodywidthreturncs tjddd|_d|_d|_tj|_tj|_ tj |_ ||_ tj |_tj|_tj|_tj|_tj|_tj|_tj|_tj|_tj|_tj|_ tj!|_"tj#|_$d|_%d|_&d|_'d|_(tj)|_*tj+|_,d|_-tj.|_/tj0|_1tj2|_3tj4|_5tj6|_7d|_8tj9|_:tj;|_<|dur|j=|_>n||_>g|_?d|_@d|_Ad|_Bd|_Cd|_Dg|_Eg|_Fd|_Gd|_HtIJd |_Kd|_Lg|_Md|_Nd|_Od|_Pd|_Qd|_Rd |_Sd|_Td|_Ud|_Vi|_Wg|_Xd|_Yd|_Zd|_[d|_\d|_]i|_^||__d|_`d|_ad |_bd |_cd tjdd <dS) z Input parameters: out: possible custom replacement for self.outtextf (which appends lines of text). baseurl: base URL of the document we process F)Zconvert_charrefsr*_z**NTz^[a-zA-Z+]+://r _place_holder;nbsp)esuper__init__ split_next_tdtd_count table_startr Z UNICODE_SNOB unicode_snobZ ESCAPE_SNOB escape_snobZLINKS_EACH_PARAGRAPHlinks_each_paragraph body_widthZSKIP_INTERNAL_LINKSskip_internal_linksZ INLINE_LINKS inline_linksZ PROTECT_LINKS protect_linksZGOOGLE_LIST_INDENTgoogle_list_indentZIGNORE_ANCHORS ignore_linksZ IGNORE_IMAGES ignore_imagesZIMAGES_AS_HTMLimages_as_htmlZ IMAGES_TO_ALT images_to_altZIMAGES_WITH_SIZEimages_with_sizeZIGNORE_EMPHASISignore_emphasisZ BYPASS_TABLES bypass_tablesZ IGNORE_TABLES ignore_tables google_doc ul_item_mark emphasis_mark strong_markZSINGLE_LINE_BREAKsingle_line_breakZUSE_AUTOMATIC_LINKSuse_automatic_linkshide_strikethroughZ MARK_CODE mark_codeZWRAP_LIST_ITEMSwrap_list_itemsZ WRAP_LINKS wrap_linksZ PAD_TABLES pad_tablesZDEFAULT_IMAGE_ALTdefault_image_alt tag_callbackZ OPEN_QUOTE open_quoteZ CLOSE_QUOTE close_quoteouttextfr outtextlistquietp_poutcountstartspaceaastackmaybe_automatic_link empty_linkrecompileabsolute_url_matcheracountlist blockquoteprestartprecodequote br_toggle lastWasNL lastWasListstyle style_def tag_stackemphasisdrop_white_spaceinheader abbr_title abbr_data abbr_listrstressedpreceding_stressedpreceding_data current_tag UNIFIABLE)selfrrr __class__6/usr/lib/python3.9/site-packages/html2text/__init__.pyr&%s    zHTML2Text.__init__)datar cs|dd}t|dS)Nzz )replacer%feed)rortrprrrsrvs zHTML2Text.feedcCs8|||d||}|jr0t|S|SdS)Nr)rvoptwrapfinishrDr)rortZmarkdownrrrrrshandles   zHTML2Text.handle)sr cCs"|j||r|ddk|_dS)N )rJappendr_)rorzrrrrrsrIs zHTML2Text.outtextf)r cCsX|||jdddd|j}|jr>tjjd}nd}| d|}g|_|S)Nrendforceznbsp; r#) closepbrojoinrJr*htmlentitieshtml5ru)roZouttextr$rrrrrsrxs  zHTML2Text.finish)cr cCs|||ddSNT) handle_datacharref)rorrrrrrshandle_charrefszHTML2Text.handle_charrefcCs||}|r||ddSr) entityrefr)rorrefrrrrrshandle_entityrefs zHTML2Text.handle_entityref)tagattrsr cCs|j|t|dddS)NTrN) handle_tagdict)rorrrrrrrshandle_starttagszHTML2Text.handle_starttag)rr cCs|j|idddS)NFr)r)rorrrrrrs handle_endtagszHTML2Text.handle_endtag)rr cCsd|vr dSd}t|jD]p\}}d|jvr~|jd|dkr~d|jvsPd|vrzd|jvr~d|vr~|jd|dkr~d}nd}|r|SqdS)z :type attrs: dict :returns: The index of certain set of attributes (of a link) in the self.a list. If the set of attributes is not found, returns None :rtype: int hrefNFtitleT) enumeraterPr)rormatchirPrrrrrs previousIndexs" zHTML2Text.previousIndex)rN tag_style parent_styler c Cst|}t|}d|vo|j}d}tjD]}||vo:||v}|r(qFq(d|voTd|v} t|ont| on|j } |r|s| s| r|jd7_|r|jd7_| r||j |j d7_ |r||j |j d7_ | r|d|j d7_ d|_ n|s| s| r*|jd8_d|_ | rX|j rH|j d8_ n |dd|_ |r|j rv|j d8_ n ||j | r|j r|j d8_ n ||j |s| r|js|d|r|jd8_dS) z/ Handles various text emphases z line-throughFitalicr`TrN)rr@r ZBOLD_TEXT_STYLE_VALUESrrZrdrKrr<rer=r\rO) rorNrrZ tag_emphasisZparent_emphasisZ strikethroughZboldZ bold_markerrfixedrrrrrshandle_emphasissb         zHTML2Text.handle_emphasis)rrrNr cCs ||_|jdur(|||||dur(dS|rb|jdurb|dvrb|dksL|jrb|dd|_d|_|jri}|r|jr|jdd}t||j |}|j |||fn4|jr|j ndiif\}}}|jr|jdd}t |r| |rd|_|t |dd n d|_dS|d vrl|jrP|rFt|rF| n|n|jrd|d krdn| |d kr|r|jd kr|dn |d|dkr|r| |d| |dvr|r|jd7_n|jd8_|dkr$|r|jd7_n|jd8_|dvr4d |_|dkr|rp| |jdddd|_|jd7_n|jd8_| ttddd}|dvr|js|r||rd |j}n|j}|||rd|_|dvr*|js*|r||rd |j} n|j} || |r*d|_|dvrd|rJ||rJd} nd } || |rdd|_|jr|js|||||d!vr|js|d"|j |_|d#kr|rd|_d$|_ d%|vr|d%|_n6|jdur |j dusJ|j|j!|j <d|_d|_ |d&krF|j"s0||j#n ||j$|j" |_"dVtt%t%dd'd(d)} |d*kr|j&s|rd+|vr|d+dur|j'r|d+(ds|j ||d+|_d|_|j)rd,|d+d-|d+<n |j dn|jr|j } |jr|jsd|_n| r| d+dus.J|jrL|dd|_d|_|j*r~| +d%pbd$} t,| } | || d+| nb|-| }|dur|j.|}n*|j/d7_/t0| |j/|j1}|j. ||d.t%|j2d/|dkr4|r4|js4d0|vr4|d0dusJ|j3s(|d0|d+<|+d1p8|j4}|j5s^|j6rd2|vs^d3|vr|d4|d0d5d2|vr|d2dusJ|d6|d2d5d3|vr|d3dusJ|d7|d3d5|r|d8|d5|d9dS|jdurj|j}|j3rTt,||krT|j78|rT|d,t,|d-d|_dS|dd|_d|_|j3r|t,|n|d:t,|d/|j*r|+d+pd$}|d;t,t9:|j;|d<nb|-|}|dur|j.|}n*|j/d7_/t0||j/|j1}|j. ||dt%|j2d/|d=krL|rL| |d>krd|sd|<|d?kr~|r~|d@|d?kr|s|<|dAv r$|j=s|j>s| |r|jrt?|}n|}t@|}|j= tA||n,|j= r|j= |j s|j= s|dBd|_>nd|_>|dCk r|<| r|j= rV|j=d}n tAdDd }|j rt|B|}n tC|j=}|dE||jDdDk r||jEd n.|jDdFk r|jFd7_F|t%|jFdGd|_|dHv r|jG r|dIk r| rn|nn|jH r| r&||dJv rZ| rH|dKI|n|dLI|n(| rr|dMI|n|dNI|n|dOk r| rd|_J|jK r|d,tLjMd-|dn&|jK r|dPtLjMd-|d|dJv r| r|jN r |dQd|_N|dIk r(| r(d |_O|dIk rF| sFd|_N||dIk r| s|jJ r|dRPdSg|jO|d|_J|dJv r| r|jOd7_O|dTk r| rd|_Qd|_nd|_|jR r|SdU| dS)WNT)pdivradldtZimg[Fr{#r)rrrbrrz >  hrz* * *)headraZscriptrra)bodyrY> r)ror cSst|jotd|jdS)Nz[^\s]r{)boolrlrTrrorrrrrsno_preceding_spacesz0HTML2Text.handle_tag..no_preceding_space)Zemru)strongb)delstrikerzz ~~z~~)Zkbdr\ttrabbrrrq)rolinkrr cSs@t|j|}|r d|nd}|djt||ddS)Nz "{}"rz]({url}{title}))urlr)urlparseurljoinrstripformatrr)rorrrrrrrrslink_urlsz&HTML2Text.handle_tag..link_urlrPr<>z][]srcaltwidthZheightz rLrrrrrrsrsz HTML2Text.pcCs|d|_dS)z Soft breaksrN)rr^rrrrrrsrszHTML2Text.soft_brF)rtpuredatarr c Cs2|jdur|j|7_|js.|jrR|}|jrD|jsD|jsD|}|dkrRd|_|r|jstdd|}|r|ddkrd|_ |dd}|s|sdS|j r| ds| d sd|}|j r| d d|_d |j}|r|r|dd ks|jr|d7}|jr6|js|d 7}|d t|j7}|dd|}|j rVd |_ |jrV|d}|jrpd |_ d|_d |_|dkrd|_| dd |_ |jr| |jd||jd |_ d|_|j r|js| dd |_ |jr|jdkr|js|dkr|dkr| dg}|jD]}|j|jkr| dt|jdt|j|jdd|jvr|jddusJ| d|jdd| dn ||q"|j|kr| d||_|j r|dkr|j !D]$\}} | d|d| dqd|_| ||jd7_dS)z6 Deal with indentation and whitespace Nrrz\s+rTrr|z z [code]rrFr~rz [z]: rrz (rz *[)"rhrKr:lstriprerZr\rTsubrOr[rrArrLrYrXrrurNr^r_rPr,rMrrrrrrr}riitems) rortrrZlstripped_dataZbqZnewarrZ definitionrrrrrsrs                 z HTML2Text.o)rt entity_charr cCs|sdS|jr$|}d|_d|_n8|jr\td|drVt|jsV|jdvrVd|}d|_|jrr|j t ||j dur|j }||kr|j |r|j r|d|dd|_dS|d d|_ d|_|js|js|st||jd }||_|j|dd dS) NFTz[^\s.!?]r)rPr\rZrrrr)Zsnob)r)rjrrkrTrrrmrarbupdater rRrVr?rrSr\rZrr+rl)rortrrrrrrrsr!sF   zHTML2Text.handle_data)rr cCsb|ddvr t|ddd}nt|}|js>|tvr>t|Sz t|WSty\YdS0dS)Nr)xXrrr)intr*rchr ValueError)rorrrrrrrsrLs   zHTML2Text.charrefcCsd|js|tjvrtj|Sztjj|d}WntyLd|dYS0|dkr`tj|S|S)N;&r$)r*r rnrrrKeyError)rorZchrrrrrsrZs  zHTML2Text.entityref)rar cCs*d}d|vr&t|ddd|j}|S)zq Calculate the nesting count of google doc lists :type style: dict :rtype: int rz margin-leftN)rr1)rorarrrrrrsrcszHTML2Text.google_nest_count)textr cCs|js |Sd}d}|jsd|_|dD]}t|dkrt||j|jsd}|d|jrbd}n|drpd}t ||jd|d}|d |7}| dr|d 7}d }q|r|d7}d }q|d 7}d }qt j |s||d7}d }q(|d kr(|d7}|d 7}q(|S) zi Wrap all paragraphs in the provided text. :type text: str :rtype: str rrFr|rrr)Zbreak_long_wordssubsequent_indentrrz r)r-rCr/splitrrrBrr;rrendswithr ZRE_SPACEr)rorresultnewlinesZparaindentwrappedrrrrrsrwqsH      zHTML2Text.optwrap)FF)F)$__name__ __module__ __qualname__r BODY_WIDTHrr rrr&rvryrIrxrrrrrrrrrrrrrrrrrrrrrw __classcell__rrrrrprsr$sNd  "  J { q+ rr)rrrr cCs$|durtj}t||d}||S)N)rr)r rrry)rrrhrrrrrs html2texts r)rN)+__doc__Z html.entitiesrZ html.parserrT urllib.parseparsertextwraprtypingrrrrrrr elementsr r r Zutilsr rrrrrrrrrrrr __version__parserZ HTMLParserrrrrrrrrrrrss(    <