3 ]@sdZddlZddlZddlZddljZddlm Z ddl m Z ddl m Z mZmZmZmZmZmZmZmZmZmZmZmZdZGd d d ejjZdd d Z dS)z>html2text: Turn HTML into equivalent Markdown-structured text.N)wrap)config) dumb_css_parser element_style escape_mdescape_md_sectiongoogle_fixed_width_fontgoogle_has_heightgoogle_list_stylegoogle_text_emphasishnlist_numbering_startpad_tables_in_textskipwrap unifiable_n cseZdZddejffdd ZfddZddZd d Zfd d Z d dZ ddZ ddZ ddZ ddZddZddZddZddZdd Zd.d"d#Zd/d$d%Zd&d'Zd(d)Zd*d+Zd,d-ZZS)0 HTML2TextNcs tjddd|_d|_d|_tj|_tj|_ tj |_ ||_ tj |_tj|_tj|_tj|_tj|_tj|_tj|_tj|_tj|_tj|_ tj!|_"tj#|_$d|_%d|_&d|_'d|_(tj)|_*tj+|_,d|_-tj.|_/tj0|_1tj2|_3tj4|_5tj6|_7d|_8tj9|_:tj;|_<|dkr|j=|_>n||_>g|_?d|_@d|_Ad|_Bd|_Cd|_Dg|_Eg|_Fd|_Gd|_HtIjJd |_Kd|_Lg|_Md|_Nd|_Od|_Pd|_Qd|_Rd |_Sd|_Td|_Ud|_Vi|_Wg|_Xd|_Yd|_Zd|_[d|_\d|_]i|_^||__d|_`d|_ad|_bd|_cd tjdd <dS) z Input parameters: out: possible custom replacement for self.outtextf (which appends lines of text). baseurl: base URL of the document we process F)Zconvert_charrefsr*_z**NTz^[a-zA-Z+]+://rz _place_holder;nbsp)esuper__init__ split_next_tdtd_count table_startrZ UNICODE_SNOB unicode_snobZ ESCAPE_SNOB escape_snobZLINKS_EACH_PARAGRAPHlinks_each_paragraph body_widthZSKIP_INTERNAL_LINKSskip_internal_linksZ INLINE_LINKS inline_linksZ PROTECT_LINKS protect_linksZGOOGLE_LIST_INDENTgoogle_list_indentZIGNORE_ANCHORS ignore_linksZ IGNORE_IMAGES ignore_imagesZIMAGES_AS_HTMLimages_as_htmlZ IMAGES_TO_ALT images_to_altZIMAGES_WITH_SIZEimages_with_sizeZIGNORE_EMPHASISignore_emphasisZ BYPASS_TABLES bypass_tablesZ IGNORE_TABLES ignore_tables google_doc ul_item_mark emphasis_mark strong_markZSINGLE_LINE_BREAKsingle_line_breakZUSE_AUTOMATIC_LINKSuse_automatic_linkshide_strikethroughZ MARK_CODE mark_codeZWRAP_LIST_ITEMSwrap_list_itemsZ WRAP_LINKS wrap_linksZ PAD_TABLES pad_tablesZDEFAULT_IMAGE_ALTdefault_image_alt tag_callbackZ OPEN_QUOTE open_quoteZ CLOSE_QUOTE close_quoteouttextfout outtextlistquietp_poutcountstartspaceaastackmaybe_automatic_link empty_linkrecompileabsolute_url_matcheracountlist blockquoteprestartprecodequote br_toggle lastWasNL lastWasListstyle style_def tag_stackemphasisdrop_white_spaceinheader abbr_title abbr_data abbr_listbaseurlstressedpreceding_stressedpreceding_data current_tag UNIFIABLE)selfr>r_ bodywidth) __class__/usr/lib/python3.6/__init__.pyr"s   zHTML2Text.__init__cs|jdd}tj|dS)Nzz )replacerfeed)redata)rgrhrirk|s zHTML2Text.feedcCs8|j||jd|j|j}|jr0t|S|SdS)Nr)rkoptwrapcloser8r)rerlZmarkdownrhrhrihandles   zHTML2Text.handlecCs"|jj||r|ddk|_dS)N )r?appendrT)resrhrhrir=s zHTML2Text.outtextfcsZtj|j|jddddj|j}|jr@tjj d}nd}|j d|}g|_|S)Nrend)forceznbsp; z _place_holder;) rrnpbrojoinr?rhtmlentitieshtml5rj)reZouttextr)rgrhrirns   zHTML2Text.closecCs|j|j|ddS)NT) handle_datacharref)recrhrhrihandle_charrefszHTML2Text.handle_charrefcCs|j|}|r|j|ddS)NT) entityrefr~)rerrefrhrhrihandle_entityrefs zHTML2Text.handle_entityrefcCs|j||dddS)NT)rC) handle_tag)retagattrsrhrhrihandle_starttagszHTML2Text.handle_starttagcCs|j|ddddS)NF)rC)r)rerrhrhri handle_endtagszHTML2Text.handle_endtagcCsd|kr dSd}xpt|jD]b\}}d|krv|d|dkrvd|ksLd|krrd|krvd|krv|d|dkrvd}nd}|r|SqWdS)z :type attrs: dict :returns: The index of certain set of attributes (of a link) in the self.a list. If the set of attributes is not found, returns None :rtype: int hrefNFtitleT) enumeraterE)rermatchirErhrhri previousIndexszHTML2Text.previousIndexc Cst|}t|}d|ko|j}d}x$tjD]}||ko<||k}|r*Pq*Wd|koVd|k} t|opt| op|j } |r|s| s| r|jd7_|r|jd7_| r|j|j |j d7_ |r|j|j |j d7_ | r|jd|j d7_ d|_ n|s| s| r,|jd8_d|_ | rZ|j rJ|j d8_ n |jdd|_ |r|j rx|j d8_ n |j|j | r|j r|j d8_ n |j|j |s| r|j r|jd|r|jd8_dS) z/ Handles various text emphases z line-throughFitalicrp`TrwN)r r4rZBOLD_TEXT_STYLE_VALUESrrOrYr@ryr0rZr1rQrD) rerC tag_style parent_styleZ tag_emphasisZparent_emphasisZ strikethroughZboldZ bold_markerrZfixedrhrhrihandle_emphasiss^          zHTML2Text.handle_emphasiscCs ||_|dkri}nt|}|jdk r>|j||||dkr>dS|rx|jdk rx|d^krx|dksb|jrx|jdd|_d |_|jri}|r|jr|jd_d }t ||j |}|jj |||fn4|jr|jj ndiif\}}}|jr|jd`d }t |r0|j|r&d|_|jt |d d n d |_dS|dakr|jrf|r\t|r\|jn|jn|jrz|dkrzn|j|dkr|r|jdkr|jdn |jd|dkr|r|j|jd|j|dbkr |r|jd 7_n|jd 8_|dkr:|r,|jd 7_n|jd 8_|dckrJd|_|dkr|r|j|jdddd|_|jd 7_n|jd 8_|jdd}|ddkr|j r|r||rd |j}n|j}|j||rd|_|dekr<|j r<|r ||r d |j} n|j} |j| |r|d8|d1<|j1d9pN|j2}|j3st|j4rd:|kstd;|kr|jd<|d8d=d:|kr|jd>|d:d=d;|kr|jd?|d;d=|r|jd@|d=|jdAdS|jdk r\|j}|j0rFt)||krF|j5j6|rF|jd2t)|d3d |_dS|jdd|_d |_|j0rt|jt)|n|jdBt)|d7|j(r|j1d1pd+}|jdCt)t7j8|j9|dDnh|j+|}|dk r|j,|}n.|j-d 7_-|j-|d4<|j.|d5<|j,j ||jdt/|d4d7|dkrD|rD|j|dkr^| r^|j:|dEkrx|rx|jdF|dEkr| r|j:|dik r(|j; r|j< r|j|r|jrt=|}n|}t>|}|j;j ||dIn0|j; r |j;j |j r |j; r |jdJd|_ z Zhrz* * *headscriptbodyrNz> )rvcSs|jotjd|jdS)Nz[^\s]rprr)rbrIr)rerhrhrino_preceding_space|sz0HTML2Text.handle_tag..no_preceding_spaceemrustrongbdelstrikertz ~~z~~kbdrQttrabbrrrqcSs@tj|j|}|jr dj|nd}|jdjt||ddS)Nz "{}"rz]({url}{title}))urlr)urlparseurljoinr_stripformatryr)relinkrrrhrhrilink_urlsz&HTML2Text.handle_tag..link_urlrEr<>countrBz][]srcaltwidthZheightz )rerrrCrrZdummyrrYrrrrErrrrZ list_styleZnumbering_startr nest_countrhrhrirs$                                                                             zHTML2Text.handle_tagcCs|jdkrd|_dS)zPretty print has a line breakrrpN)rA)rerhrhrirxs z HTML2Text.pbrcCs|jr dnd|_dS)z Set pretty print to 1 or 2 linesrprN)r2rA)rerhrhrirsz HTML2Text.pcCs|jd|_dS)z Soft breaksz N)rxrS)rerhrhrirszHTML2Text.soft_brFc CsF|jdk r|j|7_|jsB|jrT|j}|jrF|jp>|j rF|}|dkrTd|_|r|j rtjdd|}|r|ddkrd|_ |dd}| r| rdS|j r|j d r|j d  rd|}|j r|j d d|_d |j}|o|o|dd k r|jr|d7}|jrX|js&|d 7}x tt|jD]}|d 7}q6W|jdd|}|j rxd |_ |jrx|jd}|jrd |_ d|_d |_|dkrd|_|j dd |_ |jr|j |jd||jd |_ d|_|j r|js|j dd |_ |jr|jdkr|js&|dkr|dkr:|j dg}x|jD]x}|j|dkr|j dt|ddtj|j|dd|kr|j d|dd|j dn |j|qFW|j|kr|j d||_|jr$|dkr$x2|jj D]$\} } |j d| d| dqWd|_|j ||jd7_dS)z6 Deal with indentation and whitespace Nrrz\s+rwTrprqz z [code]rz FrurrBz [rz]: rrz (rz *[)!r]r@r.lstriprZrOrQrIsubrDrPrr5r>rArNrMrangerrjrCrSrTrEr rBrrrr_rsr^items) rerlpuredatarvZlstripped_dataZbqrZnewarrZ definitionrhrhrirys              (      z HTML2Text.ocCs |sdS|jr$|j}d|_d|_n:|jr^tjd|drXt|j rX|jdkrXd|}d|_|jrt|jj t ||j dk r|j }||kr|j j|r|j r|jd |d d|_dS|jd d|_ d|_|j r|j r| rt||jd }||_|j|dd dS)NFTz[^\s.!?]rrErQrOrwrrr)Zsnob)r)rErQrO)r`rrarIrr rcrVrWupdaterrGrKr3ryrHrQrOrrrb)rerlZ entity_charrrhrhrir~s:     zHTML2Text.handle_datac Csb|ddkr t|ddd}nt|}|j r@|tkr@t|Syt|Stk r\dSXdS)NrxXrpr)rr)intrrchr ValueError)rerrrhrhrir=s zHTML2Text.charrefc Csd|j r|tjkrtj|Sytjj|d}Wntk rLd|dSX|dkr`tj|S|S)N;&r)rrrdr{r|r}r)rerZchrhrhrirKs zHTML2Text.entityrefcCs*d}d|kr&t|ddd|j}|S)zq Calculate the nesting count of google doc lists :type style: dict :rtype: int rz margin-leftNr)rr%)rerVrrhrhrirTszHTML2Text.google_nest_countcCs|js |Sd}d}|jsd|_x|jdD]}t|dkrt||j|jsd}|jd|jrdd}n|jdrrd}t ||jd|d}|dj |7}|j dr|d 7}d }q|r|d7}d }q|d 7}d }nt j j|s||d7}d }q*|d kr*|d7}|d 7}q*W|S) zi Wrap all paragraphs in the provided text. :type text: str :rtype: str rrFrqz z z> )Zbreak_long_wordsZsubsequent_indentz rpz r)r!r7r#splitrrr6rr/rrzendswithrZRE_SPACEr)retextresultnewlinesZparaindentwrappedrhrhrirmbsF      zHTML2Text.optwrap)FF)F)__name__ __module__ __qualname__r BODY_WIDTHrrkror=rnrrrrrrrrxrrryr~rrrrm __classcell__rhrh)rgrir!s.Z    G} o + rrcCs$|dkrtj}t||d}|j|S)N)r_rf)rrrro)r{r_rfhrhrhri html2texts r)rrr)rN)__doc__Z html.entitiesr{Z html.parserrIZ urllib.parseparsertextwraprrrZhtml2text.utilsrrrrrr r r r r rrr __version__parserZ HTMLParserrrhrhrhris"   <