o
    Dh                     @   s  d Z ddlZddlmZmZ ddlmZmZmZ ddl	m
Z
mZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$ e%e&Z'da(dZ)dedefddZ*dededede+de,dedeee+e,f fddZ-dee+ fddZ.dedee+ defd d!Z/ded"e+d#e+defd$d%Z0dededeee+e,f fd&d'Z1dededeee+e,f fd(d)Z2dS )*z.
Functions grounding on third-party software.
    N)AnyTuple)ParagraphMakerclassify_paragraphsrevise_paragraph_classification)get_stoplistget_stoplists)_ElementElement
strip_tagstostring)HtmlElement   )basic_cleaning)convert_tagsprune_unwanted_nodestree_cleaning)Document)JUSTEXT_LANGUAGES)fromstring_bytestrimTEI_VALID_TAGS)OVERALL_DISCARD_XPATHz.//aside|.//audio|.//button|.//fieldset|.//figure|.//footer|.//iframe|.//input|.//label|.//link|.//nav|.//noindex|.//noscript|.//object|.//option|.//select|.//source|.//svg|.//time	htmlinputreturnc              
   C   sh   zt | ddd}t| }|dur|W S t W S  ty3 } ztd| t W  Y d}~S d}~ww )z6Safety net: try with the generic algorithm readability      )min_text_lengthretry_lengthNzreadability_lxml failed: %s)ReadabilityDocumentr   summaryr   	ExceptionLOGGERwarning)r   docr!   err r'   U/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/external.pytry_readability    s   r)   treebackup_treebodytextlen_textoptionsc                 C   s  |j dkr||jd kr|||fS d\}}|j dkrt|t}t|}tt|dddd}	t|	}
t	
d|
| |
d	|fv rCd
}nj|d	krN|
d	krNd}n_|d|
 krWd
}nV|
d| kre|	dsed}nH|dst|
|jd krtd}n9t|dt|dkr|
|jd krd}n!|j dkr|ds|dr|
|krd}nt	
d||
|j d
}|r||	|
}}}t	
d|j nt	
d|j |ts||jk rt	
d|j t| |\}}}t|}|r|d| kst	
d| |||}}}|r|st||\}}}|||fS )zZDecide whether to choose own or external extraction
       based on a series of heuristicsrecall
   )FF	precisionr-   zutf-8)methodencodingz0extracted length: %s (algorithm) %s (extraction)r   FT   {z.//p//text()z.//tablez.//pz.//headz.//h2|.//h3|.//h4zextraction values: %s %s for %szusing generic algorithm: %szusing custom extraction: %sz3unclean document triggering justext examination: %s   zusing justext, length: %s)focusmin_extracted_sizer   r   r)   r   r   decodelenr#   debug
startswithxpathfindallsourceSANITIZED_XPATHjustext_rescueboolsanitize_tree)r*   r+   r,   r-   r.   r/   use_readability	jt_resulttemppost_algo	algo_textlen_algobody2text2	len_text2r'   r'   r(   compare_extraction-   sN   


*&
rM   c                  C   s,   t  } t D ]	}| t| qt| atS )z8Retrieve and return the content of all JusText stoplists)setr   updater   tupleJT_STOPLIST)stoplistlanguager'   r'   r(   jt_stoplist_inito   s
   
rT   rR   c              	   C   s.   t | }t||dddddd t|d |S )z(Customized version of JusText processing2      g?g?g      ?T)r   make_paragraphsr   r   )r*   rR   
paragraphsr'   r'   r(   custom_justexty   s   

rY   urltarget_languagec           	   
   C   s   t d}|tv rtt| }ntpt }zt| |}W n ty5 } ztd|| W Y d}~|S d}~ww |D ]}|j	r>q8t d|j
}|_
|| q8|S )z9Second safety net: try with the generic algorithm justextr,   zjustext %s %sNp)r
   r   r   rQ   rT   rY   r"   r#   erroris_boilerplater-   append)	r*   rZ   r[   result_bodyjustext_stoplistrX   r&   	paragraphelemr'   r'   r(   try_justext   s"   
rd   c                 C   s8   t | } t| |j|j}td| }||t|fS )z1Try to use justext algorithm as a second fallback )r   rd   rZ   langr   joinitertextr;   )r*   r/   rG   	temp_textr'   r'   r(   rB      s   rB   c                 C   s   t | |}|jdu rt|d t|d t||}|dddD ]}|jdkr+d|_q |jdv r>|jdkr;|d	d
 d|_q dd dd t|dD D }t|g|R   td|	 }||t
|fS )zLConvert and sanitize the output from the generic algorithm (post-processing)Faspantdthtrrow)rl   rm   roleheadcellc                 S   s   g | ]}|t vr|qS r'   r   ).0tagnamer'   r'   r(   
<listcomp>   s
    z!sanitize_tree.<locals>.<listcomp>c                 S   s   g | ]}|j qS r'   )tag)rs   elementr'   r'   r(   ru      s    *re   )r   linksr   r   iterrv   rN   r   rg   rh   r;   )r*   r/   cleaned_treerc   sanitization_listr-   r'   r'   r(   rD      s&   







rD   )3__doc__loggingtypingr   r   justext.corer   r   r   justext.utilsr   r   
lxml.etreer	   r
   r   r   	lxml.htmlr   baseliner   htmlprocessingr   r   r   readability_lxmlr   r    settingsr   utilsr   r   xmlr   xpathsr   	getLogger__name__r#   rQ   rA   r)   strintrM   rT   rY   rd   rB   rD   r'   r'   r'   r(   <module>   s0   
0B
 $
