o
    Dh[                     @   s>  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z ddlmZ ddlmZmZ dd	lmZmZmZmZmZmZmZ d
dlmZmZ d
dlmZmZmZ e e!Z"edZ#e$ee%j&d d Z'h dZ(h dZ)da*ddhZ+h dZ,eddZ-h dZ.h dZ/h dZ0h dZ1g dZ2ddddd Z3d!Z4dgd"ed#e5d$dfd%d&Z6dhd"ed(e5d$dfd)d*Z7d+ed$efd,d-Z8d+ed$efd.d/Z9dgd0ed1e5d$e$fd2d3Z:d+ed$efd4d5Z;d0ed$efd6d7Z<d8ed9ed$e$fd:d;Z=d<ed0ed$dfd=d>Z>d0ed$efd?d@Z?dAedBee$ d$efdCdDZ@dAed$e5fdEdFZAd"ed(e5d$e$fdGdHZBd"edIee$ d(e5d$dfdJdKZCdLee d(e5d$e$fdMdNZDdOdPdQd8ed(e5dRe$dPe$d$e$f
dSdTZEd0ed$efdUdVZFd0ed$e$fdWdXZGdYed0ed$efdZd[ZHd"ed$dfd\d]ZId"ed$dfd^d_ZJd"ed$efd`daZKdbed$dfdcddZLd"ed$dfdedfZMdS )izE
All functions related to XML generation, processing and validation.
    N)unescape)version)StringIO)dumps)Path)ListOptional)_ElementElement
SubElement	XMLParser
fromstringtostringDTD   )Document	Extractor)sanitizesanitize_treetext_chars_testtrafilaturadataztei_corpus.dtd>   pabhilbdeldivrefrowbodycellcodeheaditemlistquotetablegraphic>   rendroletypetarget	renditionr   r   >   r   r   r%   r&   r'   T)remove_blank_text>	   r   r   r   r"   r#   r%   r&   r'   r(   >   r   r   r   r#   >	   r   r   r   r   r!   r#   r$   r%   r(   >   r!   r$   noter&   figure)sitenametitleauthordateurlhostnamedescription
categoriestagslicenseidfingerprintlanguagez***__`)z#bz#iz#uz#ti  element	keep_tailreturnc                 C   sb   |   }|du r
dS |r*| jr*|  }|du r!|jpd| j |_n	|jp%d| j |_||  dS )z
    Removes this element from the tree, including its children and
    text. The tail text is joined to the previous element or parent.
    N )	getparenttailgetprevioustextremove)rA   rB   parentprevious rL   P/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/xml.pydelete_element6   s   
rN   Finclude_formattingc                 C   s   |   }|du r
dS t| |}| jdur|| j7 }|  }|dur0|jr,|j d| n||_n|jdur?|j d| |_n||_||  dS )zAMerge element with its parent and convert formatting to markdown.N )rE   replace_element_textrF   rG   rH   rI   )rA   rO   rJ   	full_textrK   rL   rL   rM   merge_with_parentI   s   



rS   treec                 C   sj   |  dD ]-}t|dkr2t|jdu r2t|jdu r2| }|dur2|jdkr2|jdkr2|| q| S )z"Remove text elements without text.r>   r   FNr(   r"   )iterlenr   rH   rF   rE   tagrI   )rT   rA   rJ   rL   rL   rM   remove_empty_elements^   s   (
rX   c                 C   sN   t | dD ]}|dddD ]}|j|jkr#| jtvr#t| qq| S )z/Prevent nested tags among a fixed list of tags.z.//head | .//code | .//pr"   r#   r   )reversedxpathiterdescendantsrW   rE   NESTING_WHITELISTrS   )rT   elemsubelemrL   rL   rM   strip_double_tagsj   s   r_   docmetawith_metadatac                    s   |r> fdd j D }||d|d|dd|dp"g d|dp+g t|d	d
dd |d}ndt jd
di} j}t|d
d|d< t|d
dS )z0Build JSON output based on extracted informationc                    s   i | ]	}|t  |d qS )N)getattr).0slotr`   rL   rM   
<dictcomp>v   s    z%build_json_output.<locals>.<dictcomp>r5   r1   r7   ;r8   r9   r    F)rO   )sourcezsource-hostnameexcerptr8   r9   rH   commentsbodyrH   comments)ensure_ascii)	__slots__updatepopjoinxmltotxtr    rj   
json_dumps)r`   ra   
outputdictrj   rL   re   rM   build_json_outputs   s   rt   c                 C   s(   |  dD ]}|jtvr|j  q| S )zRemove unnecessary attributes.r>   )rU   rW   WITH_ATTRIBUTESattribclear)rT   r]   rL   rL   rM   clean_attributes   s
   

rx   c                 C   sF   t d}t||  d| j_|t| j d| j_|t| j |S )z4Build XML output tree based on extracted informationdocmainrk   )r
   add_xml_metar    rW   appendrx   rj   r`   outputrL   rL   rM   build_xml_output   s   
r   documentoptionsc                 C   s~   t | j t| j |jdkrtnt}|| }t|}tt|ddt	}|jdkr6|j
r6tdt||j t|ddd S )z9Make sure the XML output is conform and valid if requiredxmlunicode)encodingxmlteizTEI validation result: %s %sT)pretty_printr   )r_   r    rX   formatr   build_tei_outputr   r   r   CONTROL_PARSERtei_validationLOGGERdebugvalidate_teirh   strip)r   r   funcoutput_treerL   rL   rM   control_xml_output   s   

r   r~   c                 C   s>   t D ]}t||d}|r| |t|tr|nd| qdS )z-Add extracted metadata to the XML output treeNrg   )META_ATTRIBUTESrb   set
isinstancestrrp   )r~   r`   	attributevaluerL   rL   rM   r{      s    r{   c                 C   s   t | }t|| j}|S )z8Build TEI-XML output tree based on extracted information)write_teitree	check_teir5   r}   rL   rL   rM   r      s   r   xmldocr5   c                 C   s<  |  dD ]/}d|_|dd | }|du rqt|dkr+t|}||| |}|jdkr4t| q| dD ]}|j	rP|j	
 rPd|j	d|_|_|_	q:| d	D ]E}|jtvrjtd
|j| t| qV|jtv rtt| n|jdkrt| t| dd |jD D ]}td||j| |j| qqV| S )zCCheck if the resulting XML file is conform and scrub remaining tagsr#   r   r+   headerNr   r   z.//text/body//div/lbz.//text/body//*z"not a TEI element, removing: %s %sr   c                 S   s   g | ]}|t vr|qS rL   )TEI_VALID_ATTRS)rc   arL   rL   rM   
<listcomp>       zcheck_tei.<locals>.<listcomp>z0not a valid TEI attribute, removing: %s in %s %s)rU   rW   r   rE   rV   _tei_handle_complex_headreplace_move_element_one_level_upfindallrF   r   rH   TEI_VALID_TAGSr   warningrS   TEI_REMOVE_TAIL_handle_unwanted_tails!_handle_text_content_of_div_nodes_wrap_unwanted_siblings_of_divrv   ro   )r   r5   r]   rJ   new_elemr   rL   rL   rM   r      s@   




r   c                 C   s6   t du rtta t | }|du rtdt jj |S )zUCheck if an XML document is conform to the guidelines of the Text Encoding InitiativeNFznot a valid TEI document: %s)TEI_DTDr   
TEI_SCHEMAvalidater   r   	error_log
last_error)r   resultrL   rL   rM   r      s   
r   c              	   C   s  | j pd}|rp| j rp| jdkr2zt| dd }W n ttfy'   d}Y nw d|  d| }n>| jdkr>d	| d	}n2| jd
krY| d}|tv rXt|  | t|  }n| jdkrpd| j v rjd| d}nd| d}| jdkr|rd| d}| d}|r| d| d}ntd|| j	 |}ntd|| j	 | jdkr|rt
| dkr| d jdkr|  dur| dnd| d}|S | jdkr|r|  dur| }|S d| }|S | jdkr|rd| d}|S )zeDetermine element text based on just the text of the element. One must deal with the tail separately.rD   r#   r)   r      #rP   r   z~~r   r"   
z```
z
```r@   r   []r,   ()zmissing link attribute: %s %s'zempty link: %s %sr!   r   r   Nz| r$   z- )rH   rW   intget	TypeError
ValueErrorHI_FORMATTINGr   r   rv   rV   rG   )rA   rO   	elem_textnumberr)   	link_textr,   rL   rL   rM   rQ      sR   









"
rQ   
returnlistc              	   C   s  | j r|t| | | D ]}t||| q| j s| js| jdkrC| dd d| dd }|d|  d| dd d	 nY| jtv r| jd
krt	| 
d}| dp]| d}|rd| sgd}ntt|t}||k r~|d||   d | 
dr|dd|  d n|d n| jdkrdS | jtv r| 
ds||r| jd
krdnd n| jdkr|d n
| jtvr|d | jr|| j dS dS )zYRecursively convert a LXML element and its children to a flattened string representation.r(   r2   rD   rP   altz![z](srcr   r   z.//cellcolspanspanr   |r   z./cell[@role='head']z
|z---|r!   Nzancestor::cellu   
␤
z | )rH   r|   rQ   process_elementrF   rW   r   r   NEWLINE_ELEMSrV   rZ   isdigitminr   MAX_TABLE_WIDTHSPECIAL_FORMATTING)rA   r   rO   childrH   
cell_count	span_infomax_spanrL   rL   rM   r   ,  s@   
&






r   	xmloutputc                 C   s2   | du rdS g }t | || ttd|pdS )zLConvert to plain text format and optionally preserve formatting as markdown.NrD   )r   r   r   rp   )r   rO   r   rL   rL   rM   rq   b  s
   rq   	null)delimr   r   c                   s   t | j|p }t | j|p }t }tj||tjd}| fdd| j| j	| j
| j| j| j| j||| j| jfD  | S )zAConvert the internal XML document representation to a CSV string.)	delimiterquotingc                    s   g | ]}|r|n qS rL   rL   )rc   dr   rL   rM   r   y  r   zxmltocsv.<locals>.<listcomp>)rq   r    rj   r   csvwriterQUOTE_MINIMALwriterowr5   r;   r<   r6   r2   imager4   r:   pagetypegetvalue)r   rO   r   r   posttextcommentstextr~   outputwriterrL   r   rM   xmltocsvn  s$   
r   c                 C   sz   t ddd}t||  t|d}t|d}t| j}d|_|dd || t| j}d|_|dd	 || |S )
z6Bundle the extracted post and comments into a TEI treeTEIzhttp://www.tei-c.org/ns/1.0)xmlnsrH   r    r   r+   entryrk   )	r
   write_fullheaderr   rx   r    rW   r   r|   rj   )r`   teidoctextelemtextbodypostbodyrj   rL   rL   rM   r     s   






r   c                 C   s^   | j r| jr| j  d| j  d}|S | j p| jpd}ttjr-|dkr-td| j |S )z5Construct a publisher string to include in TEI headerz (r   zN/Azno publisher for URL %s)	r6   r1   r   r   isEnabledForloggingWARNINGr   r5   )r`   	publisherrL   rL   rM   _define_publisher_string  s   r   r   c                 C   s|  t | d}t |d}t |d}|jt |ddd_|jr"|jt |d_t |d}t|}|jrA|t |d	_t |d
}|jt |d_nt |d t |d}|jrW|jt |ddd_|jt |ddd_t |d}	t |	d}
dt	d|j
|jg}|std|j dt	d|j|g|
_|t |	ddd_t |	d}t |d}|jt |ddd_|jr|jt |d_t |d}|t |d	_|jrt |dd|jd |jt |d_t |d}t |d}|jt |d_|js|jrt |d}t |d}|jrd|jt |d d!d_|jrd|jt |d d"d_t |d#}|jt |dd$d_t |d%}t |d&}t |d'td(d)}d(t |d*_t |dd+d, |S )-z+Write TEI header based on gathered metadata	teiHeaderfileDesc	titleStmtr2   rz   )r+   r3   publicationStmtr   availabilityr   	notesStmtr/   r;   r<   
sourceDescbiblz, Nzno sigle for URL %ssiglebiblFullptrURL)r+   r,   r4   profileDescabstract	textClasskeywords,termr8   r9   creationdownloadencodingDescappInfoapplicationTrafilatura)r   identlabelz$https://github.com/adbar/trafilatura)r,   )r   r2   rH   r3   r   r:   r;   r<   rp   filterr1   r4   r   r   r5   r7   r8   r9   filedatePKG_VERSION)r   r`   r   filedescbib_titlestmtpublicationstmt_apublisher_stringr   	notesstmt
sourcedescsource_biblr   biblfullpublicationstmtprofiledescr   	textclassr   r  encodingdescappinfor  rL   rL   rM   r     sh   


















r   c                 C   s   | j r9| j  r9t| dkr(| d jdkr(| j  d| d j pd  | d _ ntd}| j |_ | d| d| _ | jrs| j rut| dkra| d jdkra| d j pTd d| j  | d _ ntd}| j|_ | | d| _dS dS dS )z@Wrap loose text in <div> within <p> elements for TEI conformity.r   r   rP   rD   N)rH   r   rV   rW   r
   insertrF   r|   )rA   	new_childrL   rL   rM   r     s   &&

r   c                 C   s   | j r| j  nd| _ | j sdS | jdkr#dtd| j| j g| _ntd}| j |_|  }|dur>||	| d | d| _ dS )z Handle tail on p and ab elementsNr   rP   r   )
rF   r   rW   rp   r
  rH   r
   rE   r  index)rA   new_siblingrJ   rL   rL   rM   r     s   

r   c                 C   s   t d| jd}| jr| j nd|_|  D ]1}|jdkrBt|dks&|jr=t|dks1|d jr6t|d |j|d _q|j|_q|	| q| jrP| j nd}|rW||_|S )z0Convert certain child elements to <ab> and <lb>.r   )rv   Nr   r   r  r   )
r
   rv   rH   r   iterchildrenrW   rV   rF   r   r|   )rA   new_elementr   rF   rL   rL   rM   r     s   


r   div_elementc                 C   s   t d}d}|  }|du rdS |  D ]/}|jdkr n'|jtv r/|p(||}|| q|rCt|dkrC||| t d}d}q|rTt|dkrV||| dS dS dS )z=Wrap unwanted siblings of a div element in a new div element.r   Nr   )	r
   rE   itersiblingsrW   TEI_DIV_SIBLINGSr  r|   rV   r  )r!  r  new_sibling_indexrJ   siblingrL   rL   rM   r   )  s&   

r   c                 C   s   |   }|dur|  nd}|du s|du rdS td}|t|   |||d |  | jr8| j nd}|rB||_	d| _|jrJ|j nd}|rT||_d|_t
|dks`|j	s`|jrk||| d | t
|dkr{|j	s}|| dS dS dS )z
    Fix TEI compatibility issues by moving certain p-elems up in the XML tree.
    There is always a n+2 nesting for p-elements with the minimal structure ./TEI/text/body/p
    Nr   r   r   )rE   r
   extendr%   r"  r  r  rF   r   rH   rV   rI   )rA   rJ   grand_parentr   rF   rL   rL   rM   r   B  s(   r   )T)F)N__doc__r   r   htmlr   importlib.metadatar   ior   jsonr   rr   pathlibr   typingr   r   
lxml.etreer	   r
   r   r   r   r   r   settingsr   r   utilsr   r   r   	getLogger__name__r   r  r   __file__rJ   r   r   r   r   r   r#  r   r   r   ru   r\   r   r   r   boolrN   rS   rX   r_   rt   rx   r   r   r{   r   r   r   rQ   r   rq   r   r   r   r   r   r   r   r   r   rL   rL   rL   rM   <module>   sh   $

	
*/6&G