o
    Dh|"                     @   s,  U d Z ddlZddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
zddlmZ W n ey6   dZY nw ddlmZ ddlmZmZmZ dd	lmZ eeZd
dhZee ed< e
e
jj e
j j!ddg ddZ"e
j#e"dZ$eddd
ddZ%e&dej'Z(e&dej'Z)G dd dZ*dede+fddZ,de-de+fddZ.de-dee fdd Z/d!e	e-ef defd"d#Z0d$edefd%d&Z1d'edee fd(d)Z2d*ede+fd+d,Z3d-ed*edefd.d/Z4d0edee fd1d2Z5d0e	e-eef dee fd3d4Z6d5ed6ee defd7d8Z7d9edefd:d;Z8dS )<z7
Module bundling functions related to HTML processing.
    N)datetime)AnyListOptionalSetUnion)detect)
from_bytes)HtmlElement
HTMLParser
fromstring   )MAX_FILE_SIZEutf-8utf_8UNICODE_ALIASES   )i  i  i  i  i  )totalconnectstatus_forcelist)retriesFT)collect_idsdefault_doctypeencoding
remove_pisz^< ?! ?DOCTYPE.+?/ ?>z(<html.*?)\s*/>c                   @   s:   e Zd ZdZg dZdedededededd	fd
dZd	S )	Extractorz0Defines a class to store all extraction options.	extensiveformatmaxminoriginalextensive_searchmax_datemin_dateoriginal_dateoutputformatreturnNc                 C   s"   || _ || _|| _|| _|| _d S )Nr   )selfr"   r#   r$   r%   r&    r)   O/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/htmldate/utils.py__init__4   s
   
zExtractor.__init__)	__name__
__module____qualname____doc__	__slots__boolr   strr+   r)   r)   r)   r*   r   /   s     r   datar'   c                 C   s   | rt | tkr
dS dS )z6Check if the input object is suitable to be processed.TF)lenr   r3   r)   r)   r*   is_wrong_documentC   s   r6   c                 C   s&   z|  d W dS  ty   Y dS w )zLSimple heuristic to determine if a bytestring uses standard unicode encodingzUTF-8FT)decodeUnicodeDecodeErrorr5   r)   r)   r*   isutf8J   s   r9   bytesobjectc                 C   sv   t | rdgS g }tdurt| d }|dur||  t| dd p)t| }|dd |D  dd |D S )z<Read all input or first chunk and return a list of encodingsr   Nr   i:  c                 S   s   g | ]}|j qS r)   )r   ).0rr)   r)   r*   
<listcomp>c   s    z#detect_encoding.<locals>.<listcomp>c                 S   s   g | ]}|t vr|qS r)   )r   )r;   gr)   r)   r*   r=   f   s    )r9   cchardet_detectappendlowerr	   extend)r:   guessescchardet_guessdetection_resultsr)   r)   r*   detect_encodingS   s   rF   filecontentc              
   C   sh   t | tr| S d}t| D ]}z| |}W n ttfy)   td| d}Y qw  |p3t| dddS )znGuess bytestring encoding and try to decode to Unicode string.
    Resort to destructive conversion otherwise.Nzwrong encoding detected: %sr   replace)r   errors)
isinstancer2   rF   r7   LookupErrorr8   LOGGERwarning)rG   htmltextguessed_encodingr)   r)   r*   decode_filei   s   
rP   responsec                 C   s2   t | tjjst| dr| j}t|S | }t|S )zRead the urllib3 object corresponding to the server response, then
    try to guess its encoding and decode it to return a unicode stringr3   )rJ   urllib3rQ   HTTPResponsehasattrr3   rP   )rQ   resp_contentr)   r)   r*   decode_response}   s
   rV   urlc              
   C   s   z
t jd| dd}W n ty$ } ztd| | W Y d}~dS d}~ww |jdkr4td|j|  dS t|jrAtd|  dS t|jS )	a,  Fetches page using urllib3 and decodes the response.

    Args:
        url: URL of the page to fetch.

    Returns:
        HTML code as string, or Urllib3 response object (headers + body), or empty string in case
        the result is invalid, or None if there was a problem with the network.

    GET   )timeoutzdownload error: %s %sN   z!not a 200 response: %s for URL %szincorrect input data for URL %s)		HTTP_POOLrequest	ExceptionrL   errorstatusr6   r3   rV   )rW   rQ   errr)   r)   r*   	fetch_url   s   	


rb   	beginningc                 C   s   d| vS )zOAssess if the object is proper HTML (awith a corresponding tag or declaration).htmlr)   )rc   r)   r)   r*   is_dubious_html   s   re   
htmlstringc                 C   s   d|v r|  d\}}}tjd|ddd | } tt|  D ]\}}d|v r8|dr8tjd| dd}  | S |d	kr? | S q | S )
z>Repair faulty HTML strings to make then palatable for libxml2.doctype
 r   )countz<htmlz/>z\1>   )	partitionDOCTYPE_TAGsub	enumerateiter
splitlinesendswithFAULTY_HTML)rf   rc   	firstline_restiliner)   r)   r*   repair_faulty_html   s   ry   
htmlobjectc              
   C   sP   d}zt | dtd}W |S  ty' } ztd| W Y d}~|S d}~ww )z!Try to pass bytes to LXML parser.Nutf8parserzlxml parser bytestring %s)r   encodeHTML_PARSERr^   rL   r_   )rz   treera   r)   r)   r*   fromstring_bytes   s   r   c              
   C   sH  t | tr| S t | ttfstdt| t | tr6| dr6d| vr6td|  t	| } | du r6t
d| d}t| } | dd  }t| |} d}zt| td	}W n& t
yb   d
}t| }Y n tyy } ztd| W Y d}~nd}~ww |du st|dk r|st| }|durt|rt|dk rtdt| d}|S )zkLoad object given as input and validate its type
    (accepted: lxml.html tree, bytestring and string)
    zincompatible input type: %shttp zURL detected, downloading: %sNzURL couldn't be processed: %s2   Fr|   Tzlxml parsing failed: %sr   rk   z9parsed tree length: %s, wrong data type or not valid HTML)rJ   r
   bytesr2   	TypeErrortype
startswithrL   debugrb   
ValueErrorrP   rA   ry   r   r   r   r^   r_   r4   re   )rz   r   rc   fallback_parsera   r)   r)   r*   	load_html   sF   


r   r   elemlistc                 C   s.   |  |D ]}| }|dur|| q| S )zDelete selected elements.N)rp   	getparentremove)r   r   elementparentr)   r)   r*   
clean_html   s   
r   stringc                 C   s   d |   S )z7Remove superfluous space and normalize remaining space.r   )joinsplitstrip)r   r)   r)   r*   	trim_text  s   r   )9r/   loggingrer   typingr   r   r   r   r   rR   cchardetr   r?   ImportErrorcharset_normalizerr	   	lxml.htmlr
   r   r   settingsr   	getLoggerr,   rL   r   r2   __annotations__disable_warnings
exceptionsInsecureRequestWarningutilRetryRETRY_STRATEGYPoolManagerr\   r   compileIrm   rs   r   r1   r6   r   r9   rF   rP   rV   rb   re   ry   r   r   r   r   r)   r)   r)   r*   <module>   sR   
	 
4	