o
    DhI                     @   s(  d Z ddlZddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZmZ ee Z!edddddd eD ddddddZ"dZ#dZ$dZ%ee#d Z&dZ'dZ(edZ)dZ*dZ+dZ,e-d Z.e-d!e, d"e+ d#e* d$e* d%e+ d&Z/e-d!e, d"e+ d'e+ d(e, d)	Z0d*Z1e-d+e1 d,e* d-e, d.e* d/e1 d0e, d12d2d3ej3Z4e-d4e, d5e+ d5e* d6Z5e-d7e, d8e+ d8e* d1ej3Z6e-d9e, d8e+ d8e* d1ej3Z7e-d:e, d8e+ d8e* d;Z8g d<Z9d=d> e:e9dd?D Z;e-d@Z<e-dAZ=e-dBej3Z>e-d:e* dCe+ dCe, d1Z?e-d:e* dDe+ dEe* dFe+ dG	Z@e-d:e+ dCe, d1ZAe-dHe, d1ZBe-dIe, dJe, dKZCe-dLZDe-dMZEe-dNZFe-dOZGe-dPZHe-d:e, dQZIe-dRe, d1ZJe-dSZKe-d:e, dTZLe-dUZMe-dVZNe-dWZOe-d:e, dXZPe-dYZQe-d:e, dQZRe-dZe, dKZSd[ed\e
eee f fd]d^ZTd_eeU d`ed\eeU fdadbZVdceWd\eWfdddeZXdfeWdgeWd\e
eWeWf fdhdiZYdjeUd\ee fdkdlZZdjeUdmeUdnedoed\eeU f
dpdqZ[djeUdmeUd\eeU fdrdsZ\eedtdjeeU dmeUdue]dnedoed\eeU fdvdwZ^d[ed`ed\eeU fdxdyZ_dzeUd{e	eU d`ed\eeU fd|d}Z`d[ed`ed\eeU fd~dZadeUd`ed\eeU fddZbdS )z:
Custom parsers and XPath expressions for date extraction
    N)datetime)	lru_cache)ListOptionalPatternTuple)DateDataParser)default_parsers)parse)XPath)HtmlElement   )
CACHE_SIZE)	Extractor	trim_text)convert_dateis_valid_datevalidate_and_convertTc                 C   s   g | ]}|d vr|qS ))zno-spaces-timezrelative-time	timestamp ).0pr   r   T/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/htmldate/extractors.py
<listcomp>$   s
    r   pastF)	NORMALIZEPARSERSPREFER_DATES_FROMPREFER_LOCALE_DATE_ORDERRETURN_AS_TIMEZONE_AWARESTRICT_PARSING)	languageslocalesregionsettingszr.//*[self::div or self::h2 or self::h3 or self::h4 or self::li or self::p or self::span or self::time or self::ul]z.//*a  
[
    contains(translate(@id|@class|@itemprop, "D", "d"), 'date') or
    contains(translate(@id|@class|@itemprop, "D", "d"), 'datum') or
    contains(translate(@id|@class, "M", "m"), 'meta') or
    contains(@id|@class, 'time') or
    contains(@id|@class, 'publish') or
    contains(@id|@class, 'footer') or
    contains(@class, 'info') or
    contains(@class, 'post_detail') or
    contains(@class, 'block-content') or
    contains(@class, 'byline') or
    contains(@class, 'subline') or
    contains(@class, 'posted') or
    contains(@class, 'submitted') or
    contains(@class, 'created-post') or
    contains(@class, 'publication') or
    contains(@class, 'author') or
    contains(@class, 'autor') or
    contains(@class, 'field-content') or
    contains(@class, 'fa-clock-o') or
    contains(@class, 'fa-calendar') or
    contains(@class, 'fecha') or
    contains(@class, 'parution') or
    contains(@id, 'footer-info-lastmod')
] |
.//footer | .//small
z/text()   4   z).//div[@id="wm-ipp-base" or @id="wm-ipp"]z[0-3]?[0-9]z[0-1]?[0-9]z199[0-9]|20[0-3][0-9]z\b(\d{8})\bz(?:\D|^)(?:(?P<year>z)[\-/.](?P<month>z)[\-/.](?P<day>z)|(?P<day2>z)[\-/.](?P<month2>z")[\-/.](?P<year2>\d{2,4}))(?:\D|$)z)|(?P<month2>z)[\-/.](?P<year2>z
))(?:\D|$)u  
January?|February?|March|A[pv]ril|Ma[iy]|Jun[ei]|Jul[iy]|August|September|O[ck]tober|November|De[csz]ember|
Jan|Feb|M[aä]r|Apr|Jun|Jul|Aug|Sep|O[ck]t|Nov|De[cz]|
Januari|Februari|Maret|Mei|Agustus|
Jänner|Feber|März|
janvier|février|mars|juin|juillet|aout|septembre|octobre|novembre|décembre|
Ocak|Şubat|Mart|Nisan|Mayıs|Haziran|Temmuz|Ağustos|Eylül|Ekim|Kasım|Aralık|
Oca|Şub|Mar|Nis|Haz|Tem|Ağu|Eyl|Eki|Kas|Ara
z
(?P<month>z)\s
(?P<day>z)(?:st|nd|rd|th)?,? (?P<year>z)|
(?P<day2>z))(?:st|nd|rd|th|\.)? (?:of )?
(?P<month2>z)[,.]? (?P<year2>)
 z\D(z)[/_-](z	)(?:\D|$)z"dateModified": ?"(-z"datePublished": ?"((z).[0-9]{2}:[0-9]{2}:[0-9]{2}))janjanuaru   jännerjanuaryjanuarijanvierocakoca)febfebruarfeberfebruaryfebruariu   févrieru   şubatu   şub)maru   märu   märzmarchmaretmartmars)apraprilavrilnisannis)maymaimeiu   mayıs)junjunijunejuinhaziranhaz)juljulijulyjuillettemmuztem)augaugustagustusu   ağustosu   ağuaout)sep	september	septembreu   eylüleyl)octoktoberoctoberoctobreoktekimeki)novnovemberu   kasımkasnovembre)decdezdezemberdecemberdesemberu	   décembreu   aralıkarac                 C   s    i | ]\}}|D ]}||qqS r   r   )r   mnummlistmonthr   r   r   
<dictcomp>   s
    rm   )startz[.:,_/ -]|^\d+$u   ^\d{2}:\d{2}(?: |:|$)|^\D*\d{4}\D*$|[$€¥Ұ£¢₽₱฿#₹]|[A-Z]{3}[^A-Z]|(?:^|\D)(?:\+\d{2}|\d{3}|\d{5})\D|ftps?|https?|sftp|\.(?:com|net|org|info|gov|edu|de|fr|io)\b|IBAN|[A-Z]{2}[0-9]{2}|®u  (?:date[^0-9"]{,20}|updated|last-modified|published|posted|on)(?:[ :])*?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})|(?:Datum|Stand|Veröffentlicht am):? ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})|(?:güncellen?me|yayı(?:m|n)lan?ma) *?(?:tarihi)? *?:? *?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4}) *?(?:'de|'da|'te|'ta|’de|’da|’te|’ta|tarihinde) *(?:güncellendi|yayı(?:m|n)landı)z)[/.-](z)/(z)/([0-9]{2})|(z)[.-](z)[.-]([0-9]{2})z^\D?(u$   (?:©|\&copy;|Copyright|\(c\))\D*(?:z)?-?(z)\Dz"/([0-9]{4}/[0-9]{2}/[0-9]{2})[01/]z ([0-9]{4})/([0-9]{2})/([0-9]{2})z(\D([0-9]{4}[/.-][0-9]{2}[/.-][0-9]{2})\Dz(([0-9]{4})[/.-]([0-9]{2})[/.-]([0-9]{2})z-\D([0-3]?[0-9][/.-][01]?[0-9][/.-][0-9]{4})\Dz)\D?$z^(zE(\D19[0-9]{2}[01][0-9][0-3][0-9]\D|\D20[0-9]{2}[01][0-9][0-3][0-9]\D)z)([01][0-9])([0-3][0-9])zK\D([0-3]?[0-9]/[01]?[0-9]/[0129][0-9]|[0-3][0-9]\.[01][0-9]\.[0129][0-9])\Dz([0-9]{2})$z(\D([12][0-9]{3}[/.-](?:1[0-2]|0[1-9]))\Dz)[/.-](1[0-2]|0[1-9]|)z!\D([01]?[0-9][/.-][12][0-9]{3})\Dz(?<!w3.org)\D(treereturnc                 C   s2   g }t | D ]}|| | | q| |fS )zFDelete unwanted sections of an HTML document and return them as a list)DISCARD_EXPRESSIONSappend	getparentremove)ro   my_discardedsubtreer   r   r   discard_unwanted   s
   
rw   testurloptionsc              
   C   s   | durWt | }|rWtd|d  z'tt|d t|d t|d }t||j|j|j	dr8|
|jW S W dS  tyV } ztd|d | W Y d}~dS d}~ww dS )	zEExtract the date out of an URL string complying with the Y-M-D formatNzfound date in URL: %sr   r         earliestlatestzconversion error: %s %s)COMPLETE_URLsearchLOGGERdebugr   intr   formatminmaxstrftime
ValueError)rx   ry   match
dateobjecterrr   r   r   extract_url_date   s$   
$r   yearc                 C   s    | dk r| | dkrdnd7 } | S )z!Adapt year from YY to YYYY formatd   Z   il  i  r   )r   r   r   r   correct_year   s   r   dayrl   c                 C   s    |dkr| dkr|| fS | |fS )z/Swap day and month values if it seems feasible.   r   )r   rl   r   r   r   try_swap_values   s    r   stringc                 C   s   t | }|s	dS |jdkrdnd}z7t||d tt||d  d t||d }}}t|}t	||\}}t
|||}W n
 tyS   Y dS w td	| |S )
zTry full-text parse for date elements using a series of regular expressions
    with particular emphasis on English, French, German and TurkishNr   )r   rl   r   )day2month2year2r   r   .rz   zmultilingual text found: %s)LONG_TEXT_PATTERNr   	lastgroupr   groupTEXT_MONTHSlowerstripr   r   r   r   r   r   )r   r   groupsr   rl   r   r   r   r   r   regex_parse  s(   


r   outputformatmin_datemax_datec           
      C   s|  t d|  | dd  rd}| dd  rFztt| dd t| dd t| dd }W nE tyE   t d| dd  Y n3w zt| }W n* tyw   t d|  zt| dd	}W n tt	tfyt   t d
|  Y nw Y nw |durt
||||drt d| ||S t| }|rz(t|d dd t|d dd t|d dd }}}t|||}W n ty   t d|d  Y nw t
|d||drt d| ||S t| }|rZzH|jdkr
t|dt|dt|d}}}n"t|dt|dt|d}}}t|}t||\}}t|||}W n tyE   t d|d  Y nw t
|d||drZt d| ||S t| }|rz)|jdkrztt|dt|dd}ntt|dt|dd}W n ty   t d|d  Y nw t
|d||drt d| ||S t| }	t|	|||dS )z!Try to bypass the slow dateparserzcustom parse test: %sN      r%   z8-digit error: %sznot an ISO date string: %sF)fuzzyzdateutil parsing error: %sr|   zparsing result: %sr   zYYYYMMDD value error: %sr   %Y-%m-%dzYYYYMMDD match: %sr   r   rl   r   r   r   zregex value error: %szregex match: %szY-M value error: %szY-M match: %s)r   r   isdigitr   r   r   fromisoformatdateutil_parseOverflowError	TypeErrorr   r   YMD_NO_SEP_PATTERNr   YMD_PATTERNr   r   r   r   
YM_PATTERNr   r   )
r   r   r   r   	candidater   r   rl   r   r   r   r   r   custom_parse   s   *

@





r   c              
   C   sn   t d|  z	t| d }W n ttfy, } zd}t d| | W Y d}~nd}~ww |r5t||S dS )zEUse dateutil parser or dateparser module according to system settingszsend to external parser: %sdate_objNzexternal parser error: %s %s)	r   r   EXTERNAL_PARSERget_date_datar   r   errorr   r   )r   r   targetr   r   r   r   external_date_parser  s   r   )maxsizeextensive_searchc                 C   s   | sdS t | dt } | r dtttj|   krdks"dS  dS t| r)dS t| |||}|dur6|S |rLt	| rLt
| |}t||||drL|S dS )zIUse a series of heuristics and rules to parse a potential date expressionNr      r|   )r   MAX_SEGMENT_LENsummapstrr   DISCARD_PATTERNSr   r   TEXT_DATE_PATTERNr   r   )r   r   r   r   r   customresultdateparser_resultr   r   r   try_date_expr  s&   	"

r   c                 C   s&   |  d}|durt|d|S dS )zSkim through image elementsz'.//meta[@property="og:image"][@content]Ncontent)findr   get)ro   ry   elementr   r   r   
img_search  s   
r   textdate_patternc                 C   sN   | | }|r%t|d d|j|jdr%td||d  t|d d|jS dS )zILook for date expressions using a regular expression on a string of text.r   r   r|   zregex found: %s %sr   N)r   r   r   r   r   r   r   r   )r   r   ry   r   r   r   r   pattern_search  s   
r   c                 C   sD   |j rtnt}| dD ]}|jrd|jvrqt|j||  S dS )z8Look for JSON time patterns in JSON sections of the treezK.//script[@type="application/ld+json" or @type="application/settings+json"]z"dateN)originalJSON_PUBLISHEDJSON_MODIFIEDxpathr   r   )ro   ry   json_patternelemr   r   r   json_search  s   r   
htmlstringc              	   C   s   t | }|rqttd| }zLt|d dkr,tt|d t|d t|d }ntt|d t|d \}}t	t|d }t|||}t
|d|j|jdrZ||jW S W dS  ttfyp   td|d  Y dS w dS )	z5Look for author-written dates throughout the web pageNr   r   r   rz   r   r|   z!cannot process idiosyncrasies: %s)TEXT_PATTERNSr   listfilterr   lenr   r   r   r   r   r   r   r   r   
IndexErrorr   r   r   )r   ry   r   partsr   r   rl   r   r   r   r   idiosyncrasies_search  s(   
&r   )c__doc__loggingrer   	functoolsr   typingr   r   r   r   
dateparserr   dateparser_data.settingsr	   dateutil.parserr
   r   
lxml.etreer   	lxml.htmlr   r$   r   utilsr   r   
validatorsr   r   r   	getLogger__name__r   r   FAST_PREPENDSLOW_PREPENDDATE_EXPRESSIONSFREE_TEXT_EXPRESSIONSMIN_SEGMENT_LENr   rq   DAY_REMONTH_REYEAR_REcompiler   r   r   REGEX_MONTHSreplaceIr   r   r   r   TIMESTAMP_PATTERNMONTHS	enumerater   r   r   r   THREE_COMP_REGEX_ATHREE_COMP_REGEX_BTWO_COMP_REGEXYEAR_PATTERNCOPYRIGHT_PATTERNTHREE_PATTERNTHREE_CATCHTHREE_LOOSE_PATTERNTHREE_LOOSE_CATCHSELECT_YMD_PATTERNSELECT_YMD_YEARYMD_YEARDATESTRINGS_PATTERNDATESTRINGS_CATCHSLASHES_PATTERNSLASHES_YEARYYYYMM_PATTERNYYYYMM_CATCHMMYYYY_PATTERNMMYYYY_YEARSIMPLE_PATTERNrw   r   r   r   r   r   r   r   r   boolr   r   r   r   r   r   r   r   r   <module>   sx  
!
	
"

	







	

d(


