o
    Dh)                     @   sD  d Z ddlZddlZddlZddlmZmZ ddlmZ ddl	m
Z
mZmZmZmZ ddlmZ ddlmZ dd	lmZ eeZd
e
de
fddZ				d"dee dededee dedeeeef  fddZdedeee  fddZd
e
ddfddZd
e
ddfddZd
e
ddfddZd#dd Z ed!kre   dS dS )$z,
Implements a basic command-line interface.
    N)ProcessPoolExecutoras_completed)islice)AnyIteratorListOptionalTuple   )	check_url)_make_sample)UrlStoreargsreturnc                 C   s   t jdd}|dd}|jdddtdd	 |jd
ddtdd	 |jdddtd |jddddd |jdddtd |dd}|jdddd |jdddtd |jd d!d"dd |d#d$}|jd%d&td |jd'd(td |jd)d*td | S )+z(Define parser for command-line argumentsz"Command-line interface for Courlan)descriptionzI/OzManage input and outputz-iz--inputfilezname of input file (required)T)helptyperequiredz-oz--outputfilezname of output file (required)z-dz--discardedfilez/name of file to store discarded URLs (optional))r   r   z-vz	--verbosezincrease output verbosity
store_true)r   actionz-pz
--parallelz4number of parallel processes (not used for sampling)	FilteringzConfigure URL filtersz--strictzperform more restrictive testsz-lz
--languagez$use language filter (ISO 639-1 code)z-rz--redirectszcheck redirectsSamplingz+Use sampling by host, configure sample sizez--samplezsize of sample per domainz--exclude-maxz%exclude domains with more than n URLsz--exclude-minz%exclude domains with less than n URLs)argparseArgumentParseradd_argument_groupadd_argumentstrint
parse_args)r   
argsparsergroup1group2group3 r#   L/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/courlan/cli.pyr      sl   r   Furlsstrictwith_redirectslanguagewith_navc                 C   sN   g }| D ] }t |||||d}|dur|d|d f q|d|f q|S )z6Internal function to be used with CLI multiprocessing.)r&   r'   r(   r)   NTr   F)r   append)r%   r&   r'   r(   r)   resultsurlresultr#   r#   r$   _cli_check_urlsO   s   r.   	inputfilec                 c   s\    t | dddd}	 dd t|dD }|s 	 W d	   d	S |V  q1 s'w   Y  d	S )
zRead input line in batchesrutf-8ignoreencodingerrorsTc                 S   s   g | ]}|  qS r#   )strip).0liner#   r#   r$   
<listcomp>k   s    z _batch_lines.<locals>.<listcomp>i N)openr   )r/   inputfhbatchr#   r#   r$   _batch_linesg   s   r=   c                 C   s   | j r
ttj nttj tdd| j| j d}t| j	D ]}|
| qt| jddd}t|| j| j| jdD ]	}||d  q;W d   dS 1 sPw   Y  dS )	zSample URLs on the CLI.TN)
compressedr(   r&   verbosewr1   r4   )exclude_minexclude_max
)r?   LOGGERsetLevelloggingDEBUGERRORr   r&   r=   r/   add_urlsr:   
outputfiler   samplerB   rC   write)r   urlstorer<   outputfhr,   r#   r#   r$   _cli_sampleq   s$   
"rP   c           
         st  t  jdt jddd}t jddddp}	 g }t|d	k r8tt|d	}|s-n|| t|d	k s#|s;nG fd
d|D }t	|D ]7}|
 D ]0\}}|r]||d  qO jdurt jddd}	|	| W d   n1 szw   Y  qOqIqW d   n1 sw   Y  W d   n1 sw   Y  W d   dS W d   dS 1 sw   Y  dS )z7Read input file bit by bit and process URLs in batches.)max_workersr@   r1   rA   r0   r2   r3   Ti  c                 3   s*    | ]}j t| j j jd V  qdS ))r&   r'   r(   N)submitr.   r&   	redirectsr(   )r7   r<   r   executorr#   r$   	<genexpr>   s    
z_cli_process.<locals>.<genexpr>rD   Na)r   parallelr:   rK   r/   lenlistr   r*   r   r-   rM   discardedfile)
r   rO   r;   batches
line_batchfuturesfuturevalidr,   	discardfhr#   rT   r$   _cli_process   sN   


	P rb   c                 C   s   | j r	t|  dS t|  dS )z+Start processing according to the argumentsN)rL   rP   rb   r   r#   r#   r$   process_args   s   rd   c                  C   s   t tjdd } t|  dS )zRun as a command-line utility.r
   N)r   sysargvrd   rc   r#   r#   r$   main   s   rg   __main__)FFNF)r   N)!__doc__r   rG   re   concurrent.futuresr   r   	itertoolsr   typingr   r   r   r   r	   corer   samplingr   rN   r   	getLogger__name__rE   r   r   boolr.   r=   rP   rb   rd   rg   r#   r#   r#   r$   <module>   sJ    
<

)

