o
    Dh81                     @   s   d dl mZ d dl mZmZmZ d dlZd dlZd dlZd dlZddl	m
Z
mZmZmZ ddlT ddlmZmZ dd	 Zejd
fddZejfddZejfddZdd Zedkr_e  dS dS )    )absolute_import)divisionprint_functionunicode_literalsN   )escapePY3URLErrorurllib)*)get_stoplistget_stoplistsc                   C   s2   dt jt jtjd ttttt	t
ttd	 S )Na  Usage: %(progname)s -s STOPLIST [OPTIONS] [HTML_FILE]
Convert HTML to plain text and remove boilerplate.

  -o OUTPUT_FILE   if not specified, output is written to stdout
  --encoding=...   default character encoding to be used if not specified
                   in the HTML meta tags (default: %(default_encoding)s)
  --enc-force      force specified encoding, ignore HTML meta tags
  --enc-errors=... errors handling for character encoding conversion:
                     strict: fail on error
                     ignore: ignore characters which can't be converted
                     replace: replace characters which can't be converted
                              with U+FFFD unicode replacement characters
                   (default: %(default_enc_errors)s)
  --format=...     output format; possible values:
                     default: one paragraph per line, each preceded with
                              <p> or <h> (headings)
                     boilerplate: same as default, except for boilerplate
                                  paragraphs are included, too, preceded
                                  with <b>
                     detailed: one paragraph per line, each preceded with
                               <p> tag containing detailed information
                               about classification as attributes
                     krdwrd: KrdWrd compatible format
  --no-headings    disable special handling of headings
  --list-stoplists print a list of inbuilt stoplists and exit
  -V, --version    print version information and exit
  -h, --help       display this help and exit

If no HTML_FILE specified, input is read from stdin.

STOPLIST must be one of the following:
  - one of the inbuilt stoplists; see:
      %(progname)s --list-stoplists
  - path to a file with the most frequent words for given language,
    one per line, in UTF-8 encoding
  - None - this activates a language-independent mode

Advanced options:
  --length-low=INT (default %(length_low)i)
  --length-high=INT (default %(length_high)i)
  --stopwords-low=FLOAT (default %(stopwords_low)f)
  --stopwords-high=FLOAT (default %(stopwords_high)f)
  --max-link-density=FLOAT (default %(max_link_density)f)
  --max-heading-distance=INT (default %(max_heading_distance)i)
r   )	progname
length_lowlength_highstopwords_lowstopwords_highmax_link_densitymax_heading_distancedefault_encodingdefault_enc_errors)ospathbasenamesysargvLENGTH_LOW_DEFAULTLENGTH_HIGH_DEFAULTSTOPWORDS_LOW_DEFAULTSTOPWORDS_HIGH_DEFAULTMAX_LINK_DENSITY_DEFAULTMAX_HEADING_DISTANCE_DEFAULTDEFAULT_ENCODINGDEFAULT_ENC_ERRORS r$   r$   Q/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/justext/__main__.pyusage   s   -r&   Tc                 C   sR   | D ]$}|j dkr|jrd}nd}n|rqd}td|t|jddf |d qd	S )
z
    Outputs the paragraphs as:
    <tag> text of the first paragraph
    <tag> text of the second paragraph
    ...
    where <tag> is <p>, <h> or <b> which indicates
    standard paragraph, heading or boilerplate respecitvely.
    goodhpbz<%s> %sFquotefileN)
class_typeheadingprintr   text)
paragraphsfpno_boilerplate	paragraphtagr$   r$   r%   output_defaultJ   s   	
 r8   c              
   C   sB   | D ]}d|j |jt|j|jt|jddf }t||d qdS )z
    Same as output_default, but only <p> tags are used and the following
    attributes are added: class, cfclass and heading.
    z6<p class="%s" cfclass="%s" heading="%i" xpath="%s"> %sFr+   r-   N)r/   cf_classintr0   xpathr   r2   r1   )r3   r4   r6   outputr$   r$   r%   output_detaileda   s   r=   c                 C   sR   | D ]$}|j dv r|jrd}nd}nd}|jD ]}td|| f |d qqdS )a  
    Outputs the paragraphs in a KrdWrd compatible format:
    class<TAB>first text node
    class<TAB>second text node
    ...
    where class is 1, 2 or 3 which means
    boilerplate, undecided or good respectively. Headings are output as
    undecided.
    )r'   neargood      r   z%i	%sr-   N)r/   r0   
text_nodesr1   strip)r3   r4   r6   cls	text_noder$   r$   r%   output_krdwrdq   s   


rE   c                  C   sL  dd l } ddlm} z|  tjdd  dg d\}}W n' | jyC } zt|tjd tt tjd t	d W Y d }~nd }~ww t
dd }tj}trW|tjj}n|tj}d }d	}	d
}
t}t}t}t}t}t}d }t}d
}t}z|D ]\}}|dv rtt  t	d |dv rtdtjtjd |f  t	d qz|dkrtdtt  t	d qz|dkrz	t
|dd}W qz ty } zt d||f d }~ww |dkrV|! dkrt" }qztj#|r0zt
|dd}t"dd |D }|$  W qz ty } zt d||f d }~w t%y/ } zt d| d }~ww |t v r;t&|}qzt'(d|rPt d|dtt f t d| |dkrsz	|}d)| W qz t*yr   t d| w |d kr{d!}qz|d"kr|! d#v r|! }qzt d$| |d%kr|d&v r|}	qzt d'| |d(krd!}
qz|d)krzt+|}W qz t,y   t d*||f w |d+krzt+|}W qz t,y   t d*||f w |d,krzt-|}W qz t,y    t d-||f w |d.krzt-|}W qz t,y   t d-||f w |d/kr9zt-|}W qz t,y8   t d-||f w |d0krUzt+|}W qz t,yT   t d*||f w qz|r[|}|d u rdt d1|skd}d}|rzt'(d2|d rt./|d }nt|d d}W n tt0fy } z
t d|d |f d }~ww |1 }|tjur|$  t|||||||||
|||}|	d	krt2|| W d S |	d3krt2||d
d4 W d S |	d5krt3|| W d S |	d6krt4|| W d S t5d7|	  t6y% } ztd8tjtjd |f tjd t	d W Y d }~d S d }~ww )9Nr   )__version__r   zo:s:hV)z	encoding=z	enc-forcezenc-errors=zformat=zno-headingshelpversionzlength-low=zlength-high=zstopwords-low=zstopwords-high=zmax-link-density=zmax-heading-distance=zlist-stoplistsr-   utf8defaultF)z-hz--help)z-Vz	--versionzK%s: jusText v%s

Copyright (c) 2011 Jan Pomikalek <jan.pomikalek@gmail.com>z--list-stoplists
z-owzCan't open %s for writing: %sz-snonerc                 S   s   g | ]}|  qS r$   )rB   ).0lr$   r$   r%   
<listcomp>   s    zmain.<locals>.<listcomp>zCan't open %s for reading: %szLUnicode decoding error when reading the stoplist (probably not in UTF-8): %sz^\w*$z,Unknown stoplist: %s
Available stoplists:
%szFile not found: %sz
--encoding zUknown character encoding: %sz--enc-forceTz--enc-errors)strictignorereplacezInvalid --enc-errors value: %sz--format)rK   boilerplatedetailedkrdwrdzUknown output format: %sz--no-headingsz--length-lowz-Invalid value for %s: '%s'. Integer expected.z--length-highz--stopwords-lowz+Invalid value for %s: '%s'. Float expected.z--stopwords-highz--max-link-densityz--max-heading-distancezNo stoplist specified.z	[^:/]+://rW   )r5   rX   rY   zUnknown format: %sz%s: %s)7getoptjustextrF   r   r   GetoptErrorr1   stderrr&   exitcodecslookupstdinr   stdoutbufferr   r   r   r   r    r!   r"   r#   r   r   r   joinsortedr   openIOErrorJustextInvalidOptionslowersetisfilecloseUnicodeDecodeErrorr   rematchencodeLookupErrorr:   
ValueErrorfloatr
   urlopenr	   readr8   r=   rE   AssertionErrorJustextError)rZ   VERSIONoptsargsestream_writerfp_infp_outstoplistformatno_headingsr   r   r   r   r   r   encodingr   force_default_encoding
enc_errorsoafp_stoplist	html_textr3   r$   r$   r%   main   sx  $































$r   __main__)
__future__r   r   r   r   r_   r   rn   r   _compatr   r   r	   r
   coreutilsr   r   r&   rb   r8   r=   rE   r   __name__r$   r$   r$   r%   <module>   s$   : 8
