
    ƑiZ                       % S r SSKJr  SSKJr  SSKJr  SSKJrJ	r	J
r
Jr  SSKJrJrJrJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJ r J!r!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,  \" S\SS9r-\" SSSS9r.\" SSSS9r/Sr0\1" 1 Sk5      r2S\3S'   \1" 1 Sk5      r4S\3S'   \1" 1 Sk5      r5S\3S '   \1" 1 S!k5      r6S\3S"'   \4\5\6S#.r7S$\3S%'   \1" 1 S&k5      r8S\3S''   S8S( jr9S)r:S*r;S+r<S,r=S-r>        S9S. jr?          S:S/ jr@      S;S0 jrA      S;S1 jrBS2rCS<S3 jrD      S;S4 jrE      S;S5 jrF\4       S=S6 jjrG\4       S=S7 jjrHg)>u@   Pipeline orchestrator — runs all detection stages in sequence.    )annotations)DEFAULT_MAX_BYTES)EncodingEra)BigramProfilehas_model_variantsinfer_languagescore_best_language)DETERMINISTIC_CONFIDENCE
HIGH_BYTESDetectionResultPipelineContext)detect_ascii)	is_binary)
detect_bom)resolve_confusion_groups)detect_escape_encoding)detect_markup_charset)score_candidates)compute_lead_byte_diversitycompute_multibyte_byte_coveragecompute_structural_score)detect_utf8)detect_utf1632_patterns)filter_by_validity)REGISTRYEncodingInfoget_candidatesNencoding
confidencelanguageutf-8g?windows-1252g333333?>   
iso-8859-1iso-8859-15r#   zfrozenset[str]_COMMON_LATIN_ENCODINGS>.                                                                                                                                             zfrozenset[int]_ISO_8859_10_DISTINGUISHING>   r'   r(   r*   r+   r,   r-   r/   r0   r1   r3      r4   r5   r6   r7   r8      r:   r;   r<   r=   r>   r?   r@   rA      rI         rR      _ISO_8859_14_DISTINGUISHING>   rX      rY   rZ      r[   _WINDOWS_1254_DISTINGUISHING)ziso-8859-10ziso-8859-14zwindows-1254zdict[str, frozenset[int]]_DEMOTION_CANDIDATES>                           r'   r(   r+   r8   _KOI8_T_DISTINGUISHINGc                j   ^ [         R                  U 5      mTc  g[        U4S jU 5       5      (       + $ )aa  Return True if encoding is a demotion candidate with no distinguishing bytes.

Checks whether any non-ASCII byte in *data* falls in the set of byte
values that decode differently under the given encoding vs iso-8859-1.
If none do, the data is equally valid under both encodings and there is
no byte-level evidence for preferring the candidate encoding.
Fc              3  <   >#    U  H  oS :  d  M
  UT;   v   M     g7f   N ).0bdistinguishings     ]/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/chardet/pipeline/orchestrator.py	<genexpr>!_should_demote.<locals>.<genexpr>   s     A1D&1&s   	)r`   getany)r   datarq   s     @rr   _should_demoterx      s2     *--h7NAAAAA    g?   gffffff?      c                D   / nU GH  nUR                   (       a  [        XU5      nXRR                  UR                  '   U[        :  a  ME  UR
                  c0  [        U 5      [        U R                  S[        5      5      -
  Ul        UR
                  [        :  a  M  [        XX"R
                  S9nXbR                  UR                  '   U[        :  a  M  UR
                  [        :  a  [        XU5      nU[        :  a  M  UR!                  U5        GM     [#        U5      $ )a  Eliminate CJK multi-byte candidates that lack genuine multi-byte structure.

Four checks are applied in order to each multi-byte candidate:

1. **Structural pair ratio** (valid_pairs / lead_bytes) must be
   >= ``_CJK_MIN_MB_RATIO``.  Catches files with many orphan lead bytes.

2. **Minimum non-ASCII byte count**: the data must contain at least
   ``_CJK_MIN_NON_ASCII`` bytes > 0x7F.  Tiny files with 1-5 high bytes
   can accidentally form perfect pairs and score 1.0 structurally.

3. **Byte coverage** (non-ASCII bytes in valid multi-byte sequences /
   total non-ASCII bytes) must be >= ``_CJK_MIN_BYTE_COVERAGE``.  Latin
   text has many high bytes that are NOT consumed by multi-byte pairs;
   genuine CJK text has nearly all high bytes accounted for.

4. **Lead byte diversity**: the number of distinct lead byte values in
   valid pairs must be >= ``_CJK_MIN_LEAD_DIVERSITY``.  Genuine CJK text
   draws from a wide repertoire of lead bytes; European false positives
   cluster in a narrow band (e.g. 0xC0-0xDF for accented Latin).

Returns the filtered candidate list.  Structural scores are cached in
``ctx.mb_scores`` for reuse in Stage 2b.
N)non_ascii_count)is_multibyter   	mb_scoresname_CJK_MIN_MB_RATIOr~   len	translater   _CJK_MIN_NON_ASCIIr   mb_coverage_CJK_MIN_BYTE_COVERAGE_CJK_DIVERSITY_MIN_NON_ASCIIr   _CJK_MIN_LEAD_DIVERSITYappendtuple)rw   valid_candidatesctxgatedencmb_scorebyte_coveragelead_diversitys           rr   _gate_cjk_candidatesr      s    : !#E/3?H&.MM#((#++""*&)$i#dnnT:6V2W&W#""%77;30C0CM )6OOCHH%55""&BB!<T!L!$;;S)  * <ry   c           	     L  ^ U Vs0 s H"  oDR                   (       d  M  UR                  U_M$     snm[        U4S jU 5       5      n[        S U 5       5      n[        [	        U / UQUQ75      5      n/ nU H  n	U	R
                  (       a&  UR                  R                  U	R
                  S5      OSn
U
S:  a@  UR                  [        U	R
                  U	R                  SU
-   -  U	R                  S95        M  UR                  U	5        M     UR                  S SS	9  U$ s  snf )
a]  Score structurally-valid CJK candidates using statistical bigrams.

When multiple CJK encodings score equally high structurally, statistical
scoring differentiates them (e.g. euc-jp vs big5 for Japanese data).
Single-byte candidates are also scored and included so that the caller
can compare CJK vs single-byte confidence.

Multi-byte candidates with high byte coverage (>= 0.95) receive a
confidence boost proportional to coverage.  When nearly all non-ASCII
bytes form valid multi-byte pairs, the structural evidence is strong
and should increase the candidate's ranking relative to single-byte
alternatives whose bigram models may score higher on small samples.

Note: boosted confidence values may exceed 1.0 and are used only for
relative ranking among candidates.  ``run_pipeline`` clamps all
confidence values to [0.0, 1.0] before returning to callers.
c              3  B   >#    U  H  u  pUT;   d  M  TU   v   M     g 7fNrn   )ro   r   _sc
enc_lookups      rr   rs   /_score_structural_candidates.<locals>.<genexpr>5  s&      *;YTtz?Q
4*;s   c              3  J   #    U  H  oR                   (       a  M  Uv   M     g 7fr   )r   )ro   es     rr   rs   r   8  s     J#3a>>#3s   #	#        gffffff?   r   c                    U R                   $ r   )r    xs    rr   <lambda>._score_structural_candidates.<locals>.<lambda>I  s    q||ry   Tkeyreverse)r   r   r   listr   r   r   ru   r   r   r    r!   sort)rw   structural_scoresr   r   r   valid_mbsingle_byteresultsboostedrcoverager   s              @rr   _score_structural_candidatesr     s	   . &6H%5)!&&!)%5HJ *; H J#3JJK#D*CH*C{*CDEG &(G;<::3??&&qzz373tNNZZ ||q8|<ZZ NN1  LL+TL:N- Is
   D!D!c                   [        U5      S:  a  US   R                  b  [        US   R                  U 5      (       a  US   R                  nUSS  Hl  nUR                  [        ;   d  M  U Vs/ s H  oDR                  U:w  d  M  XCLd  M  UPM     nnU Vs/ s H  oDR                  U:X  d  M  UPM     nnU/UQUQs  $    U$ s  snf s  snf )a  Demote niche Latin encodings when no distinguishing bytes are present.

Some bigram models (e.g. iso-8859-10, iso-8859-14, windows-1254) can win
on data that contains only bytes shared with common Western Latin
encodings.  When there is no byte-level evidence for the winning
encoding, promote the first common Western Latin candidate to the top and
push the demoted encoding to last.
r   r   N)r   r   rx   r&   )rw   r   demoted_encodingr   r   othersdemoted_entriess          rr   _demote_niche_latinr   M  s     	GqAJ+71:..55"1:..Azz44&&!**8H*HAQZAw   /6"XgGW9W1g"X5F5_55  N #Ys   /CCCC	/C	c                4   U(       a  US   R                   S:w  a  U$ Sn[        U5       H  u  p4UR                   S:X  d  M  Un  O   Uc  U$ [        S U  5       5      (       a/  X   n[        U5       VVs/ s H  u  p4X2:w  d  M  UPM     nnnU/UQ$ U$ s  snnf )ag  Promote KOI8-T over KOI8-R when Tajik-specific bytes are present.

KOI8-T and KOI8-R share the entire 0xC0-0xFF Cyrillic letter block,
making statistical discrimination difficult.  However, KOI8-T maps 12
bytes in 0x80-0xBF to Tajik-specific Cyrillic letters where KOI8-R has
box-drawing characters.  If any of these bytes appear, KOI8-T is the
better match.
r   zkoi8-rNzkoi8-tc              3  B   #    U  H  oS :  d  M
  U[         ;   v   M     g7frl   )ri   )ro   rp   s     rr   rs   !_promote_koi8t.<locals>.<genexpr>  s     
A1D&1&&s   	)r   	enumeraterv   )rw   r   	koi8t_idxir   koi8t_resultr   s          rr   _promote_koi8tr   i  s     gaj))X5I'"::!I # 

A
AAA) )' 2E 2an! 2E&v&&N Fs   5BBi   c                |    US:X  a  U $  U R                  USS9R                  SSS9$ ! [        [        4 a     gf = f)a<  Decode data from encoding and re-encode as UTF-8 for language scoring.

Returns None if the encoding is unknown. For UTF-8, returns data as-is.
Uses ``errors="ignore"`` because the data already passed byte-validity
filtering for the detected encoding; any residual invalid bytes are
irrelevant for language scoring.
r"   ignore)errorssurrogatepassN)decodeencodeLookupError	TypeError)rw   r   s     rr   _to_utf8r     sY     7{{8H{5<<O = 
 	
 # s   ( ;;c           	     b   / nSnSnU GH!  nUR                   c  UR                  b  [        UR                  5      nUcE  U (       a>  [        UR                  5      (       a$  Uc  [	        U 5      n[        XR                  US9u  pvUc^  U (       aW  [        S5      (       aG  [        XR                  5      nU(       a+  Ub  UR                  S:w  a  [	        U5      n[        USUS9u  pvUb1  UR                  [        UR                  UR                  US95        GM  UR                  U5        GM$     U$ )a  Fill in language for results missing it.

Tier 1: single-language encodings via hardcoded map (instant).
Tier 2: multi-language encodings via statistical bigram scoring (lazy).
Tier 3: decode to UTF-8, score against UTF-8 language models (universal fallback).
N)profiler"   r   )
r!   r   r   r   r   r	   r   r   r   r    )	rw   r   filledr   utf8_profileresultlang_	utf8_datas	            rr   _fill_languager     s    %'F$(G)-L??"v'B!&//2D|);FOO)L)L?+D1G-dOOWU|);G)D)D$T??;	#+v'/I'4Y'?1!7LGA #!'#)#4#4!% f7 8 Mry   c                D    [        X5      n[        X5      n[        X5      $ )zGApply confusion resolution, niche Latin demotion, and KOI8-T promotion.)r   r   r   )rw   r   s     rr   _postprocess_resultsr     s#    
 't5G!$0G$((ry   c                   [        5       nU SU n U (       d  [        /$ [        U 5      nUb  U/$ [        U 5      nUb  U/$ [	        U 5      nUbF  UR
                  b9  [        R                  " UR
                  5      nUb  XR                  -  (       a  U/$ [        U 5      nUc  [        XS9(       a  [        /$ [        U 5      n	U	b  U	/$ [        U 5      n
U
b  U
/$ Ub  U/$ [        U5      n[        X5      nU(       d  [         /$ [#        XU5      nU(       d  [         /$ / nU Ho  nUR$                  (       d  M  UR&                  R                  UR(                  5      nUc  [+        XU5      nUS:  d  MR  UR-                  UR(                  U45        Mq     U(       a;  UR/                  S SS9  US   u  nnU[0        :  a  [3        XX5      n[5        U U5      $ [7        [9        U [;        U5      5      5      nU(       d  [         /$ [5        U U5      $ )zBCore pipeline logic. Returns list of results sorted by confidence.N)	max_bytesr   c                    U S   $ )Nr   rn   r   s    rr   r   $_run_pipeline_core.<locals>.<lambda>0  s    QqTry   Tr   r   )r   _EMPTY_RESULTr   r   r   r   r   ru   erar   r   _BINARY_RESULTr   r   r   r   _FALLBACK_RESULTr   r   r   r   r   r   r    _STRUCTURAL_CONFIDENCE_THRESHOLDr   r   r   r   r   )rw   encoding_erar   r   
bom_resultutf1632_resultescape_resultenc_infoutf8_precheckmarkup_resultascii_result
candidatesr   r   r   scorer   
best_scorer   s                      rr   _run_pipeline_corer     s,    
C
D D!J|
 -T2N! +40M ]%;%;%G<< 6 67|ll:!?"  %M 4!E
 *$/M   %L~    -J)$; !! ,DCH !! 24MM%%chh/E}0C@s{!((#((E):;   >4@)!,:992)9G (g66 #D%0@*ABCG !!g..ry   c           
     "   [        XU5      n[        U S[         U5      nU(       d  Sn[        U5      eU Vs/ s HJ  nUR                  S:  a5  [        UR                  [        UR                  S5      UR                  5      OUPML     sn$ s  snf )a  Run the full detection pipeline.

:param data: The raw byte data to analyze.
:param encoding_era: Filter candidates to a specific era of encodings.
:param max_bytes: Maximum number of bytes to process.
:returns: A list of :class:`DetectionResult` sorted by confidence descending.
Nz/pipeline must always return at least one resultg      ?)	r   r   _LANG_SCORE_MAX_BYTESRuntimeErrorr    r   r   minr!   )rw   r   r   r   msgr   s         rr   run_pipeliner   @  s     !Y?G T"8#897CG?3 	 A <<# 	

Cc$:AJJG	 	  s   AB)r   strrw   bytesreturnbool)rw   r   r   tuple[EncodingInfo, ...]r   r   r   r   )
rw   r   r   zlist[tuple[str, float]]r   r   r   r   r   list[DetectionResult])rw   r   r   r   r   r   )rw   r   r   r   r   zbytes | None)rw   r   r   r   r   intr   r   )I__doc__
__future__r   chardet._utilsr   chardet.enumsr   chardet.modelsr   r   r   r	   chardet.pipeliner
   r   r   r   chardet.pipeline.asciir   chardet.pipeline.binaryr   chardet.pipeline.bomr   chardet.pipeline.confusionr   chardet.pipeline.escaper   chardet.pipeline.markupr   chardet.pipeline.statisticalr   chardet.pipeline.structuralr   r   r   chardet.pipeline.utf8r   chardet.pipeline.utf1632r   chardet.pipeline.validityr   chardet.registryr   r   r   r   r   r   r   	frozensetr&   __annotations__rU   r\   r_   r`   ri   rx   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rn   ry   rr   <module>r     s   F " , %   0 - + ? : 9 9 
 . < 8 C C 6
  TDQ #t 
 $(   +4+   /8/1/ ^ 1n /8 "/ ^ "T 09(0 n  /.03 /  *3L*  
B$          " 3
3.3 
3 	3l-
-.- /- 
	-
 -`
" 8
" B  $(
(/((V)
)") ) 'i/
i/i/ i/ 	i/^ '
  	ry   