
    Ƒi%                         S r SSKrSSKJrJr  SrSrSrSrSr	S	r
S
\S\S-  4S jrS
\S\S-  4S jrS
\S\S-  4S jrS\S\4S jrSS\S\S\4S jjrg)a  Stage 1a+: UTF-16/UTF-32 detection for data without BOM.

This stage runs after BOM detection but before binary detection.
UTF-16 and UTF-32 encoded text contains characteristic null-byte patterns
that would otherwise cause binary detection to reject the data.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
    N)DETERMINISTIC_CONFIDENCEDetectionResulti      
   gQ?      ?gffffff?datareturnc                 r    U S[          n[        U5      [        :  a  g[        U5      nUb  U$ [	        U5      $ )a   Detect UTF-32 or UTF-16 encoding from null-byte patterns.

UTF-32 is checked before UTF-16 since UTF-32 patterns are more specific.

:param data: The raw byte data to examine.
:returns: A :class:`DetectionResult` if a strong pattern is found, or ``None``.
N)_SAMPLE_SIZElen_MIN_BYTES_UTF16_check_utf32_check_utf16)r   sampleresults      X/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/chardet/pipeline/utf1632.pydetect_utf1632_patternsr   *   sD     -< F
6{%% &!F     c           	        ^  [        T 5      [        T 5      S-  -
  nU[        :  a  gT SU m US-  n[        U 4S j[        S[        T 5      S5       5       5      n[        U 4S j[        S[        T 5      S5       5       5      nX2:X  a:  XB-  S:  a2   T R	                  S5      n[        U5      (       a  [        S[        SS9$  [        U 4S	 j[        S
[        T 5      S5       5       5      n[        U 4S j[        S[        T 5      S5       5       5      nXb:X  a;  Xr-  S:  a3   T R	                  S5      n[        U5      (       a  [        S[        SS9$  gg! [         a     Nf = f! [         a     gf = f)av  Check for UTF-32 encoding based on 4-byte unit structure.

For valid Unicode (U+0000 to U+10FFFF = 0x0010FFFF):
- UTF-32-BE: the first byte of each 4-byte unit is always 0x00
- UTF-32-LE: the last byte of each 4-byte unit is always 0x00

For BMP characters (U+0000 to U+FFFF), additionally:
- UTF-32-BE: the second byte is also 0x00
- UTF-32-LE: the third byte is also 0x00
   Nc              3   >   >#    U  H  nTU   S :X  d  M  Sv   M     g7fr      N .0ir   s     r   	<genexpr>_check_utf32.<locals>.<genexpr>T        J#9aT!W\#9   	r   c              3   D   >#    U  H  nTUS -      S:X  d  M  S v   M     g7f)r   r   Nr   r   s     r   r   r   V   s$     O$:qd1q5kQ>N$:s    	 r   z	utf-32-beencoding
confidencelanguagec              3   >   >#    U  H  nTU   S :X  d  M  Sv   M     g7fr   r   r   s     r   r   r   e   s     I"8QDGqLqq"8r!      c              3   >   >#    U  H  nTU   S :X  d  M  Sv   M     g7fr   r   r   s     r   r   r   g   r    r!      z	utf-32-le)	r   _MIN_BYTES_UTF32sumrangedecode_looks_like_textr   r   UnicodeDecodeError)r   trimmed_len	num_unitsbe_first_nullbe_second_nulltextle_last_nullle_third_nulls   `       r   r   r   @   sv    d)s4y1}-K%%Dq I J5CIq#9JJMOE!SY$:OON!n&@3&F		;;{+D%%&(7!  & I%3t9a"8IILJ5CIq#9JJM ]%>%D		;;{+D%%&(7!  & ) " 		" " 		s$   /E "/E% 
E"!E"%
E21E2c                   ^  [        [        T 5      [        5      nXS-  -  nU[        :  a  gUS-  n[	        U 4S j[        SUS5       5       5      n[	        U 4S j[        SUS5       5       5      nX2-  nXB-  n/ nU[        :  a  UR                  SU45        U[        :  a  UR                  SU45        U(       d  g[        U5      S:X  a>  US   S   n T SU R                  U5      n	[        U	5      (       a  [        U[        SS	9$  gSn
S
nU H0  u  p T SU R                  U5      n	[        U	5      nX:  d  M,  UnUn
M2     U
b  U[        :  a  [        U
[        SS	9$ g! [         a     gf = f! [         a     Mn  f = f)a  Check for UTF-16 via null-byte patterns in alternating positions.

UTF-16 encodes each BMP character as two bytes.  For characters whose
code-point high byte is 0x00 (Latin, digits, basic punctuation, many
control structures), one of the two bytes in each unit will be a null.
Even for non-Latin scripts (Arabic, CJK, Cyrillic, etc.) a significant
fraction of code units still contain at least one null byte.

Non-UTF-16 single-byte encodings never contain null bytes, so even a
small null-byte fraction in alternating positions is a strong signal.

When both endiannesses show null-byte patterns (e.g., Latin text where
every other byte is null), we disambiguate by decoding both ways and
comparing text-quality scores.
r*   Nc              3   >   >#    U  H  nTU   S :X  d  M  Sv   M     g7fr   r   r   s     r   r   _check_utf16.<locals>.<genexpr>        K#:ad1gl#:r!   r   c              3   >   >#    U  H  nTU   S :X  d  M  Sv   M     g7fr   r   r   s     r   r   r:      r;   r!   r   z	utf-16-lez	utf-16-ber#         )minr   r   r   r,   r-   _UTF16_MIN_NULL_FRACTIONappendr.   r/   r   r   r0   _text_quality_MIN_TEXT_QUALITY)r   
sample_lenr2   be_null_countle_null_countbe_fracle_frac
candidatesr$   r5   best_encodingbest_quality_qualitys   `             r   r   r   x   s     SY-Jq. J$$aI K5J#:KKMK5J#:KKM'G'G*,J**;01**;01 :!a=#		$++H5D%%&%7!  &  !%ML!	$++H5D  %!"L$M "  \5F%F"/
 	
 5 " 		 " 		s$   2E" E2"
E/.E/2
F ?F r5   c                 j    U (       d  gU SS n[        S U 5       5      nU[        U5      -  [        :  $ )z9Quick check: is decoded text mostly printable characters.FN  c              3   ^   #    U  H#  oR                  5       (       d  US ;   d  M  Sv   M%     g7f)
	r   N)isprintable)r   cs     r   r   #_looks_like_text.<locals>.<genexpr>   s      Jv!AMAAvs   -	-)r,   r   _MIN_PRINTABLE_FRACTION)r5   r   	printables      r   r/   r/      s8    $3ZFJvJJIs6{"%<<<r   limitc                    U SU n[        U5      nUS:X  a  gSnSnSnSnSnU Hr  n	[        R                  " U	5      n
U
S   S:X  a  US-  n[        U	5      S:  a  US-  nM=  M?  U
S   S:X  a  US-  nMO  U
S:X  d  U	S	;   a  US-  nMb  U
S   S
:X  d  Mm  US-  nMt     Xs-  S:  a  gXS-  S:  a  gXC-  nXU-  S-  -  nUS:  a  US:  a  US-  nU$ )u  Score how much *text* looks like real human-readable content.

Returns a score in the range [-1.0, ~1.6).  Higher values indicate
more natural text.  The practical maximum is 1.5 for all-ASCII-letter
input (1.6 approaches as sample size grows with all ASCII letters plus
whitespace).  A score of -1.0 means the content is almost certainly not
valid text (too many control characters or combining marks).

Scoring factors:

* Base score: ratio of Unicode letters (category ``L*``) to sample length.
* ASCII bonus: additional 0.5x weight for ASCII letters.  This is the
  primary signal for disambiguating endianness — correct decoding of
  Latin-heavy text produces ASCII letters, wrong decoding produces CJK.
* Space bonus: +0.1 when the sample contains at least one whitespace
  character and is longer than 20 characters.
* Rejection: returns -1.0 if >10% control characters or >20% combining
  marks (category ``M*``).
Nr   r=   Lr      MZsrP   Cg?g?r      )r   unicodedatacategoryord)r5   rV   r   nlettersmarksspacescontrolsascii_lettersrR   catscores               r   rA   rA      s   ( &5\FFAAvGEFHM""1%q6S=qLG1v|" Vs]QJED[AMaKFVs]MH  |cy3KE	a3&&E2v&1*Lr   )rN   )__doc__r^   chardet.pipeliner   r   r   r+   r   r?   rB   rT   bytesr   r   r   strboolr/   intfloatrA   r   r   r   <module>rp      s   	  F            %  Od,B  ,5u 54!7 5pMu M4!7 M`=3 =4 =9 9C 9% 9r   