
    |-j,                       d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6  e6j7        e8          Z9dZ:dZ;dZ<dZ=dZ>e,dz  Z,ee e!e"dZ?e:e=dZ@ e5e,           G d de/                      ZAeAZBdS )z
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
    N)defaultdict)Iterable)copyfile)Any)is_offline_mode)
AddedToken
processors)Encoding)	Tokenizer)Decoder)BPEUnigram)
BpeTrainerUnigramTrainerWordLevelTrainerWordPieceTrainercached_file   )SpmConverter)convert_gguf_tokenizer)load_gguf_checkpoint)INIT_TOKENIZER_DOCSTRINGBatchEncodingPreTokenizedInputPreTrainedTokenizerBase	TextInputTruncationStrategygenerate_merges)PaddingStrategyadd_end_docstringsloggingztokenizer.jsonzspecial_tokens_map.jsonztokenizer_config.jsonztokenizer.modelzadded_tokens.jsonu  
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
)r   r   	WordLevel	WordPiece)tokenizer_file
vocab_filec            )           e Zd ZdZeZdZdZedLd            Z	 fdZ
edefd            Zedefd            ZdMd	ed
edz  dee         fdZd Zed             Zed             Zej        d             Zej        d             Zd Zedefd            Zdeeef         fdZedeeef         fd            Zedeeef         fd            Zedeeef         fd            ZeZeZ deeef         fdZ!defdZ"defdZ#ede$fd            Z%ede&fd            Z'	 	 	 	 	 	 	 dNde(dedz  dedz  d ed!ed"ed#ed$edeeee)f         e*e(         f         fd%Z+d&edefd'Z,d(ededz  fd)Z-dLd*e*eez           defd+Z.dLd,edefd-Z/dLd.ee*e         z  d/edee*e         z  fd0Z0dOd1ed,edz  d2ede*e         fd3Z1d4e2d5e3d6ed7ed8edz  d9edz  fd:Z4dde2j5        e3j6        dd;ddddddddddddfd1e7e8z  e*e7         z  e*e8         z  d<e7e8z  e*e7         z  e*e8         z  dz  d2ed4e2d5e3d6edz  d7ed=ed8edz  d9edz  d>edz  dedz  dedz  d ed!ed"ed#ed$ed?edz  de9f(d@Z:dAe*e         defdBZ;	 	 dPdCee*e         z  d/edDedz  defdEZ<	 	 dQd	ee=j>        z  dFeedGf         dHedz  d
edz  deedGf         f
dIZ?	 	 	 dRdJZ@e	 	 	 	 	 	 	 dSdK            ZA xZBS )TTokenizersBackendaQ  
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    NFc                   $ t          |          }|                    dd          }|Lt          j                            |          r-| t
          u sd| j        vs|rt          j        |          |d<   |S |nt          j                            |          rNt          |d          5 }t          j        |          }ddd           n# 1 swxY w Y   |                    di                               d          }|d	vret          |          }t          |d                   }	i |	d
<   |dk    rg |	d<   |	|d<   g |d<   t          j        t          j        |                    }
nt          j        |          }
|
j        |d<   |
j        |d<   |
j        |d<   |
j        
|
j        |d<   |
j        
|
j        |d<   |                    d          }|r|                    dd          dk    r	|d         }nt%          |t&                    s|g}|D ]C}|                    d          dk    r(d|v r$ddl}|                    |d                   |d<    nD|                    di                               d
d          }| j        8t%          |t&                    r"t'          t/          t0          |                    }n| j        j        dk    rFt%          |t&                    r0|r.t%          |d         t&          t0          f          rd |D             }nx| j        j        dk    rd t5          |          D             }nN| j        j        dk    s| j        j        dk    r.t%          |t&                    rd t5          |          D             }||d
<   t7          | dd          }d|                    di           v r,|r*|j        dk    r|d         d         }d  |D             }||d<   |S |                    d!          }|                    d"          }|                    d
          }|                    d          }t%          |t8                    rd|                    d#          rOt          j                            |          r0d$d%lm}  ||&                               |          \  |d
<   |d<   |S t%          |t8                    rt          j                            |          r|                    d'          r	 d$d(lm!}  ||          } |j"        | j        fi |}	 d$d)lm#} |                    | j                  }|tI          |d*          r |j%        dCi |}n=# tL          $ r0}tN          (                    d+| j         d,| d-           Y d}~nd}~ww xY wtI          | d.          r | j)        dCi |}d|vr| t
          u s
d| j        vr|                    d
d          }|                    dd          }|                    d/          pi }||rd0 |*                                D             }|*                                D ]a\  }}tW          |          }t9          |          }|                    |          }|r'||k    r!||vr|                    |          ||<   |||<   btY          j-        |j.        ||1          }|||d<   |j.        j/        }|j0        dk    r|1                    d2|j2        pd3           |j3        dk    r|1                    d4|j4        pd5           |j5        dk    r|1                    d6|j6        pd7           nu# tL          $ rh}tN          (                    d8| d9| d:           d$d;lm7}  |||                    d<          =          } | 8                                |d<   Y d}~nd}~ww xY w|S |At%          |t8                    r,t          j                            |          r||d
<   |d
         }|At%          |t8                    r,t          j                            |          r||d<   |d         }|| j        | j        j        dk    rt%          |t                     rd>tr          tt                   d?t&          t8                   f$fd@$g dA}!tw                      }"|!D ]+}#|#|v r%|"<                     $||#         g                     ,t{          ||"B          }||d<   |S )Dz
        Build a `tokenizers.Tokenizer` backend from the available serialization files (tokenizer.json, sentencepiece
        models, tekken.json, vocab/merges).
        r%   N__init__tokenizer_objectutf-8encodingmodeltype)Nr   vocabr   mergesadded_tokenspost_processortokenizer_paddingtokenizer_truncation_json_truncation_json_padding
normalizerSequencenormalizersPrecompiledprecompiled_charsmapr   _spm_precompiled_charsmapr   c                 ,    g | ]}t          |          S  )tuple).0items     j/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/tokenization_utils_tokenizers.py
<listcomp>z>TokenizersBackend.convert_to_native_format.<locals>.<listcomp>   s    ;;;TU4[[;;;    r#   c                     i | ]\  }}||	S r@   r@   rB   itokens      rD   
<dictcomp>z>TokenizersBackend.convert_to_native_format.<locals>.<dictcomp>   s    CCChaCCCrF   r$   c                 T    i | ]%\  }}t          |t                    r|d          n||&S r   )
isinstancelistrH   s      rD   rK   z>TokenizersBackend.convert_to_native_format.<locals>.<dictcomp>   s8    pppS[STV[E4)@)@KU1XXeQppprF   c                     g | ]H}t          |t                    r"t          |                    d                     nt          |          IS ) )rN   strrA   split)rB   merges     rD   rE   z>TokenizersBackend.convert_to_native_format.<locals>.<listcomp>   sK    rrrbgZs5K5K]%C 0 0111QVW\Q]Q]rrrrF   r&   merges_fileztekken.jsonr   )MistralConverter)r&   .model)SentencePieceExtractor)SLOW_TO_FAST_CONVERTERSconvert_from_spmz,Could not reorder vocab using converter for z due to z/. Falling back to raw SentencePiece extraction.convert_from_spm_modeladded_tokens_decoderc                     i | ]\  }}||	S r@   r@   )rB   rJ   token_ids      rD   rK   z>TokenizersBackend.convert_to_native_format.<locals>.<dictcomp>   s    &\&\&\?5(x&\&\&\rF   )protor1   r2   	bos_token<s>	eos_token</s>	unk_tokenz<unk>z+Could not extract SentencePiece model from z$ using sentencepiece library due to z%. Falling back to TikToken extractor.)TikTokenConverterextra_special_tokens)r&   rf   valuesreturnc                     g }| D ]b}|t          |t          t          f          r|                     |                     @|                    t          |                     c|S N)rN   rO   rA   extendappendrR   )rg   	collectedval_iter_special_tokenss      rD   ro   zHTokenizersBackend.convert_to_native_format.<locals>._iter_special_tokens)  s{    ')	! 3 3C{ !#e}55 3!(()=)=c)B)BCCCC!((S2222  rF   )		pad_tokenrd   r`   rb   	sep_token	cls_token
mask_tokenadditional_special_tokensrf   )skip_tokensr@   )>dictpopospathisfiler(   __dict__TokenizerFast	from_fileopenjsonloadgetfrom_strdumpsr4   padding
truncationrN   rO   base64	b64decoder/   maprA   __name__	enumerategetattrrR   endswithconvert_slow_tokenizerrV   extract_vocab_merges_from_modelrX   extractrY   hasattrrZ   	Exceptionloggerwarningr[   itemsintr   build_tokenizer_from_spm_protor_   trainer_specbos_id
setdefault	bos_pieceeos_id	eos_pieceunk_id	unk_piecere   	convertedr   r   setupdater   )%clstrust_remote_codekwargslocal_kwargsfast_tokenizer_filetokenizer_handletokenizer_json
model_typeminimal_tokenizer_jsonminimal_modeltok_from_filenormalizer_configr9   r   r1   r2   r&   rU   rV   rX   	extractorrY   converter_classer\   id_to_tokenr^   	new_tokencurrent_tokenr+   
proto_specre   	converterspecial_tokens_keysru   keyro   s%                                       @rD   convert_to_native_formatz*TokenizersBackend.convert_to_native_formate   s    F||*../?FF  +233 ,)))Zs|-K-KO`-K/</FGZ/[/[L+, ,@S1T1T, )G<<< =@P!%+;!<!<= = = = = = = = = = = = = = = (++GR88<<VDDJ!222)-n)=)=& $^G%< = =)+g&&&.0M(+2?&w/9;&~6 - 6tzBX7Y7Y Z Z - 78K L L-:-IL)*0=0EL,-3@3KL/0 '33@3K/0$00=0E_- !/ 2 2< @ @  $((66*DD(9-(H%%#$5t<< <):(;%"3  J!~~f-->>CY]gCgCg%DJDTDT&'=>E E%@A "&&w3377FFEy eT** 4 UE!2!233E#y00eT** <u <E!HtUZm9\9\ <;;U;;;E#{22CC)E2B2BCCC#u,,	0Bk0Q0QeT** qpp_hin_o_opppE$)L! gt44J>--gr::::
:zObfkOkOk'0:rrkqrrr)/X&!%%l33
"&&}55  ))!!(++ j#&& 	 :+>+>}+M+M 	 RTRYR`R`akRlRl 	 @@@@@@<L<L%= = =--j99 :L!<#9   j#&& H	 27>>*+E+E H	 *J]J]^fJgJg H	 FIJJJJJJ 32:>>	0y0KKlKK	OOOOOO&=&A&A#,&O&OO&2wPb7c7c2'G'G'W'W,'W'W    NN Ps|  P  P]^  P  P  P        3 899 N#=3#=#M#M#M#ML
 &\99,,,
#,0N0N(,,Wd;;E)--h==F ,8+;+;<R+S+S+YWY((-A(&\&\ekkmm&\&\&\3G3M3M3O3O B B/Hi'*8}}H(+II,7OOH,E,EM, B)1K1KPYafPfPf3899]3K3Ki 08AH 5'3'R'o#%( ( ($
 (3;K%78 &/_%A
%,11(33KAUA^Y^___%,11(33KAUA_Y_```%,11(33KAUA`Y`aaa 
I 
I 
I:* : :rs : : :   FEEEEE--)@P@PQg@h@h  	 4=3F3F3H3H/000000
I   =Z
C88=RW^^J=W=W=$.L! )E>jc::>rw~~k?Z?Z>%0L"!(+F >ci3	8Je8S8SXbchjnXoXo8S	!Xc] 	!tCy 	! 	! 	! 	! 	! 	!
# 
# 
# %(EEK* R R,&&&&';';\#=N<O'P'PQQQ$UDDDF%+L"sV   )C

CC $] %?U% $] %
V/&V] VF:] 
_$A__c           	      V    |                     dd           }|                     dd           }|                     dd            |                     dd           }|                     dd           }|                     dd           }|                    di           }|                    dd	          }	|                    d
          }
|                    d          }|                    d          }d }|t          j        |          }n|5t          j                            |          rt          j        |          }n|t          |                    dd          |fi |}t          |          }|d         d         }|d         }|d         }t          ||          \  }}|                    |           t          |          dk    r|                    |           n| j        ||Qt          |t                     r|nd t#          |          D             }t          t%          ||dd                     }nt          |t                     r!t          t%          |g dd                     }nt          |t&                    rV|rTt          |d         t(          t&          f          r2t          t+          ||                    dd                              }n| j        t-          d          |5|3| j        ,|                    dd           |                    dd           ||| _        | j        t-          d          |                     dd           p| j        j        p|}| | j        j        d=i | |                    d |d                     |                    d!|d"                    |                    d#|d#                    |                    d$|d%                    n| j                                         |                     d&d           p| j        j        p|}| | j        j        d=i | |                    d'|d'                    |                    d(|d)                    |                    d*|d"                    |                    d |d+                    |                    d,|d,                    d-|vrd.|d-<   d/|v pd0|v }|                    d/d	          | _        |                    d0d	          | _        |                     d1d           x}r|| j        _        |p| j        j        d u | _          tC                      j"        d=i | |
|
| _#        |	| _$        | j%        | j        _&        d2 | j'        D               fd3tQ          |)                                d4 5          D             }t'          | j*        +                                          d6 |D             z   }| j,        -                                D ]/}|t]          |          |vr||vr|/                    |           0| j0        D ],}t]          |          |vr||vr|/                    |           -t          |          dk    rg }d7 | j,        -                                D             }|D ]r}t          |t\                    rtc          |d8          }n4t          |tb                    r|j2        st]          |          |v rd|_2        |/                    |           s|r| 3                    |           	 | j        4                                }n# tj          $ r d}Y nw xY w|d9k    rztm          | j        d:d           d|                     dd             | j7        | j        | j8                            dd           f| j8        |                     d;d           d<|| _        | j         p| j        j        d u | _         | j         r| 9                                 d S d S )>Nr7   r8   r>   r+   	gguf_filer%   r\   add_prefix_spaceFr&   r1   r2   name_or_path configr   	tokenizertokenizer_configr   c                      i | ]\  }\  }}||S r@   r@   )rB   rI   w_s       rD   rK   z.TokenizersBackend.__init__.<locals>.<dictcomp>q  s%    CkCkCkYQPVQRTUAqCkCkCkrF   T)r1   r2   fuse_unkdropoutr   )r1   r   a9  Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one.r`   ra   rb   rc   z3The backend tokenizer is not correctly initialized.r6   
max_lengthtruncation_side	directionstridetruncation_strategystrategyr5   rp   pad_token_type_idpad_type_idpadding_sidelengthpad_to_multiple_ofbackend
tokenizersadd_bos_tokenadd_eos_tokenr4   c                 F    h | ]}t          t          |                    S r@   hashreprrB   rJ   s     rD   	<setcomp>z-TokenizersBackend.__init__.<locals>.<setcomp>  s&    $^$^$^5T$u++%6%6$^$^$^rF   c                 V    g | ]%\  }}t          t          |                    v#|&S r@   r   )rB   indexrJ   added_tokens_decoder_hashs      rD   rE   z.TokenizersBackend.__init__.<locals>.<listcomp>  sA     
 
 
uDKK  (AAA AAArF   c                     | d         S Nr   r@   )xs    rD   <lambda>z,TokenizersBackend.__init__.<locals>.<lambda>  s    STUVSW rF   r   c                 ,    g | ]}t          |          S r@   rR   r   s     rD   rE   z.TokenizersBackend.__init__.<locals>.<listcomp>  s    ;b;b;b5CJJ;b;b;brF   c                 0    g | ]}|t          |          S r@   r   )rB   ts     rD   rE   z.TokenizersBackend.__init__.<locals>.<listcomp>  s$    WWW1UVWAWWWrF   )speciali pre_tokenizerfix_mistral_regex)init_kwargsr   r@   ):rw   r   copydeepcopyrx   ry   rz   r|   r}   r   r   r   r   len
_tokenizerrN   rv   r   r   rO   rA   r   
ValueErrorr   r   enable_truncationno_truncationr   enable_padding_add_bos_token_add_eos_tokenr4   _should_update_post_processorsuperr*   r&   r   split_special_tokensencode_special_tokensr\   sortedr   added_tokens_encoderkeys_special_tokens_maprg   rR   rl   _extra_special_tokensr   r   
add_tokensget_vocab_sizeNotImplementedErrorr   _patch_mistral_regexr   update_post_processor)"selfargsr   r7   r8   r+   r   r   r\   r   r&   r1   r2   fast_tokenizer	gguf_path
gguf_paramarchitecturetokenizer_dictr   additional_kwargs
vocab_dict_truncation_paddingexplicit_bos_eos_in_kwargsr4   tokens_to_addencoderspecial_token_valuerJ   tokensall_named_tokens
vocab_sizer   	__class__s"                                   @rD   r*   zTokenizersBackend.__init__H  s    "::&8$??

?D99 	

.555!::&8$??JJ{D11	$jj)94@@%zz*@"EE!::&8%@@ZZ--


7##H%%'!]+;<<NN ,@S1T1T,*45HIINN"#FJJ~r$B$BIXXQWXXI-i88J%h/=L'4N)*<=0F|Uc0d0d-N-MM*+++$%%))/000_$):!&0&=&=kUUCkCkZcdiZjZjCkCkCk
!.sF]ako/p/p/p!q!qE4(( e!.srTXbf/g/g/g!h!hE4(( eU ez%(UTXM7Z7Z e!.wU6::V^`aKbKb/c/c/c!d!d_$r   &+;+CH_k5111k6222%,DO?"RSSSjj!7>>p$/B\p`p"-DO-<<<<<lK,EFFF/[1IJJJhH(=>>>3[5LMMMMO))+++::1488dDO<SdWd*DO*66X666k8K+@AAA18M3JKKKnh{.CDDDlHX,>???2H=Q4RSSS F"" ,F9%4%>%[/U[B["$jj%@@$jj%@@#ZZ(8$???> 	<-;DO*-G-q4?KimqKq*""6"""!(DO 0040I-$^$^DD]$^$^$^!
 
 
 
 &';'A'A'C'C X X X
 
 

 t0557788;b;bTa;b;b;bb $(#;#B#B#D#D 	: 	:"*&''w66;NVc;c;c$$%8999 / 	, 	,E5zz((U--G-G$$U+++}!!FWW0H0O0O0Q0QWWW& % %eS)) -&ud;;;EEz22 - = -SZZ;K-K-K(,e$$$$ ('''	7799JJ" 	 	 	JJJ	 74?OT#R#R#^JJ{D)))7d7 $$^T:: !,"(**-@$"G"G	 
  DO .X$/2PTX2X 	* - 	)&&(((((	) 	)s   ]   ]/.]/rh   c                     dS )NTr@   r  s    rD   is_fastzTokenizersBackend.is_fast  s    trF   c                     d| j         v r]| j         d                             d          r=t          | d          r+| j        r$t          j                            | j                  S dS dS )z
        `bool`: Whether or not the slow tokenizer can be saved. For a sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        r&   rW   FT)vocab_files_namesr   r   r&   rx   ry   rz   r  s    rD   can_save_slow_tokenizerz)TokenizersBackend.can_save_slow_tokenizer  si     4111d6L\6Z6c6cdl6m6m1t\** 7t 7w~~do66654rF   save_directoryfilename_prefixc                    t           j                            |          s t                              d| d           d S t           j                            ||r|dz   ndt          d         z             }t           j                            | j                  t           j                            |          k    rt          | j        |           |fS )NzVocabulary path (z) should be a directory-r   r&   )
rx   ry   isdirr   errorjoinVOCAB_FILES_NAMESabspathr&   r   )r  r  r  out_vocab_files       rD   save_vocabularyz!TokenizersBackend.save_vocabulary  s    w}}^,, 	LLT^TTTUUUFoM_s222QbcoQpp
 
 7??4?++rw~/N/NNNT_n555  rF   c                    | j         }| j        }|| j        rd| _        | j        }| j        }|| j        rd| _        | j        r|dz   nd d| j        rd|z   dz   nd }| | j        rd|z   dz   nd d	| j        rd|z   dz   nd }g }| j        r|                    ||f           | j        r|                    ||f           t          j        |||
          | j	        _
        dS )ze
        Updates the underlying post processor with the current `bos_token` and `eos_token`.
        NFz:0 r   z$A:0rQ   z:0z:1z $B:1)singlepairspecial_tokens)r`   bos_token_idr   rb   eos_token_idr   rl   r	   TemplateProcessingr   r4   )r  bosr+  eosr,  r(  r)  r*  s           rD   r  z'TokenizersBackend.update_post_processor
  s]    n(;4-;!&Dn(;4-;!&D%)%7?S5[[Rww[_[mEucCiRVFVFVsuww  D0BJ39t++  D  Dgkgy  RBRUX[R[^bRbRb  @B  D  D 	7!!3"5666 	7!!3"5666)3)F^*
 *
 *
&&&rF   c                 $    t          | dd          S )Nr   Fr   r  s    rD   r   zTokenizersBackend.add_eos_token$      t-u555rF   c                 $    t          | dd          S )Nr   Fr1  r  s    rD   r   zTokenizersBackend.add_bos_token(  r2  rF   c                 f    t                               | d|           |                                  d S )Nr   object__setattr__r  r  values     rD   r   zTokenizersBackend.add_eos_token,  3    4!15999""$$$$$rF   c                 f    t                               | d|           |                                  d S )Nr   r5  r8  s     rD   r   zTokenizersBackend.add_bos_token1  r:  rF   c                    g }| j                                         D ]j}|t          |t                    r|                    |           0t          |t
                    r%|                    t          |dd                     k| j        D ]g}t          |t                    r|                    |           -t          |t
                    r%|                    t          |dd                     h|r|                     |d           t          | dd          s| j	        j
        |                                  dS dS )a[  
        Post-initialization hook that runs after the tokenizer is fully set up.
        This is called by from_pretrained() after loading the tokenizer, which allows
        us to add any special tokens that may have been passed as AddedToken objects.

        Child classes should call super()._post_init() if they override this method.
        NTF)r   
normalized)r*  r   )r   rg   rN   r   rl   rR   r   r   r   r   r4   r  )r  r  token_valuerJ   s       rD   
_post_initzTokenizersBackend._post_init6  sl    3::<< 	^ 	^K"+z22 ^$$[1111K-- ^$$ZTV[%\%\%\]]] / 	X 	XE%,, X$$U++++E3'' X$$ZtPU%V%V%VWWW 	@OOM$O???48$?? 	)4?CaCi&&((((( DjCirF   c                 8    | j                             d          S )zP
        `int`: Size of the base vocabulary (without the added tokens).
        Fwith_added_tokensr   r   r  s    rD   r  zTokenizersBackend.vocab_sizeV  s    
 ---FFFrF   c                 8    | j                             d          S )NTrA  )r   	get_vocabr  s    rD   rE  zTokenizersBackend.get_vocab]  s    ((4(@@@rF   c                 *    |                                  S rj   )rE  r  s    rD   r1   zTokenizersBackend.vocab`  s    ~~rF   c                 h    d t          | j                                        d           D             S )z
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        c                 $    i | ]\  }}|j         |S r@   contentrB   vks      rD   rK   z:TokenizersBackend.added_tokens_encoder.<locals>.<dictcomp>j       mmmA	1mmmrF   c                     | d         S r   r@   rC   s    rD   r   z8TokenizersBackend.added_tokens_encoder.<locals>.<lambda>j      dhijdk rF   r   r   r\   r   r  s    rD   r   z&TokenizersBackend.added_tokens_encoderd  s9     nm0I0O0O0Q0QWkWk)l)l)lmmmmrF   c                 4    | j                                         S )z
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `dict[str, int]`: The added tokens.
        )r   get_added_tokens_decoderr  s    rD   r\   z&TokenizersBackend.added_tokens_decoderl  s     77999rF   c                 h    d t          | j                                        d           D             S )z
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `dict[str, int]`: The added tokens.
        c                 $    i | ]\  }}|j         |S r@   rI  rK  s      rD   rK   z5TokenizersBackend.get_added_vocab.<locals>.<dictcomp>  rN  rF   c                     | d         S r   r@   rP  s    rD   r   z3TokenizersBackend.get_added_vocab.<locals>.<lambda>  rQ  rF   r   rR  r  s    rD   get_added_vocabz!TokenizersBackend.get_added_vocab{  s9     nm0I0O0O0Q0QWkWk)l)l)lmmmmrF   c                     dS )zN
        Returns True, to avoid expensive `assert tokenizer` gotchas.
        Tr@   r  s    rD   __bool__zTokenizersBackend.__bool__  s	     trF   c                 8    | j                             d          S )zD
        Size of the full vocabulary with the added tokens.
        TrA  rC  r  s    rD   __len__zTokenizersBackend.__len__  s     ---EEErF   c                     | j         S )zc
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        )r   r  s    rD   backend_tokenizerz#TokenizersBackend.backend_tokenizer  s    
 rF   c                     | j         j        S )zU
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        )r   decoderr  s    rD   r`  zTokenizersBackend.decoder  s    
 &&rF   Tr.   return_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosec	                 F   |	d| j         v }|	d| j         v }|r|j        |g|j        z   }	n|g}	t          t                    }
|	D ]}|
d                             |j                   |r |
d                             |j                   |r |
d                             |j                   |r |
d                             |j                   |r |
d                             |j	                   |r-|
d                             t          |j                             |
|	fS )a  
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        Ntoken_type_idsattention_mask	input_idsspecial_tokens_maskoffset_mappingr   )model_input_namesoverflowingr   rO   rl   idstype_idsrj  rl  offsetsr   )r  r.   ra  rb  rc  rd  re  rf  rg  	encodingsencoding_dictr   s               rD   _convert_encodingz#TokenizersBackend._convert_encoding  sV   ( !($48N$N! ($48N$N!$ 	#)=)I!
X%99II!
I#D)) 	; 	;A+&--ae444$ C./66qzBBB$ I./66q7GHHH) S34;;A<QRRR% B./66qyAAA ;h'..s15zz:::i''rF   rJ   c                 L    | j                             |          }|| j        S |S rj   )r   token_to_idunk_token_id)r  rJ   r   s      rD   #_convert_token_to_id_with_added_vocz5TokenizersBackend._convert_token_to_id_with_added_voc  s*    ++E22=$$rF   r   c                 P    | j                             t          |                    S rj   )r   r   r   )r  r   s     rD   _convert_id_to_tokenz&TokenizersBackend._convert_id_to_token  s    **3u::666rF   
new_tokensc                 n    |r| j                             |          S | j                             |          S rj   )r   add_special_tokensr   )r  r|  r*  s      rD   _add_tokenszTokenizersBackend._add_tokens  s7     	B?55jAAA))*555rF   r)  c                 6    | j                             |          S )aG  
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        )r   num_special_tokens_to_add)r  r)  s     rD   r  z+TokenizersBackend.num_special_tokens_to_add  s    & 88>>>rF   rp  skip_special_tokensc                 <   t          |t                    r| j                            |          S g }|rt	          | j                  nt	                      }|D ]C}t          |          }||v r|                    | j                            |                     D|S )a  
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `list[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `list[str]`: The decoded token(s).
        )rN   r   r   r   r   all_special_idsrl   )r  rp  r  r  ids_to_skipr   s         rD   convert_ids_to_tokensz'TokenizersBackend.convert_ids_to_tokens  s     c3 	4?..s3333FQc$.///CEE 	> 	>EJJE##MM$/55e<<====rF   textr~  c                 H     | j         d|||d|                                S )N)r  	text_pairr~  r@   )_encode_plusr  )r  r  r)  r~  r   s        rD   tokenizezTokenizersBackend.tokenize  s2     t lddOallekllssuuurF   padding_strategyr   r   r   r   r   c                    | j         j        | j         j        }|t          j        k    r| j                                          n<|||j        | j        d}d}	nfd|D             }	|	|k    r | j         j        di | |t          j
        k    r|| j                                          dS dS |t          j        k    r|nd}
|
||n| j        | j        | j        | j        |d}||k    r | j         j        di | dS dS )a  
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        N)r   r   r   r   c                 >    i | ]}|                     |d           S rj   r   )rB   rM  r  s     rD   rK   z@TokenizersBackend.set_truncation_and_padding.<locals>.<dictcomp>D  s)    GGG11kooa66GGGrF   )r   r   pad_idrp   r   r   r@   )r   r   r   r   DO_NOT_TRUNCATEr   r9  r   r   r    
DO_NOT_PAD
no_padding
MAX_LENGTHr   pad_token_idrp   r   r   )r  r  r   r   r   r   r   r  targetcurrentr   r  s              @rD   set_truncation_and_paddingz,TokenizersBackend.set_truncation_and_padding  si   B o0?*"4"DDD&--/// ) /5!1	 F "GGGGGGG&  11;;F;;;999#**,,,,, $# $47Q#Q#QZZW[F -9-E\\4K\+!^#5&8 F 6!!..8888888 "!rF   r   r  is_split_into_wordsreturn_tensorsr   c                    # d } ||          st          d          | ||          st          d          |rAt          |t          t          f          o#|o!t          |d         t          t          f          }nt          |t          t          f          }|rt          |t                    rt          d          |Pt          |          t          |          k    r0t          dt          |           dt          |           d          |t          t          ||                    n|}n
|r||fgn|g}t          |t          t          f          s t          dt          |           d	           	                    |||||	|

           | j
        } j        j        |k    r| j        _         j                            |||          } fd|D             }i }|d         d         D ]##fd|D             }||#<   d |D             }r;g }t          |          D ]$\  }\  }} ||gt          |d                   z  z  }%||d<   |d         D ]}!                     |!|           t!          |||          }"|s5|3s1t!          d |"                                D             |"j                  }"|"S )Nc                 r   t          | t                    rdS t          | t          t          f          rt	          |           dk    rdS t          | d         t                    rdS t          | d         t          t          f          rt	          | d                   dk    s!t          | d         d         t                    rdS t          | d         d         t          t          f          rFt	          | d         d                   dk    p&t          | d         d         d         t                    S dS dS dS )NTr   F)rN   rR   rO   rA   r   )r   s    rD   _is_valid_text_inputz<TokenizersBackend._encode_plus.<locals>._is_valid_text_inputq  s   !S!! tAe}-- q66Q;;4!c** 
!4!tUm44 !1Q4yyA~~AaDGS)A)A~#t#AaDGdE];; %"1Q47||q0OJqtAwqz34O4OO$u 5urF   ztext input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples) or `list[tuple[list[str], list[str]]]` (batch of pretokenized sequence pairs).r   zdwhen tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`.zbatch length of `text`: z- does not match batch length of `text_pair`: .z:batch_text_or_text_pairs has to be a list or a tuple (got ))r  r   r   r   r   r   )r~  is_pretokenizedc                 J    g | ]}                     |	            S ))r.   ra  rb  rc  rd  re  rf  rg  )ru  )
rB   r.   rb  rf  re  rc  rd  ra  r  rg  s
     rD   rE   z2TokenizersBackend._encode_plus.<locals>.<listcomp>  sX      
  
  
  ""!&;&;*C+E'=+ # 	 	 
  
  
rF   c                 0    g | ]\  }}|         D ]}|S r@   r@   )rB   rC   r   r   r   s       rD   rE   z2TokenizersBackend._encode_plus.<locals>.<listcomp>  s.    NNN74DINNqQNNNNrF   c                 "    g | ]\  }}|D ]}|S r@   r@   )rB   r   rC   r   s       rD   rE   z2TokenizersBackend._encode_plus.<locals>.<listcomp>  s)    SSSWQdSSqSSSSrF   rk  overflow_to_sample_mapping)tensor_typec                     i | ]>\  }}|t          |          d k    r#t          |d          t                    r|d          n|?S rM   )r   rN   rO   )rB   r   r9  s      rD   rK   z2TokenizersBackend._encode_plus.<locals>.<dictcomp>  sW       "U c%jj1nnE!Hd9S9Sn%((Y^  rF   )r   rN   rO   rA   rR   	TypeErrorr   zipr0   r  r   r   r   encode_batchr   &_eventual_warn_about_too_long_sequencer   r   rs  )$r  r  r  r~  r  r   r   r   r  r   r   r  ra  rb  rc  rd  re  rf  rg  r   r   r  
is_batchedbatch_text_or_text_pairsrs  tokens_and_encodingssanitized_tokensstacksanitized_encodingsr  rI   toksr   rk  batched_outputr   s$   `           ```````                @rD   r  zTokenizersBackend._encode_plusY  s   0	 	 	( $#D)) 	W  
  )=)=i)H)H W    	9#D4-88hThjQUVWQX[_afZgFhFhJJ#D4-88J 	T)S))    $Tc)nn)D)D *s4yy * *I* * *   FOEZtCi,@,@'A'A'A`d$$ ?H'Sy(9':':dV$ 2UDMBB 	nTRjMkMknnn   	''- 3!1% 	( 	
 	
 	
  '#'#< ?04HHH4HDO1 O00$1/ 1 
 
	 
  
  
  
  
  
  
  
  
  
  
 & 
  
  
 '*1- 	* 	*CNNNN&:NNNE$)S!!SS0DSSS % 	X)+& )*> ? ? K K9D!*qcC[8I4J4J.JJ**=W9:)+6 	X 	XI77	:wWWWW&'79LZhiii  	n4=V4* &4&:&:&<&<   ( N rF   r  c                     | j         j        | j         j                            |          nd                    |          S )NrQ   )r^  r`  decoder"  )r  r  s     rD   convert_tokens_to_stringz*TokenizersBackend.convert_tokens_to_string  s@     %-9 "*11&999&!!	
rF   	token_idsclean_up_tokenization_spacesc                    |                     dd            t          |t                    r|g}t          |t                    r|d         }| j                            ||          }||n| j        }|rgt          | j        j	                  j
        dk    r0| j        s)t                              d| j        j
         d           n|                     |          }|S )Nuse_source_tokenizerrk  )r  r   z=Ignoring clean_up_tokenization_spaces=True for BPE tokenizer aE  . The clean_up_tokenization post-processing step is designed for WordPiece tokenizers and is destructive for BPE (it strips spaces before punctuation). Set clean_up_tokenization_spaces=False to suppress this warning, or set clean_up_tokenization_spaces_for_bpe_even_though_it_will_corrupt_output=True to force cleanup anyway.)rw   rN   r   rv   r   r  r  r0   r^  r/   r   Gclean_up_tokenization_spaces_for_bpe_even_though_it_will_corrupt_outputr   warning_oncer  clean_up_tokenization)r  r  r  r  r   r  s         rD   _decodezTokenizersBackend._decode  s    	

)4000i%% 	$"Ii&& 	/!+.I%%iEX%YY ,7 )(2 	%
 ( 	8
 T+122;uDDd E ##-/- - -    11$77rF   
file_names.legacy_formatc                     t          |          }t          j                            ||r|dz   ndt          z             }| j                            |           ||fz   }|S )Nr  r   )rR   rx   ry   r"  TOKENIZER_FILEr^  save)r  r  r  r  r  r%   s         rD   _save_pretrainedz"TokenizersBackend._save_pretrained%  sl     ^,,oM_s222Q__
 
 	##N333>"33
rF   c           	         t          j        | j                                                  }|                    d          }|                    d          }	d}
|d         d         dk    ri |d         d<   g |d         d<   n|d         d         d	k    r^|d         d
         O|d         d
         }|d         d         |         d         }
|
v r|
         }
d|d         d
<   |
dgg|d         d<   n;|d         d         dv ri |d         d<   nt          d|d         d          d          7d|d         v r-|d         d         v r|d         d                  |d         d<   t          j        t          j        |                    g }|D ]}|                    dd          }|                    dd          }|d         d         d	k    r|sC|d         v r|d                  |d<   |	                    t          d*i |           ||                    |           |d         d         dk    r#d|vr|d         d         |d         d         |d<   |d         d         dk    r#d|vr|d         d         |d         d         |d<   |d         d         d	k    r|
|
|d<   |d         t|d         d         dk    sA|d         d         dk    rPd|d         v rFt          d |d         d         D                       r!t          j                                        |d<   t           |d         d                  } |d*||d|}                    |||           |	,t          j                                                  }d|	v r|	d         D ]}|	d         |         d         }fd |D             }||	d         |         d<   |D ](}                    |          }|t          d!          )fd"|D             |	d         |         d#<   d$D ]L}||	v rF|	|         \  }}|v r|         }                    |          }|t          d!          ||g|	|<   M|	|d<   t          j        t          j        |                    | j                                        }t*          j        D ]}t/          | |          t/          | |          }|v r|         }| j                            |d          }t5          |t                    r-t          ||j        |j        |j        |j        d%&          ||<   |||<   | j        r| j                                        ng }||                    |           tA          |          dk    r||d'<   |d(<   	  | j!        d*i |S # tD          $ rH}d)tG          |          v r1|                    d(d            | j!        d*i |}|_        |cY d}~S  d}~ww xY w)+uf  
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `list[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.

        r3   r4   Nr/   r0   r   r1   r2   r   r   r   g        )r#   r$   z;This method does not support this type of tokenizer (found z-) only BPE, Unigram, WordLevel and WordPiece.rd   r   idrJ  continuing_subword_prefixend_of_word_suffixr   	ByteLevelr:   pretokenizersc              3   .   K   | ]}|d          dk    V  dS )r0   r  Nr@   )rB   pretokenizers     rD   	<genexpr>z<TokenizersBackend.train_new_from_iterator.<locals>.<genexpr>  s@        $ !(K7     rF   initial_alphabet)r  r*  )r   trainerr*  r  c                 <    g | ]}                     ||          S r@   r  )rB   rJ   special_tokens_maps     rD   rE   z=TokenizersBackend.train_new_from_iterator.<locals>.<listcomp>  s*    ![![![5"4"8"8"F"F![![![rF   zQAttempted to set a token in the post processor that does not exist in the mappingc                 :    g | ]}                     |          S r@   )rw  )rB   rJ   r   s     rD   rE   z=TokenizersBackend.train_new_from_iterator.<locals>.<listcomp>  s)    CuCuCuejIDYDYZ_D`D`CuCuCurF   rp  )r   sepT)single_wordlstriprstripr=  r   rf   r+   z7multiple values for keyword argument 'tokenizer_object'r@   )$r   loadsr   to_strrw   r   r|   r   r   rl   r   rk   anypre_tokenizers_fastr  alphabetMODEL_TO_TRAINER_MAPPINGtrain_from_iteratorrw  r   r   r   SPECIAL_TOKENS_ATTRIBUTESr   r   r   rN   r  r  r  r=  rf   r   r  r  rR   )r  text_iteratorr  r   new_special_tokensr  r   r   r3   r4   rd   r   r*  added_tokenr   r   trainer_classr  trained_tokenizer_jsonr   r  rJ   r^   special_tokenspecial_token_fullrf   r   new_tokenizerr   s        `                      @rD   train_new_from_iteratorz)TokenizersBackend.train_new_from_iterator6  s   D DO$:$:$<$<==%)).99'++,<==	'"6*e33/1N7#G,02N7#H--G$V,	99g&x0<'0:*73G<VDQG	%1iCU6U6U 29 =I45w'15>4D3Ew'0G$V,0JJJ/1N7#G,,>n]dNeflNm > > >   *~g666w'48JJJ3EnU\F]^iFj3kN7#K0!*4:n+E+EFF	 ' 	= 	=K!ooi66Gd++Ag&v.);;G;!-+i2HL^2^2^);K	<R)SI&!!*";";{";";<<<<)!!"4555 7#F+u44+699w'(CDP2@2IJe2fF./7#F+u44$F22w'(<=I+9'+BCW+XF'('"6*i77I<Q"+F;/*6/7;FF!/26:jHH#~o'FFF  (6(G(X     G .A-J-S-S-U-U)*01H1PQ-_:n__X^__%%mFG%TTT%%)Z	0@0@0B0B%C%C">11)*:; v vC+,<=cB8LF)5![![![![TZ![![![FLN#34S9(C!'  #,#8#8#?#?#+", s# #  ,
 DvCuCuCuntCuCuCuN#34S9%@@!/ 
F 
F N22-m<HE1)5%CU:U:U 25 9(44U;;H'(o   6;H4EN=17E"#34%.tz:P/Q/QRRI!&&((,F 	2 	2EtU##/ 'e 4 4%1mGY6Y6Y$6}$EM%)%=%A%A%%N%N"0*== 2$.%$6$B1818#5#@ $% % %F5MM %2F5M DHC\dt8==???bd) ''(:;;;#$$q((-AF)* &/!"	!4>++F+++ 	 	 	HCPQFFRR 

-t444 . 8 8 8 8+4($$$$$$$ 	s$   !V. .
X 8<W;4X :W;;X c
           
      H   ddl ddlm} ddlm ddlm} ddlm}  |d          d	t          d
t          ffd            }|st                      rd}|G|s|sB ||          r6 ||d|||dd|          }d}|t          |d          5 }t          j        |          }ddd           n# 1 swxY w Y   |                    d          }|                    d          }|r7|                    |          |                    d          k     r|r||dvr|S n0|r.|                    |          |                    d          k    r|S d}|s|sH ||          r<|rd|v rt#          |d|d                    |	At%          |dd          s0t#          |dd           t&                              d| d           n|	du st%          |dd          rt#          |dd           ddl}|j                            |                    d          d          }|j        }t5          ||j        j                  r||j        d<   nWt5          ||j        j                  r|j                            dd          }|j                            ||g          |_        |S )af  
        Patches mistral related tokenizers with incorrect regex if detected
            1) Local file with an associated config saved next to it
                >> Model type one of the mistral models (on older versions)
            2) Remote models on the hub from official mistral models
                >> Tags including `base_model:.*mistralai`
        r   N)	lru_cache)
model_info)versionr      )maxsizemodel_idrh   c                     	  |           }n# t           $ r Y dS w xY w|j        0                    dd                    |j                            rdS dS )NFzbase_model:.*mistralair   T)r   tagssearchr"  )r  r/   r  res     rD   is_base_mistralz?TokenizersBackend._patch_mistral_regex.<locals>.is_base_mistral  st    "
8,,   uu z%995rwwuz7J7JKK  45s    
Tzconfig.jsonF)	cache_dirrJ   local_files_only%_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errors_commit_hashr,   r-   transformers_versionr   z5.0.0)mistralmistral3voxtral	ministralpixtralr   z$The tokenizer you are loading from 'a  ' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.z[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+isolated)patternbehavior)r   	use_regex)r  	functoolsr  huggingface_hubr  	packagingr  transformers.utils.hubr   rR   boolr   r~   r   r   r   parsesetattrr   r   r   r   pre_tokenizersSplitRegexr   rN   r:   	Metaspacer  )r   r   pretrained_model_name_or_pathrJ   r  r  r  is_localr   r   r   r  r  r   r  _config_filemistral_config_detectedf_configr  transformers_model_typer   split_pretokenizercurrent_pretokenizerr  r  s                           @@rD   r  z&TokenizersBackend._patch_mistral_regex  s    * 				''''''......%%%%%%666666	3					c 		d 		 		 		 		 		 		 
 			  	00 	H(4 5% 5*9/:W*X*X 5 ';-#!16;8=)	 	 	L ',#',999 +Q"illG+ + + + + + + + + + + + + + +'.{{3I'J'J$*1++l*C*C'
 ( %GMM:N,O,ORYR_R_`gRhRh,h,h )3?3    )() %gmm<P.Q.QU\UbUbcjUkUk.k.k$$*.'& *x *OOLi<j<j * ^#6+#E#EI':KH[<\]]] %,WYH[]b5c5c,I':EBBBNNe?\ e e e   
 '$..')EXZ_2`2`.I':DAAA%%%%)3)B)H)H * 0 0 s! ! ",	 *I * *& ,5+B(!"6
8Q8Z[[ 5G	/22 &&:J<U<_`` 3=3L3V3V16% 4W 4 40
 3=2K2T2T 2 43 3	/ s   B55B9<B9)Frj   )NNFFFFT)NF)FN)NN)NNN)NNFNFNN)Cr   
__module____qualname____doc__r#  r  r/   r   classmethodr   r*   propertyr  r  r  rR   rA   r&  r  r   r   setterr?  r   r  rv   rE  r1   r   r   r\   _added_tokens_encoder_added_tokens_decoderrX  rZ  r\  r|   r^  DecoderFastr`  EncodingFastr   rO   ru  ry  r{  r  r  r  r  r    r   r  r  r  r   r   r   r  r  r  rx   PathLiker  r  r  __classcell__)r  s   @rD   r(   r(   S   sO       
 
 *EJ` ` ` [`Da) a) a) a) a)F     X     X! !c !C$J !Z_`cZd ! ! ! !
 
 
4 6 6 X6 6 6 X6 % % % % % %) ) )@ GC G G G XGA4S> A A A A  tCH~       X  nd38n n n n Xn :d3
?&; : : : X: 10nc3h n n n n$    F F F F F =    X ' ' ' ' X' .2-1*/+0',#-( -(-(  $d{-(  $d{	-(
 $(-( %)-( !%-( -( -( 
tCH~tL11	2-( -( -( -(^     7# 7#* 7 7 7 76 6d3+;&< 6WZ 6 6 6 6? ?d ?s ? ? ? ?* tCy t `cfjknfo`o    4v vS vd
 vt vjnorjs v v v vI9)I9 0I9 	I9
 I9  $JI9 DjI9 I9 I9 I9\ gk#',;,F2D2T!%$))-#'&*-1-1*/+0',#,0)X X++d9o=EV@WWX 004	?BTJ[E\\_ccX !	X
 *X 0X $JX X "X  $JX DjX tX  $d{X  $d{X $(X  %)!X" !%#X$ %X& 'X( #Tk)X, 
-X X X Xt
tCy 
S 
 
 
 
 %*48	) )c?) ") '+Tk	) 
) ) ) )^ &*&* bk) #s(O d{	
 t 
sCx   * C C C CJ 
 C C C [C C C C CrF   r(   )Cr  r   r   rx   collectionsr   collections.abcr   shutilr   typingr   tokenizers.pre_tokenizersr	  r  r  r   r   r   r	   r
   r  r   r|   tokenizers.decodersr   r  tokenizers.modelsr   r   tokenizers.trainersr   r   r   r   r  r   r   r   integrations.ggmlr   modeling_gguf_pytorch_utilsr   tokenization_utils_baser   r   r   r   r   r   r   utilsr    r!   r"   
get_loggerr   r   r  SPECIAL_TOKENS_MAP_FILETOKENIZER_CONFIG_FILETIKTOKEN_VOCAB_FILEADDED_TOKENS_FILEr  r#  r(   PreTrainedTokenizerFastr@   rF   rD   <module>r4     s   
   				 # # # # # # $ $ $ $ $ $             7 7 7 7 7 7 + + + + + + - - - - - - - - / / / / / / 1 1 1 1 1 1 6 6 6 6 6 6 * * * * * * * * ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ . . . . . . 0 0 0 0 0 0 5 5 5 5 5 5 = = = = = =                  @ ? ? ? ? ? ? ? ? ? 
	H	%	% "3 / '  (      !!	   (6EXYY  ,--k k k k k/ k k .-k^) ,   rF   