
    ёi@                        S SK Jr  S SKrS SKrS SKJrJr  S SKrS SK	J
r
  S SKJr  \(       a  S SKJr  \S   r\S   r/ rSrS	r " S
 S\5      rg)    )annotationsN)TYPE_CHECKINGLiteral)_check_exists_and_download)DatasetNGRAMSEQtraintestz<https://dataset.bj.bcebos.com/imikolov%2Fsimple-examples.tgz 30177ea32e27c525793142b6bf2c8e2dc                      \ rS rSr% SrS\S'   S\S'   S\S'   S	\S
'   S\S'   S\S'         S             SS jjrSS jrSS jrSS jr	    SS jr
SS jrSrg)Imikolov$   a  
Implementation of imikolov dataset.

Args:
    data_file(str|None): path to data tar file, can be set None if
        :attr:`download` is True. Default None.
    data_type(str): 'NGRAM' or 'SEQ'. Default 'NGRAM'.
    window_size(int): sliding window size for 'NGRAM' data. Default -1.
    mode(str): 'train' 'test' mode. Default 'train'.
    min_word_freq(int): minimal word frequencies for building word dictionary. Default 50.
    download(bool): whether to download dataset automatically if
        :attr:`data_file` is not set. Default True

Returns:
    Dataset: instance of imikolov dataset

Examples:

    .. code-block:: python

        >>> # doctest: +TIMEOUT(60)
        >>> import paddle
        >>> from paddle.text.datasets import Imikolov

        >>> class SimpleNet(paddle.nn.Layer):
        ...     def __init__(self):
        ...         super().__init__()
        ...
        ...     def forward(self, src, trg):
        ...         return paddle.sum(src), paddle.sum(trg)


        >>> imikolov = Imikolov(mode='train', data_type='SEQ', window_size=2)

        >>> for i in range(10):
        ...     src, trg = imikolov[i]
        ...     src = paddle.to_tensor(src)
        ...     trg = paddle.to_tensor(trg)
        ...
        ...     model = SimpleNet()
        ...     src, trg = model(src, trg)
        ...     print(src.item(), trg.item())
        2076 2075
        2076 2075
        675 674
        4 3
        464 463
        2076 2075
        865 864
        2076 2075
        2076 2075
        1793 1792


str | None	data_file_ImikolovDataType	data_typeintwindow_size_ImikolovDataSetModemodemin_word_freqdict[str, int]word_idxNc                   UR                  5       S;   d
   SU 35       eUR                  5       U l        UR                  5       S;   d
   SU 35       eUR                  5       U l        X0l        XPl        Xl        U R                  c*  U(       d   S5       e[        U[        [        SU5      U l        U R                  U5      U l        U R                  5         g )Nr   z,data type should be 'NGRAM', 'SEQ', but got r   z(mode should be 'train', 'test', but got z;data_file is not set and downloading automatically disabledimikolov)upperr   lowerr   r   r   r   r   URLMD5_build_work_dictr   
_load_anno)selfr   r   r   r   r   downloads          ]/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/text/datasets/imikolov.py__init__Imikolov.__init__c   s       %
 
 	F :)E	F 
 #*zz|  
 
 	= 6dV<	= 
 JJL	&*">>! M8 83ZDN
 --m< 	    c                    Uc  [         R                  " [        5      nU HN  nUR                  5       R	                  5        H  nX$==   S-  ss'   M     US==   S-  ss'   US==   S-  ss'   MP     U$ )N   <s><e>)collectionsdefaultdictr   stripsplit)r%   f	word_freqlws        r'   
word_countImikolov.word_count   sm    #//4IAWWY__&! 'e!e!	  r*   c                D   SnSn[         R                  " U R                  5       nUR                  U5      nUR                  U5      nU R	                  X`R	                  U5      5      nSU;   a  US	 UR                  5        Vs/ s H  oS   U R                  :  d  M  UPM     nn[        US S9n	[        [        U	6 5      u  p[        [        [        U
[        [        U
5      5      5      5      5      n[        U
5      US'   S S S 5        U$ s  snf ! , (       d  f       W$ = f)Nz$./simple-examples/data/ptb.train.txtz$./simple-examples/data/ptb.valid.txt<unk>r,   c                    U S   * U S   4$ )Nr,   r    )xs    r'   <lambda>+Imikolov._build_work_dict.<locals>.<lambda>   s    1qt}r*   )key)tarfileopenr   extractfiler7   itemsr   sortedlistzipdictrangelen)r%   cutofftrain_filenametest_filenametftrainftestfr4   r=   word_freq_sortedwords_r   s                r'   r#   Imikolov._build_work_dict   s   ?>\\$..)R^^N3FNN=1Ev/FGI)#g& %??,,a!t7I7I0I,    &i5LMC!123HEDUE#e*,=!>?@H #E
HW *"  *)" s%   ADDD#ADD
Dc           
        / U l         [        R                  " U R                  5       nSU R                   S3nUR                  U5      nU R                  S   nU GH  nU R                  S:X  a  U R                  S:  d   S5       eS/UR                  5       R                  5       QSPn[        U5      U R                  :  a  U Vs/ s H  o`R                  R                  Xd5      PM     nn[        U R                  [        U5      S	-   5       H6  nU R                   R                  [        XWU R                  -
  U 5      5        M8     M  M  U R                  S
:X  a  UR                  5       R                  5       nU Vs/ s H  o`R                  R                  Xd5      PM     nnU R                  S   /UQn/ UQU R                  S   Pn	U R                  S:  a  [        U5      U R                  :  a  GM  U R                   R                  X45        GM  [!        S5      e   S S S 5        g s  snf s  snf ! , (       d  f       g = f)Nz./simple-examples/data/ptb.z.txtr:   r	   zInvalid gram lengthr-   r.   r,   r
   r   zUnknown data type)datarA   rB   r   r   rC   r   r   r   r1   r2   rJ   getrI   appendtupleAssertionError)
r%   rN   filenamer3   UNKr5   r6   isrc_seqtrg_seqs
             r'   r$   Imikolov._load_anno   s   	\\$..)R4TYYKtDHx(A--(C>>W,++b0G2GG0:!2:E:A1v!1!11@AB1]]..q6B!&t'7'7Q!!DA II,,U19I9I5IA3N-OP "E 2 ^^u,	)A<=>Aq**12AA>#}}U38a8G884==#78G''!+Gt?O?O0O II$$g%78()<==#  *) C
 ? *)s,   BH4$H**BH4>$H/"A?H4*
H44
Ic                    [        U R                  U    Vs/ s H  n[        R                  " U5      PM     sn5      $ s  snf N)rZ   rW   nparray)r%   idxds      r'   __getitem__Imikolov.__getitem__   s1     499S>:>abhhqk>:;;:s    >c                ,    [        U R                  5      $ rc   )rJ   rW   )r%   s    r'   __len__Imikolov.__len__   s    499~r*   )rW   r   r   r   r   r   r   )Nr	   rV   r   2   T)r   r   r   r   r   r   r   r   r   r   r&   boolreturnNonerc   )rK   r   ro   r   )ro   rp   )rf   r   ro   z1tuple[npt.NDArray[np.int_], npt.NDArray[np.int_]])ro   r   )__name__
__module____qualname____firstlineno____doc____annotations__r(   r7   r#   r$   rh   rk   __static_attributes__r<   r*   r'   r   r   $   s    5n   
 !%'.%,%% %% 	%
 #% % % 
%N
,>4<<	:<
r*   r   )
__future__r   r/   rA   typingr   r   numpyrd   paddle.dataset.commonr   	paddle.ior   numpy.typingnptr   r   __all__r!   r"   r   r<   r*   r'   <module>r      sU    #   )  < /"?3
D(hw hr*   