
    ёi                        S SK Jr  S SKrS SKrS SKrS SKrS SKJrJr  S SK	r
S SKJr  S SKJr  \(       a  S SKJr  S SKJr  \S   r/ rSrS	r " S
 S\5      rg)    )annotationsN)TYPE_CHECKINGLiteral)_check_exists_and_download)Dataset)Patterntraintestz6https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gz 7c2ac02c03563afcf9b574c7e56c153ac                      \ rS rSr% SrS\S'   S\S'   S\S'   S	\S
'   S	\S'       S         SS jjrSS jrSS jrSS jr	    SS jr
SS jrSrg)Imdb'   at  
Implementation of `IMDB <https://www.imdb.com/interfaces/>`_ dataset.

Args:
    data_file(str|None): path to data tar file, can be set None if
        :attr:`download` is True. Default None.
    mode(str): 'train' 'test' mode. Default 'train'.
    cutoff(int): cutoff number for building word dictionary. Default 150.
    download(bool): whether to download dataset automatically if
        :attr:`data_file` is not set. Default True.

Returns:
    Dataset: instance of IMDB dataset

Examples:

    .. code-block:: pycon

        >>> # doctest: +TIMEOUT(75)
        >>> import paddle
        >>> from paddle.text.datasets import Imdb

        >>> class SimpleNet(paddle.nn.Layer):
        ...     def __init__(self):
        ...         super().__init__()
        ...
        ...     def forward(self, doc, label):
        ...         return paddle.sum(doc), label


        >>> imdb = Imdb(mode='train')

        >>> for i in range(10):
        ...     doc, label = imdb[i]
        ...     doc = paddle.to_tensor(doc)
        ...     label = paddle.to_tensor(label)
        ...
        ...     model = SimpleNet()
        ...     image, label = model(doc, label)
        ...     print(doc.shape, label.shape)
        paddle.Size([121]) paddle.Size([1])
        paddle.Size([115]) paddle.Size([1])
        paddle.Size([386]) paddle.Size([1])
        paddle.Size([471]) paddle.Size([1])
        paddle.Size([585]) paddle.Size([1])
        paddle.Size([206]) paddle.Size([1])
        paddle.Size([221]) paddle.Size([1])
        paddle.Size([324]) paddle.Size([1])
        paddle.Size([166]) paddle.Size([1])
        paddle.Size([598]) paddle.Size([1])

str | None	data_file_ImdbDataSetModemodedict[str, int]word_idxlistdocslabelsNc                0   UR                  5       S;   d
   SU 35       eUR                  5       U l        Xl        U R                  c*  U(       d   S5       e[        U[        [
        SU5      U l        U R                  U5      U l        U R                  5         g )Nr	   z(mode should be 'train', 'test', but got z>data_file is not set and downloading automatically is disabledimdb)	lowerr   r   r   URLMD5_build_work_dictr   
_load_anno)selfr   r   cutoffdownloads        Y/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/text/datasets/imdb.py__init__Imdb.__init__b   s     zz|  
 
 	= 6dV<	= 
 JJL	">>! P8 83VXDN
 --f5 	    c                   [         R                  " [        5      n[        R                  " S5      nU R                  U5       H  nU H  nX%==   S-  ss'   M     M     UR                  5        Vs/ s H  ofS   U:  d  M  UPM     nn[        US S9n[        [        U6 5      u  p[        [        [        U[        [        U5      5      5      5      5      n
[        U5      U
S'   U
$ s  snf )Nz/aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$   c                    U S   * U S   4$ )Nr(   r    )xs    r#   <lambda>'Imdb._build_work_dict.<locals>.<lambda>   s    qteQqT]r&   )key<unk>)collectionsdefaultdictintrecompile	_tokenizeitemssortedr   zipdictrangelen)r    r!   	word_freqpatterndocwordr+   
dictionarywords_r   s              r#   r   Imdb._build_work_dict~   s    ++C0	**OP>>'*C1$  +
 !* 1C 11qTF]Q 1	CI+BC
Z()Sc%j(9:;<J Ds   0C& C&c           	     .   / n[         R                  " U R                  5       nUR                  5       nUb  [	        UR                  UR                  5      5      (       a  UR                  UR                  U5      R                  5       R                  S5      R                  S [        R                  R                  S5      5      R                  5       R!                  5       5        UR                  5       nUb  M  S S S 5        U$ ! , (       d  f       U$ = f)Ns   
zlatin-1)tarfileopenr   nextboolmatchnameappendextractfilereadrstrip	translatestringpunctuationencoder   split)r    r=   datatarftfs        r#   r5   Imdb._tokenize   s    \\$..)TB.bgg.//KK((,"4););)B)B9)MN YY[ . *  *) s   CD
Dc           
        [         R                  " SU R                   S35      n[         R                  " SU R                   S35      nU R                  S   n/ U l        / U l        U R                  U5       Hb  nU R                  R                  U Vs/ s H  oPR                  R                  XS5      PM     sn5        U R
                  R                  S5        Md     U R                  U5       Hb  nU R                  R                  U Vs/ s H  oPR                  R                  XS5      PM     sn5        U R
                  R                  S5        Md     g s  snf s  snf )NzaclImdb/z/pos/.*\.txt$z/neg/.*\.txt$r/   r   r(   )	r3   r4   r   r   r   r   r5   rK   get)r    pos_patternneg_patternUNKr>   ws         r#   r   Imdb._load_anno   s   jjHTYYK}!EFjjHTYYK}!EFmmG$	>>+.CIIEAmm//7EFKKq! / >>+.CIIEAmm//7EFKKq! / F Fs   $E

$E
c                    [         R                  " U R                  U   5      [         R                  " U R                  U   /5      4$ N)nparrayr   r   )r    idxs     r#   __getitem__Imdb.__getitem__   s5     3("((DKK4D3E*FGGr&   c                ,    [        U R                  5      $ r`   )r;   r   )r    s    r#   __len__Imdb.__len__   s    499~r&   )r   r   r   r   r   )Nr
      T)
r   r   r   r   r!   r2   r"   rH   returnNone)r!   r2   rj   r   )r=   zPattern[str]rj   zlist[list[str]])rj   rk   )rc   r2   rj   z1tuple[npt.NDArray[np.int_], npt.NDArray[np.int_]])rj   r2   )__name__
__module____qualname____firstlineno____doc____annotations__r$   r   r5   r   rd   rg   __static_attributes__r*   r&   r#   r   r   '   s    2h 

JL !%!(  	
  
8 &"HH	:H
r&   r   )
__future__r   r0   r3   rP   rE   typingr   r   numpyra   paddle.dataset.commonr   	paddle.ior   r   numpy.typingnptr   __all__r   r   r   r*   r&   r#   <module>r{      sS    #  	   )  < /
>(O7 Or&   