
    Αi.                         S r SSKrSSKrSSKrSSKrSSKrSSKJr  / r	Sr
SrS rS r\" SS	S
SS9S 5       r\" SS	S
SS9S 5       r\" SS	S
SS9S 5       r\" SS	S
SS9S 5       r\" SS	S
SS9S 5       rg)a  
IMDB dataset.

This module downloads IMDB dataset from
http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
of 25,000 highly polar movie reviews for training, and 25,000 for testing.
Besides, this module also provides API for building dictionary.
    N)
deprecatedz6https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gz 7c2ac02c03563afcf9b574c7e56c153ac              #   b  #    [         R                  " [        R                  R                  R                  [        S[        5      5       nUR                  5       nUb  [        U R                  UR                  5      5      (       ay  UR                  U5      R                  5       R                  S5      R                  S[         R"                  R%                  S5      5      R'                  5       R)                  5       v   UR                  5       nUb  M  SSS5        g! , (       d  f       g= f7f)zI
Read files that match the given pattern.  Tokenize and yield each file.
imdbNs   
zlatin-1)tarfileopenpaddledatasetcommondownloadURLMD5nextboolmatchnameextractfilereadrstrip	translatestringpunctuationencodelowersplit)patterntarftfs      S/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/dataset/imdb.pytokenizer    &   s     
 
fnn++44S&#F	G4
 YY[nGMM"''*++ $$R(TVVG_YtV%7%7%>%>y%IJUWUW B n 
H	G	Gs   AD/
C	D	D/
D,(D/c                    [         R                  " [        5      n[        U 5       H  nU H  nX$==   S-  ss'   M     M     UR	                  5        Vs/ s H  oUS   U:  d  M  UPM     nn[        US S9n[        [        U6 5      u  px[        [        [        U[        [        U5      5      5      5      5      n	[        U5      U	S'   U	$ s  snf )zz
Build a word dictionary from the corpus. Keys of the dictionary are words,
and values are zero-based IDs of these words.
   c                     U S   * U S   4$ )Nr"   r    )xs    r   <lambda>build_dict.<locals>.<lambda>L   s    1Q4%1    )key<unk>)collectionsdefaultdictintr    itemssortedlistzipdictrangelen)
r   cutoff	word_freqdocwordr%   
dictionarywords_word_idxs
             r   
build_dictr=   ?   s    
 '',I DOq O  !
 &OO-?-q1-I?	'>?JC$%HEDUE#e*$5678HE
HWO @s   C
$C
z2.0.0zpaddle.text.datasets.Imdbr"   z>Please use new dataset API which supports paddle.io.DataLoader)since	update_tolevelreasonc                 \   ^^^ TS   m/ mUU4S jnU" U TS5        U" UTS5        U4S jnU$ )Nr*   c           
         > [        U 5       H7  nUR                  U Vs/ s H  nTR                  UT5      PM     snU45        M9     g s  snf N)r    appendget)r   outlabelr7   wUNKr<   s        r   loadreader_creator.<locals>.load]   s@    G$CJJs;s!a-s;UCD %;s   A	
r   r"   c               3   &   >#    T  S h  vN   g  N7frD   r$   )INSs   r   readerreader_creator.<locals>.readerd   s     s   r$   )pos_patternneg_patternr<   rK   rO   rN   rJ   s     `  @@r   reader_creatorrS   S   s@     7
C
CE 	c1c1 Mr(   c                 l    [        [        R                  " S5      [        R                  " S5      U 5      $ )z
IMDB training set creator.

It returns a reader creator, each sample in the reader is an zero-based ID
sequence and label in [0, 1].

:param word_idx: word dictionary
:type word_idx: dict
:return: Training reader creator
:rtype: callable
zaclImdb/train/pos/.*\.txt$zaclImdb/train/neg/.*\.txt$rS   recompiler<   s    r   trainrY   j   s.    $ 


01


01 r(   c                 l    [        [        R                  " S5      [        R                  " S5      U 5      $ )z
IMDB test set creator.

It returns a reader creator, each sample in the reader is an zero-based ID
sequence and label in [0, 1].

:param word_idx: word dictionary
:type word_idx: dict
:return: Test reader creator
:rtype: callable
zaclImdb/test/pos/.*\.txt$zaclImdb/test/neg/.*\.txt$rU   rX   s    r   testr[      s.    $ 


/0


/0 r(   c                  B    [        [        R                  " S5      S5      $ )zQ
Build a word dictionary from the corpus.

:return: Word dictionary
:rtype: dict
z/aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$   )r=   rV   rW   r$   r(   r   	word_dictr^      s      


EF r(   c                  j    [         R                  R                  R                  [        S[
        5        g )Nr   )r	   r
   r   r   r   r   r$   r(   r   fetchr`      s      NN""34r(   )__doc__r+   rV   r   r   paddle.dataset.commonr	   paddle.utilsr   __all__r   r   r    r=   rS   rY   r[   r^   r`   r$   r(   r   <module>re      s     	    #
 ?(2( 
)
K	" 
)
K	& 
)
K	& 
)
K			 
)
K	55r(   