
    x-j.                     F   d Z ddlZddlZddlZddlZddlZddlmZ g Z	dZ
dZd Zd Z edd	d
d          d             Z edd	d
d          d             Z edd	d
d          d             Z edd	d
d          d             Z edd	d
d          d             ZdS )a  
IMDB dataset.

This module downloads IMDB dataset from
http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
of 25,000 highly polar movie reviews for training, and 25,000 for testing.
Besides, this module also provides API for building dictionary.
    N)
deprecatedz6https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gz 7c2ac02c03563afcf9b574c7e56c153ac              #     K   t          j        t          j        j                            t          dt                              5 }|                                }|t          | 
                    |j                            r|                    |                                                              d                              dt           j                            d                                                                                    V  |                                }|ddd           dS # 1 swxY w Y   dS )zQ
    Read files that match the given pattern.  Tokenize and yield each file.
    imdbNs   
zlatin-1)tarfileopenpaddledatasetcommondownloadURLMD5nextboolmatchnameextractfilereadrstrip	translatestringpunctuationencodelowersplit)patterntarftfs      S/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/dataset/imdb.pytokenizer    &   s@     
 
fn+44S&#FF	G	G 4
 YY[[nGMM"'**++ 	 $$R((TVVVG__YtV%7%>%>y%I%IJJUWWUWW   B n                 s   C%D77D;>D;c                    t          j        t                    }t          |           D ]}|D ]}||xx         dz  cc<   fd|                                D             }t          |d           }t          t          |           \  }}t          t          t          |t          t          |                                                  }t          |          |d<   |S )z
    Build a word dictionary from the corpus. Keys of the dictionary are words,
    and values are zero-based IDs of these words.
       c                 ,    g | ]}|d          k    |S )r"    ).0xcutoffs     r   
<listcomp>zbuild_dict.<locals>.<listcomp>J   s"    ???q1    c                 $    | d          | d         fS )Nr"   r   r$   )r&   s    r   <lambda>zbuild_dict.<locals>.<lambda>L   s    1Q4%1 r)   )key<unk>)collectionsdefaultdictintr    itemssortedlistzipdictrangelen)	r   r'   	word_freqdocword
dictionarywords_word_idxs	    `       r   
build_dictr?   ?   s    
 ',,I   ! ! 	! 	!DdOOOq OOOO	! @???IOO--???I	'>'>???JC$%%HE1DUE#e**$5$5667788HE

HWOr)   z2.0.0zpaddle.text.datasets.Imdbr"   z>Please use new dataset API which supports paddle.io.DataLoader)since	update_tolevelreasonc                 j    d         g fd} || d            ||d           fd}|S )Nr-   c                 t    t          |           D ]&}|                    fd|D             |f           'd S )Nc                 <    g | ]}                     |          S r$   )get)r%   wUNKr>   s     r   r(   z0reader_creator.<locals>.load.<locals>.<listcomp>_   s'    ;;;!a--;;;r)   )r    append)r   outlabelr9   rI   r>   s       r   loadzreader_creator.<locals>.load]   sZ    G$$ 	E 	ECJJ;;;;;s;;;UCDDDD	E 	Er)   r   r"   c               3      K    E d {V  d S )Nr$   )INSs   r   readerzreader_creator.<locals>.readerd   s      r)   r$   )pos_patternneg_patternr>   rM   rP   rO   rI   s     `  @@r   reader_creatorrS   S   s     7
C
CE E E E E E 	Dc1Dc1     Mr)   c                 l    t          t          j        d          t          j        d          |           S )a  
    IMDB training set creator.

    It returns a reader creator, each sample in the reader is an zero-based ID
    sequence and label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Training reader creator
    :rtype: callable
    zaclImdb/train/pos/.*\.txt$zaclImdb/train/neg/.*\.txt$rS   recompiler>   s    r   trainrY   j   s5    $ 

011

011  r)   c                 l    t          t          j        d          t          j        d          |           S )a  
    IMDB test set creator.

    It returns a reader creator, each sample in the reader is an zero-based ID
    sequence and label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Test reader creator
    :rtype: callable
    zaclImdb/test/pos/.*\.txt$zaclImdb/test/neg/.*\.txt$rU   rX   s    r   testr[      s5    $ 

/00

/00  r)   c                  F    t          t          j        d          d          S )za
    Build a word dictionary from the corpus.

    :return: Word dictionary
    :rtype: dict
    z/aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$   )r?   rV   rW   r$   r)   r   	word_dictr^      s&     

EFF  r)   c                  f    t           j        j                            t          dt
                     d S )Nr   )r	   r
   r   r   r   r   r$   r)   r   fetchr`      s'     N""344444r)   )__doc__r.   rV   r   r   paddle.dataset.commonr	   paddle.utilsr   __all__r   r   r    r?   rS   rY   r[   r^   r`   r$   r)   r   <module>re      s        				       # # # # # #
 ?(  2  ( 
)
K	    " 
)
K	    & 
)
K	    & 
)
K	  	 	 	 
)
K	  5 5 5 5 5r)   