
    x-j                        d Z ddlZddlZddlZddlmZ g ZdZdZ	 G d d          Z
ddZdd
Zd Z edddd          e
j        fd            Z edddd          e
j        fd            Z edddd          d             ZdS )z
imikolov's simple dataset.

This module will download dataset from
http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
into paddle reader creators.
    N)
deprecatedz<https://dataset.bj.bcebos.com/imikolov%2Fsimple-examples.tgz 30177ea32e27c525793142b6bf2c8e2dc                       e Zd ZdZdZdS )DataType      N)__name__
__module____qualname__NGRAMSEQ     W/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/dataset/imikolov.pyr   r   #   s        E
CCCr   r   c                     |t          j        t                    }| D ][}|                                                                D ]}||xx         dz  cc<   |dxx         dz  cc<   |dxx         dz  cc<   \|S )Nr   <s><e>)collectionsdefaultdictintstripsplit)f	word_freqlws       r   
word_countr   (   s    +C00	  "" 	 	AaLLLALLLL%A%Ar   2   c                     d}d}t          j        t          j        j                            t          j        j        j        dt          j        j        j                            5 }|	                    |          }|	                    |          }t          |t          |                    }d|v r|d=  fd|                                D             }t          |d           }t          t          |           \  }}	t          t          t          |t!          t#          |                                                  }
t#          |          |
d<   ddd           n# 1 swxY w Y   |
S )	z
    Build a word dictionary from the corpus,  Keys of the dictionary are words,
    and values are zero-based IDs of these words.
    $./simple-examples/data/ptb.train.txt$./simple-examples/data/ptb.valid.txtimikolov<unk>c                 ,    g | ]}|d          k    |S )r   r   ).0xmin_word_freqs     r   
<listcomp>zbuild_dict.<locals>.<listcomp>H   s'    JJJ1QqTM5I5IQ5I5I5Ir   c                 $    | d          | d         fS )Nr   r   r   )r&   s    r   <lambda>zbuild_dict.<locals>.<lambda>J   s    QqTE1Q4= r   )keyN)tarfileopenpaddledatasetcommondownloadr"   URLMD5extractfiler   itemssortedlistzipdictrangelen)r'   train_filenametest_filenametftrainftestfr   word_freq_sortedwords_word_idxs   `          r   
build_dictrE   5   s   
 <N:M	&&N#'V^5L5P	
 	

 
 ' 
//}--uj&8&899	i'"JJJJ	 1 1JJJ	!)1H1HIII-.//qSc%jj(9(9::;;<<JJ#' ' ' ' ' ' ' ' ' ' ' ' ' ' '& Os   &C0E""E&)E&c                       fd}|S )Nc               3     K   t          j        t          j        j                            t          j        j        j        dt          j        j        j                            5 } | 	                              }
d         |D ]<}t          j        k    r	dk    s
J d            dg|                                                                d}t          |          	k    rN
fd|D             }t          	t          |          dz             D ]}t!          ||	z
  |                   V  t          j        k    ro|                                                                }
fd	|D             }
d         g|}g |
d         }	d
k    rt          |          	k    r'||fV  /t%          d          	 d d d            d S # 1 swxY w Y   d S )Nr"   r#   zInvalid gram lengthr   r   c                 <    g | ]}                     |          S r   getr%   r   UNKrD   s     r   r(   z2reader_creator.<locals>.reader.<locals>.<listcomp>c   s'    ===aX\\!S11===r   r   c                 <    g | ]}                     |          S r   rJ   rL   s     r   r(   z2reader_creator.<locals>.reader.<locals>.<listcomp>h   s'    999!a--999r   r   zUnknown data type)r,   r-   r.   r/   r0   r1   r"   r2   r3   r4   r   r   r   r   r;   r:   tupler   AssertionError)r>   r   r   isrc_seqtrg_seqrM   	data_typefilenamenrD   s         @r   readerzreader_creator.<locals>.readerS   s8     \N!**'+'+ 
 
 	> x((A7#C > >>Y..r666#8666:!2!2:E:A1vv{{=====1===!&q#a&&1*!5!5 6 6A"'!a%!)"5"55555\Y..		))A99999q999A'33G338E?3G1uuW!1!1 !7*****()<===#>	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>s   %EGGGr   )rU   rD   rV   rT   rW   s   ```` r   reader_creatorrX   R   s5    > > > > > > > >< Mr   z2.0.0zpaddle.text.datasets.Imikolovr   z>Please use new dataset API which supports paddle.io.DataLoader)since	update_tolevelreasonc                 &    t          d| ||          S )a  
    imikolov training set creator.

    It returns a reader creator, each sample in the reader is a word ID
    tuple.

    :param word_idx: word dictionary
    :type word_idx: dict
    :param n: sliding window size if type is ngram, otherwise max length of sequence
    :type n: int
    :param data_type: data type (ngram or sequence)
    :type data_type: member variable of DataType (NGRAM or SEQ)
    :return: Training reader creator
    :rtype: callable
    r    rX   rD   rV   rT   s      r   trainr`   t       , .!Y  r   c                 &    t          d| ||          S )a  
    imikolov test set creator.

    It returns a reader creator, each sample in the reader is a word ID
    tuple.

    :param word_idx: word dictionary
    :type word_idx: dict
    :param n: sliding window size if type is ngram, otherwise max length of sequence
    :type n: int
    :param data_type: data type (ngram or sequence)
    :type data_type: member variable of DataType (NGRAM or SEQ)
    :return: Test reader creator
    :rtype: callable
    r!   r^   r_   s      r   testrc      ra   r   c                  f    t           j        j                            t          dt
                     d S )Nr"   )r.   r/   r0   r1   r2   r3   r   r   r   fetchre      s'     N""3
C88888r   )N)r   )__doc__r   r,   paddle.dataset.commonr.   paddle.utilsr   __all__r2   r3   r   r   rE   rX   r   r`   rc   re   r   r   r   <module>rj      s             # # # # # #
 E(       

 
 
 
   :  D 
-
K	   "*    * 
-
K	   !)    * 
-
K	  9 9 9 9 9r   