
    Αi                         S r SSKrSSKrSSKrSSKJr  / rSrSr	 " S S5      r
SS jrSS	 jrS
 r\" SSSSS9\
R                  4S j5       r\" SSSSS9\
R                  4S j5       r\" SSSSS9S 5       rg)z
imikolov's simple dataset.

This module will download dataset from
http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
into paddle reader creators.
    N)
deprecatedz<https://dataset.bj.bcebos.com/imikolov%2Fsimple-examples.tgz 30177ea32e27c525793142b6bf2c8e2dc                       \ rS rSrSrSrSrg)DataType#          N)__name__
__module____qualname____firstlineno__NGRAMSEQ__static_attributes__r
       W/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/dataset/imikolov.pyr   r   #   s    E
Cr   r   c                     Uc  [         R                  " [        5      nU  HN  nUR                  5       R	                  5        H  nX==   S-  ss'   M     US==   S-  ss'   US==   S-  ss'   MP     U$ )Nr   <s><e>)collectionsdefaultdictintstripsplit)f	word_freqlws       r   
word_countr    (   sm    ++C0	"ALAL #%A%A	  r   c                    SnSn[         R                  " [        R                  R                  R                  [        R                  R                  R                  S[        R                  R                  R                  5      5       nUR                  U5      nUR                  U5      n[        U[        U5      5      nSU;   a  US	 UR                  5        Vs/ s H  owS   U :  d  M  UPM     nn[        US S9n[        [        U6 5      u  p[        [        [        U	[!        [#        U	5      5      5      5      5      n[#        U	5      US'   SSS5        U$ s  snf ! , (       d  f       W$ = f)	z{
Build a word dictionary from the corpus,  Keys of the dictionary are words,
and values are zero-based IDs of these words.
$./simple-examples/data/ptb.train.txt$./simple-examples/data/ptb.valid.txtimikolov<unk>r   c                     U S   * U S   4$ )Nr   r   r
   )xs    r   <lambda>build_dict.<locals>.<lambda>J   s    QqTE1Q4=r   )keyN)tarfileopenpaddledatasetcommondownloadr$   URLMD5extractfiler    itemssortedlistzipdictrangelen)min_word_freqtrain_filenametest_filenametftrainftestfr   r'   word_freq_sortedwords_word_idxs               r   
build_dictrE   5   s2   
 <N:M	&&NN##''V^^5L5L5P5P	

 
/}-uj&89	i'" ) 1J 11qTM5IQ 1	J!)1HI-./Sc%j(9:;<J#
& O K
 
& Os&   AEE+E1AEE
E-c                     ^ ^^^ UU UU4S jnU$ )Nc            	   3     >#    [         R                  " [        R                  R                  R                  [        R                  R                  R                  S[        R                  R                  R                  5      5       n U R                  T	5      nTS   nU GH9  n[        R                  T:X  a  T
S:  d   S5       eS/UR                  5       R                  5       QSPn[        U5      T
:  aT  U Vs/ s H  nTR                  XB5      PM     nn[!        T
[        U5      S-   5       H  n[#        X5T
-
  U 5      v   M     M  M  [        R$                  T:X  ap  UR                  5       R                  5       nU Vs/ s H  nTR                  XB5      PM     nnTS   /UQn/ UQTS   PnT
S:  a  [        U5      T
:  a  GM)  Xg4v   GM1  ['        S	5      e   S S S 5        g s  snf s  snf ! , (       d  f       g = f7f)
Nr$   r%   zInvalid gram lengthr   r   r   r   zUnknown data type)r+   r,   r-   r.   r/   r0   r$   r1   r2   r3   r   r   r   r   r:   getr9   tupler   AssertionError)r>   r   UNKr   r   isrc_seqtrg_seq	data_typefilenamenrD   s           r   readerreader_creator.<locals>.readerS   s    \\NN!!**''++''++
 x(A7#C>>Y.r68#886:!2:E:A1v{;<=1aX\\!11=!&q#a&1*!5A"'a%!"55 "6 # \\Y.	)A789q!a-qA9'33G338E?3G1uW!1 !**()<==# 
 
 >
 :)
 
s?   BHA4G1;G'A-G1G,A G1	H'
G11
G?;Hr
   )rQ   rD   rR   rP   rS   s   ```` r   reader_creatorrU   R   s    > >< Mr   z2.0.0zpaddle.text.datasets.Imikolovr   z>Please use new dataset API which supports paddle.io.DataLoader)since	update_tolevelreasonc                     [        SXU5      $ )a  
imikolov training set creator.

It returns a reader creator, each sample in the reader is a word ID
tuple.

:param word_idx: word dictionary
:type word_idx: dict
:param n: sliding window size if type is ngram, otherwise max length of sequence
:type n: int
:param data_type: data type (ngram or sequence)
:type data_type: member variable of DataType (NGRAM or SEQ)
:return: Training reader creator
:rtype: callable
r"   rU   rD   rR   rP   s      r   trainr]   t       , .Y r   c                     [        SXU5      $ )a  
imikolov test set creator.

It returns a reader creator, each sample in the reader is a word ID
tuple.

:param word_idx: word dictionary
:type word_idx: dict
:param n: sliding window size if type is ngram, otherwise max length of sequence
:type n: int
:param data_type: data type (ngram or sequence)
:type data_type: member variable of DataType (NGRAM or SEQ)
:return: Test reader creator
:rtype: callable
r#   r[   r\   s      r   testr`      r^   r   c                  j    [         R                  R                  R                  [        S[
        5        g )Nr$   )r-   r.   r/   r0   r1   r2   r
   r   r   fetchrb      s      NN""3
C8r   )N)2   )__doc__r   r+   paddle.dataset.commonr-   paddle.utilsr   __all__r1   r2   r   r    rE   rU   r   r]   r`   rb   r
   r   r   <module>rh      s       #
 E( 

:D 
-
K	 "* * 
-
K	 !) * 
-
K	99r   