
    x-j                     P   d Z ddlZddlZddlmZ g ZdZdZdZ	dZ
dZd	Zd
ZdZdZdZd Zd Z edddd          d             Z edddd          d             Z edddd          d             Z edddd          dd            Z edddd          d             ZdS )a  
WMT14 dataset.
The original WMT14 dataset is too large and a small set of data for set is
provided. This module will download dataset from
http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
parse training set and test set into paddle reader creators.

    N)
deprecatedzJhttp://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz 7d7897317ddd8ba0ae5c5fa7248d3ff5z/http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz 0791583d57d5beb693b9414c5b36798cz1http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz 0cb4a5366189b6acba876491c8724fa3z<s>z<e>z<unk>   c                    d }t          j        | d          5 }d |D             }t          |          dk    sJ  ||                    |d                   |          }d |D             }t          |          dk    sJ  ||                    |d                   |          }||fcd d d            S # 1 swxY w Y   d S )Nc                     i }t          |           D ]5\  }}||k     r*|||                                                                <   5 |S )N)	enumeratestripdecode)fdsizeout_dict
line_countlines        T/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/dataset/wmt14.py	__to_dictz!__read_to_dict.<locals>.__to_dict1   sU     )" 	 	JD  2<,,..//    rmodec                 P    g | ]#}|j                             d           |j         $S )zsrc.dictnameendswith.0	each_items     r   
<listcomp>z"__read_to_dict.<locals>.<listcomp>;   @     
 
 
~&&z22
N
 
 
r      r   c                 P    g | ]#}|j                             d           |j         $S )ztrg.dictr   r   s     r   r   z"__read_to_dict.<locals>.<listcomp>B   r    r   )tarfileopenlenextractfile)tar_file	dict_sizer   fnamessrc_dicttrg_dicts          r   __read_to_dictr-   0   s9      
hS	)	)	) "Q
 

 
 

 5zzQ9Q]]5844i@@
 

 
 

 5zzQ9Q]]5844i@@!" " " " " " " " " " " " " " " " " "s   BB77B;>B;c                       fd}|S )Nc               3     K   t                    \  t          j        d          5 } fd| D             }|D ]}|                     |          D ]}|                                }|                                                    d          }t          |          dk    rR|d         }|                                }fdt          g|t          D             }|d         }|                                }	fd	|	D             }
t          |          d
k    st          |
          d
k    rg |
t                   }t                   g|
}
||
|fV   	 d d d            d S # 1 swxY w Y   d S )Nr   r   c                 R    g | ]#}|j                                       |j         $S  r   )r   r   	file_names     r   r   z2reader_creator.<locals>.reader.<locals>.<listcomp>P   sA       >**955  r   	r   r   c                 F    g | ]}                     |t                    S r1   getUNK_IDX)r   wr+   s     r   r   z2reader_creator.<locals>.reader.<locals>.<listcomp>]   s7        !Q00  r   r!   c                 F    g | ]}                     |t                    S r1   r5   )r   r8   r,   s     r   r   z2reader_creator.<locals>.reader.<locals>.<listcomp>d   s'    KKKAx||Aw77KKKr   P   )
r-   r#   r$   r&   r   r   splitr%   STARTEND)r)   r*   r   r   
line_splitsrc_seq	src_wordssrc_idstrg_seq	trg_wordstrg_idstrg_ids_nextr+   r,   r(   r2   r'   s               @@r   readerzreader_creator.<locals>.readerM   s	     +Hi@@(\(--- 	9   !"  E
  9 9MM$// 9 9D;;==D!%!3!3D!9!9J:!++ (mG 'I   "'!9)!9S!9  G
 )mG 'IKKKKKKKG 7||b((CLL2,=,= #<W#<hsm#<L'99G!7L88888-99	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9s   D1E..E25E2r1   )r'   r2   r(   rF   s   ``` r   reader_creatorrG   L   s0    9 9 9 9 9 9 9B Mr   z2.0.0zpaddle.text.datasets.WMT14r!   z>Please use new dataset API which supports paddle.io.DataLoader)since	update_tolevelreasonc                     t          t          j        j                            t
          dt                    d|           S )a  
    WMT14 training set creator.

    It returns a reader creator, each sample in the reader is source language
    word ID sequence, target language word ID sequence and next word ID
    sequence.

    :return: Training reader creator
    :rtype: callable
    wmt14ztrain/trainrG   paddledatasetcommondownload	URL_TRAIN	MD5_TRAINr(   s    r   trainrV   q   s6    " &&y'9EE  r   c                     t          t          j        j                            t
          dt                    d|           S )z
    WMT14 test set creator.

    It returns a reader creator, each sample in the reader is source language
    word ID sequence, target language word ID sequence and next word ID
    sequence.

    :return: Test reader creator
    :rtype: callable
    rM   z	test/testrN   rU   s    r   testrX      s6    " &&y'9EE  r   c                     t          t          j        j                            t
          dt                    d|           S )NrM   zgen/genrN   rU   s    r   genrZ      s6     &&y'9EE  r   Tc                    t           j        j                            t          dt
                    }t          ||           \  }}|r<d |                                D             }d |                                D             }||fS )NrM   c                     i | ]\  }}||	S r1   r1   r   kvs      r   
<dictcomp>zget_dict.<locals>.<dictcomp>       666TQAq666r   c                     i | ]\  }}||	S r1   r1   r]   s      r   r`   zget_dict.<locals>.<dictcomp>   ra   r   )rO   rP   rQ   rR   rS   rT   r-   items)r(   reverser'   r+   r,   s        r   get_dictre      s     ~$--i)LLH')<<Hh 766X^^%5%566666X^^%5%5666Xr   c                      t           j        j                            t          dt
                     t           j        j                            t          dt                     d S )NrM   )rO   rP   rQ   rR   rS   rT   	URL_MODEL	MD5_MODELr1   r   r   fetchri      sD     N""9gyAAA
N""9gyAAAAAr   )T)__doc__r#   paddle.dataset.commonrO   paddle.utilsr   __all__URL_DEV_TESTMD5_DEV_TESTrS   rT   rg   rh   r<   r=   UNKr7   r-   rG   rV   rX   rZ   re   ri   r1   r   r   <module>rq      s         # # # # # #
 Q  2 >	.	?	.	
" " "8" " "J 
*
K	    $ 
*
K	    $ 
*
K	     
*
K	      
*
K	  B B B B Br   