
    Αi                         S r SSKrSSKrSSKJr  / rSrSrSr	Sr
SrS	rS
rSrSrSrS rS r\" SSSSS9S 5       r\" SSSSS9S 5       r\" SSSSS9S 5       r\" SSSSS9SS j5       r\" SSSSS9S 5       rg)a  
WMT14 dataset.
The original WMT14 dataset is too large and a small set of data for set is
provided. This module will download dataset from
http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
parse training set and test set into paddle reader creators.

    N)
deprecatedzJhttp://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz 7d7897317ddd8ba0ae5c5fa7248d3ff5z/http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz 0791583d57d5beb693b9414c5b36798cz1http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz 0cb4a5366189b6acba876491c8724fa3z<s>z<e>z<unk>   c                 *   S n[         R                  " U SS9 nU Vs/ s H1  nUR                  R                  S5      (       d  M%  UR                  PM3     nn[	        U5      S:X  d   eU" UR                  US   5      U5      nU Vs/ s H1  nUR                  R                  S5      (       d  M%  UR                  PM3     nn[	        U5      S:X  d   eU" UR                  US   5      U5      nXg4sS S S 5        $ s  snf s  snf ! , (       d  f       g = f)Nc                     0 n[        U 5       H-  u  p4X1:  a"  X2UR                  5       R                  5       '   M,    U$    U$ )N)	enumeratestripdecode)fdsizeout_dict
line_countlines        T/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/dataset/wmt14.py	__to_dict!__read_to_dict.<locals>.__to_dict1   sD     )"J 2<,,./ !.
     rmodezsrc.dict   r   ztrg.dict)tarfileopennameendswithlenextractfile)tar_file	dict_sizer   f	each_itemnamessrc_dicttrg_dicts           r   __read_to_dictr'   0   s    
hS	)Q 
	~~&&z2 INN 	 

 5zQQ]]584i@ 
	~~&&z2 INN 	 

 5zQQ]]584i@! 
*	)

 
*	)s4   D$C:C:2D$C?0C? 0D:
D
Dc                    ^ ^^ UUU 4S jnU$ )Nc            
   3   n  >#    [        TT5      u  p[        R                  " TSS9 nU Vs/ s H1  nUR                  R	                  T5      (       d  M%  UR                  PM3     nnU GH  nUR                  U5       GH  nUR                  5       nUR                  5       R                  S5      n[        U5      S:w  a  MD  US   nUR                  5       n	[        /U	Q[        P V
s/ s H  n
U R                  U
[        5      PM     nn
US   nUR                  5       nU V
s/ s H  oR                  U
[        5      PM     nn
[        U5      S:  d  [        U5      S:  a  M  / UQU[           PnU[           /UQnXU4v   GM     GM"     S S S 5        g s  snf s  sn
f s  sn
f ! , (       d  f       g = f7f)Nr   r   	r   r   r   P   )r'   r   r   r   r   r   r   r   splitr   STARTENDgetUNK_IDX)r%   r&   r"   r#   r$   r   r   
line_splitsrc_seq	src_wordswsrc_idstrg_seq	trg_wordstrg_idstrg_ids_nextr!   	file_namer    s                   r   readerreader_creator.<locals>.readerM   s    +Hi@\\(- "#!"I>>**95 	!"  
 MM$/D;;=D!%!3!3D!9J:!+ (mG 'I #(!9)!9S!9!9A !Q0!9  
 )mG 'IAJKA||Aw7GK 7|b(CL2,= #<W#<hsm#<L'99G!L88- 0  .- L+ .-sM   $F5F$$FF$BF$' F
F$"F
AF$	F5F$$
F2.F5 )r    r:   r!   r;   s   ``` r   reader_creatorr>   L   s    9B Mr   z2.0.0zpaddle.text.datasets.WMT14r   z>Please use new dataset API which supports paddle.io.DataLoader)since	update_tolevelreasonc                 ~    [        [        R                  R                  R	                  [
        S[        5      SU 5      $ )z
WMT14 training set creator.

It returns a reader creator, each sample in the reader is source language
word ID sequence, target language word ID sequence and next word ID
sequence.

:return: Training reader creator
:rtype: callable
wmt14ztrain/trainr>   paddledatasetcommondownload	URL_TRAIN	MD5_TRAINr!   s    r   trainrM   q   s3    " &&y'9E r   c                 ~    [        [        R                  R                  R	                  [
        S[        5      SU 5      $ )z
WMT14 test set creator.

It returns a reader creator, each sample in the reader is source language
word ID sequence, target language word ID sequence and next word ID
sequence.

:return: Test reader creator
:rtype: callable
rD   z	test/testrE   rL   s    r   testrO      s3    " &&y'9E r   c                 ~    [        [        R                  R                  R	                  [
        S[        5      SU 5      $ )NrD   zgen/genrE   rL   s    r   genrQ      s3     &&y'9E r   c                 >   [         R                  R                  R                  [        S[
        5      n[        X 5      u  p4U(       aH  UR                  5        VVs0 s H  u  pVXe_M	     nnnUR                  5        VVs0 s H  u  pVXe_M	     nnnX44$ s  snnf s  snnf NrD   )rF   rG   rH   rI   rJ   rK   r'   items)r!   reverser    r%   r&   kvs          r   get_dictrX      s     ~~$$--i)LH'<H%-^^%56%5TQAD%56%-^^%56%5TQAD%56 76s   B?Bc                      [         R                  R                  R                  [        S[
        5        [         R                  R                  R                  [        S[        5        g rS   )rF   rG   rH   rI   rJ   rK   	URL_MODEL	MD5_MODELr=   r   r   fetchr\      s<     NN""9gyA
NN""9gyAr   )T)__doc__r   paddle.dataset.commonrF   paddle.utilsr   __all__URL_DEV_TESTMD5_DEV_TESTrJ   rK   rZ   r[   r-   r.   UNKr0   r'   r>   rM   rO   rQ   rX   r\   r=   r   r   <module>rd      s     #
 Q  2 >	.	?	.	
"8"J 
*
K	$ 
*
K	$ 
*
K	 
*
K	 
*
K	BBr   