
    Αi3                        S r SSKrSSKrSSKJr  SSKrSSKJr  / rSr	Sr
SrSrS	rS
rSrS rSS jrS rS r\" SSSSS9SS j5       r\" SSSSS9SS j5       r\" SSSSS9SS j5       r\" SSSSS9SS j5       r\" SSSSS9S 5       rg)aW  
ACL2016 Multimodal Machine Translation. Please see this website for more
details: http://www.statmt.org/wmt16/multimodal-task.html#task1

If you use the dataset created for your task, please cite the following paper:
Multi30K: Multilingual English-German Image Descriptions.

@article{elliott-EtAl:2016:VL16,
 author    = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. and {Specia}, L.},
 title     = {Multi30K: Multilingual English-German Image Descriptions},
 booktitle = {Proceedings of the 6th Workshop on Vision and Language},
 year      = {2016},
 pages     = {70--74},
 year      = 2016
}
    N)defaultdict)
deprecatedz2http://paddlemodels.bj.bcebos.com/wmt/wmt16.tar.gz 0c38be43600334966403524a40dcd81ei+  iK  z<s>z<e>z<unk>c           	         [        [        5      n[        R                  " U SS9 nUR	                  S5       Hv  nUR                  5       nUR                  5       R                  S5      n[        U5      S:w  a  MC  US:X  a  US   OUS   nUR                  5        H  n	XI==   S-  ss'   M     Mx     S S S 5        [        US	5       n
U
R                  [         S
[         S
[         S
3R                  5       5        [        [        UR!                  5       S SS95       HC  u  pUS-   U:X  a    O7U
R                  US   R                  5       5        U
R                  S5        ME     S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f)Nrmodewmt16/train	   enr      wb
c                     U S   $ )Nr    )xs    T/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/dataset/wmt16.py<lambda>__build_dict.<locals>.<lambda>B   s    AaD    T)keyreverse      
)r   inttarfileopenextractfiledecodestripsplitlenwrite
START_MARKEND_MARKUNK_MARKencode	enumeratesorteditems)tar_file	dict_size	save_pathlang	word_dictfline
line_splitsenwfoutidxwords                r   __build_dictr9   3   s=   C I	hS	)QMM-0D;;=D++D1J:!##'4<*Q-Z]CYY[! ! 1 
* 
i	$

zl"XJb
"=EEGH"9??$.$G
IC Qw)#JJtAw~~'(JJu
 
	 
*	) 
	s   BE-BE>-
E;>
Fc                 R   [         R                  R                  [        R                  R
                  R                  SU SU S35      n[         R                  R                  U5      (       a'  [        [        US5      R                  5       5      U:w  a  [        XXB5        0 n[        US5       n[        U5       HN  u  pxU(       a"  UR                  5       R                  5       XW'   M.  XuUR                  5       R                  5       '   MP     S S S 5        U$ ! , (       d  f       U$ = f)Nwmt16/_.dictrb)ospathjoinpaddledatasetcommon	DATA_HOMEexistsr#   r   	readlinesr9   r)   r!   r    )	r,   r-   r/   r   	dict_pathr0   fdictr7   r2   s	            r   __load_dictrJ   J   s    ''6$q5)II 77>>)$$DD!++-.);X):I	i	%"5)IC!%!4!4!6	36$**,--/0	 * 
  
	 s   /AD
D&c                 t    [        XS:X  a  [        O[        5      n [        XS:X  a  [        O[        5      nX4$ )Nr   )minTOTAL_EN_WORDSTOTAL_DE_WORDSsrc_dict_sizetrg_dict_sizesrc_langs      r   __get_dict_sizerS   ]   s;    d*:M d*:M ''r   c                 $   ^ ^^^^ UUUU U4S jnU$ )Nc            
   3     >#    [        TTT5      n [        TTTS:X  a  SOS5      nU [           nU [           nU [           nTS:X  a  SOSnSU-
  n[        R
                  " TSS9 nUR                  T5       H  nUR                  5       nUR                  5       R                  S5      n	[        U	5      S:w  a  MC  X   R                  5       n
U/U
 Vs/ s H  oR                  X5      PM     sn-   U/-   nX   R                  5       nU Vs/ s H  oR                  X5      PM     nn/ UQUPnU/UQnXU4v   M     S S S 5        g s  snf s  snf ! , (       d  f       g = f7f)	Nr   der   r   r   r   r   r   )rJ   r%   r&   r'   r   r   r   r    r!   r"   r#   get)src_dicttrg_dictstart_idend_idunk_idsrc_coltrg_colr1   r2   r3   	src_wordsr5   src_ids	trg_wordstrg_idstrg_ids_next	file_namerP   rR   r,   rQ   s                   r   readerreader_creator.<locals>.readerh   sm    xAmh$.>dD
 J'(#(#4'!Qg+\\(-i0{{}!ZZ\//5
z?a'&/557	J8AB	1||A.	BCh  '/557	<EFIq<<2IF11&1#.g.44% 1 .- C
 G .-s=   A"E %A-EE
,EE
%E<	E 
E
EE r   )r,   rd   rP   rQ   rR   re   s   ````` r   reader_creatorrg   g   s    #5 #5J Mr   z2.0.0zpaddle.text.datasets.WMT16r   z>Please use new dataset API which supports paddle.io.DataLoader)since	update_tolevelreasonc                     US;  a  [        S5      e[        XU5      u  p[        [        R                  R
                  R                  [        S[        S5      SU UUS9$ )a  
WMT16 train set reader.

This function returns the reader for train data. Each sample the reader
returns is made up of three fields: the source language word index sequence,
target language word index sequence and next word index sequence.


NOTE:
The original like for training data is:
http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz

paddle.dataset.wmt16 provides a tokenized version of the original dataset by
using moses's tokenization script:
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl

Args:
    src_dict_size(int): Size of the source language dictionary. Three
                        special tokens will be added into the dictionary:
                        <s> for start mark, <e> for end mark, and <unk> for
                        unknown word.
    trg_dict_size(int): Size of the target language dictionary. Three
                        special tokens will be added into the dictionary:
                        <s> for start mark, <e> for end mark, and <unk> for
                        unknown word.
    src_lang(string): A string indicating which language is the source
                      language. Available options are: "en" for English
                      and "de" for Germany.

Returns:
    callable: The train reader.
r   rV   zIAn error language type.  Only support: en (for English); de(for Germany).wmt16wmt16.tar.gzr
   r,   rd   rP   rQ   rR   	
ValueErrorrS   rg   rB   rC   rD   downloadDATA_URLDATA_MD5rO   s      r   trainrv      sq    P |#1
 	
 $3h$ M &&//gx
  ## r   c                     US;  a  [        S5      e[        XU5      u  p[        [        R                  R
                  R                  [        S[        S5      SU UUS9$ )a  
WMT16 test set reader.

This function returns the reader for test data. Each sample the reader
returns is made up of three fields: the source language word index sequence,
target language word index sequence and next word index sequence.

NOTE:
The original like for test data is:
http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz

paddle.dataset.wmt16 provides a tokenized version of the original dataset by
using moses's tokenization script:
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl

Args:
    src_dict_size(int): Size of the source language dictionary. Three
                        special tokens will be added into the dictionary:
                        <s> for start mark, <e> for end mark, and <unk> for
                        unknown word.
    trg_dict_size(int): Size of the target language dictionary. Three
                        special tokens will be added into the dictionary:
                        <s> for start mark, <e> for end mark, and <unk> for
                        unknown word.
    src_lang(string): A string indicating which language is the source
                      language. Available options are: "en" for English
                      and "de" for Germany.

Returns:
    callable: The test reader.
rm   HAn error language type. Only support: en (for English); de(for Germany).rn   ro   z
wmt16/testrp   rq   rO   s      r   testry      sq    N |#?
 	

 $3h$ M &&//gx
 ## r   c                     US;  a  [        S5      e[        XU5      u  p[        [        R                  R
                  R                  [        S[        S5      SU UUS9$ )a+  
WMT16 validation set reader.

This function returns the reader for validation data. Each sample the reader
returns is made up of three fields: the source language word index sequence,
target language word index sequence and next word index sequence.

NOTE:
The original like for validation data is:
http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz

paddle.dataset.wmt16 provides a tokenized version of the original dataset by
using moses's tokenization script:
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl

Args:
    src_dict_size(int): Size of the source language dictionary. Three
                        special tokens will be added into the dictionary:
                        <s> for start mark, <e> for end mark, and <unk> for
                        unknown word.
    trg_dict_size(int): Size of the target language dictionary. Three
                        special tokens will be added into the dictionary:
                        <s> for start mark, <e> for end mark, and <unk> for
                        unknown word.
    src_lang(string): A string indicating which language is the source
                      language. Available options are: "en" for English
                      and "de" for Germany.

Returns:
    callable: The validation reader.
rm   rx   rn   ro   z	wmt16/valrp   rq   rO   s      r   
validationr{     sq    L |#?
 	
 $3h$ M &&//gx
 ## r   c                    U S:X  a  [        U[        5      nO[        U[        5      n[        R                  R                  [        R                  R                  R                  SU  SU S35      n[        R                  R                  U5      (       d   S5       e  [        R                  R                  [        R                  R                  R                  S5      n[        XAX5      $ )a  
return the word dictionary for the specified language.

Args:
    lang(string): A string indicating which language is the source
                  language. Available options are: "en" for English
                  and "de" for Germany.
    dict_size(int): Size of the specified language dictionary.
    reverse(bool): If reverse is set to False, the returned python
                   dictionary will use word as key and use index as value.
                   If reverse is set to True, the returned python
                   dictionary will use index as key and word as value.

Returns:
    dict: The word dictionary for the specific language.
r   r;   r<   r=   z Word dictionary does not exist. ro   )rL   rM   rN   r?   r@   rA   rB   rC   rD   rE   rF   rJ   )r/   r-   r   rH   r,   s        r   get_dictr}   B  s    0 t|	>2		>2	''6$q5)II 77>>)$$H&HH$Eww||FNN11;;^LHxD::r   c                      [         R                  R                  R                  R	                  [
        S[        S5        g)zdownload the entire dataset.rn   ro   N)rB   v4rC   rD   rs   rt   ru   r   r   r   fetchr   i  s+     II%%'8^r   )F)r   )__doc__r?   r   collectionsr   rB   paddle.utilsr   __all__rt   ru   rM   rN   r%   r&   r'   r9   rJ   rS   rg   rv   ry   r{   r}   r   r   r   r   <module>r      s  " 
  #  #
?-
.&(&R 
*
K	33l 
*
K	33l 
*
K	11h 
*
K	;;B 
*
K	r   