
    ёi0"                        S SK Jr  S SKrS SKJrJrJr  S SKrS SK	J
r
  S SKJr  \(       a  S SKJr  \S   r/ rSrSrS	rS
rSrSrSrSr " S S\5      rg)    )annotationsN)TYPE_CHECKINGLiteraloverload)_check_exists_and_download)DatasettraintestgenzJhttp://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz 7d7897317ddd8ba0ae5c5fa7248d3ff5z/http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz 0791583d57d5beb693b9414c5b36798cz<s>z<e>z<unk>   c                  "   \ rS rSr% SrS\S'   S\S'   S\S'   S	\S
'   S	\S'   S	\S'   S\S'   S\S'       S         SS jjrSS jr    SS jrSS jr	\
 S   S S jj5       r\
 S   S!S jj5       r\
 S   S"S jj5       rS#S jrSrg)$WMT14-   a  
Implementation of `WMT14 <http://www.statmt.org/wmt14/>`_ test dataset.
The original WMT14 dataset is too large and a small set of data for set is
provided. This module will download dataset from
http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz .

Args:
    data_file(str|None): path to data tar file, can be set None if
        :attr:`download` is True. Default None.
    mode(str): 'train', 'test' or 'gen'. Default 'train'.
    dict_size(int): word dictionary size. Default -1.
    download(bool): whether to download dataset automatically if
        :attr:`data_file` is not set. Default True.

Returns:
    Dataset: Instance of WMT14 dataset
        - src_ids (np.array) - The sequence of token ids of source language.
        - trg_ids (np.array) - The sequence of token ids of target language.
        - trg_ids_next (np.array) - The next sequence of token ids of target language.
Examples:

    .. code-block:: python

        >>> import paddle
        >>> from paddle.text.datasets import WMT14

        >>> class SimpleNet(paddle.nn.Layer):
        ...     def __init__(self):
        ...         super().__init__()
        ...
        ...     def forward(self, src_ids, trg_ids, trg_ids_next):
        ...         return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next)

        >>> wmt14 = WMT14(mode='train', dict_size=50)

        >>> for i in range(10):
        ...     src_ids, trg_ids, trg_ids_next = wmt14[i]
        ...     src_ids = paddle.to_tensor(src_ids)
        ...     trg_ids = paddle.to_tensor(trg_ids)
        ...     trg_ids_next = paddle.to_tensor(trg_ids_next)
        ...
        ...     model = SimpleNet()
        ...     src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next)
        ...     print(src_ids.item(), trg_ids.item(), trg_ids_next.item())
        91 38 39
        123 81 82
        556 229 230
        182 26 27
        447 242 243
        116 110 111
        403 288 289
        258 221 222
        136 34 35
        281 136 137

_Wmt14DataSetModemode
str | None	data_fileint	dict_sizezlist[list[int]]src_idstrg_idstrg_ids_nextdict[str, int]src_dicttrg_dictNc                *   UR                  5       S;   d
   SU 35       eUR                  5       U l        Xl        U R                  c*  U(       d   S5       e[        U[        [
        SU5      U l        US:  d   S5       eX0l        U R                  5         g )Nr	   z1mode should be 'train', 'test' or 'gen', but got z>data_file is not set and downloading automatically is disabledwmt14r   z*dict_size should be set as positive number)lowerr   r   r   	URL_TRAIN	MD5_TRAINr   
_load_data)selfr   r   r   downloads        Z/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/text/datasets/wmt14.py__init__WMT14.__init__p   s     zz|  
 
 	F ?tfE		F 

 JJL	">>! P8 89i(DN
 1}JJJ}"    c           
        SS jn/ U l         / U l        / U l        [        R                  " U R
                  SS9 nU Vs/ s H1  nUR                  R                  S5      (       d  M%  UR                  PM3     nn[        U5      S:X  d   eU" UR                  US   5      U R                  5      U l        U Vs/ s H1  nUR                  R                  S5      (       d  M%  UR                  PM3     nn[        U5      S:X  d   eU" UR                  US   5      U R                  5      U l        U R                   SU R                   3nU Vs/ s H1  nUR                  R                  U5      (       d  M%  UR                  PM3     nnU GH  nUR                  U5       GHx  nUR                  5       nUR                  5       R!                  S	5      n[        U5      S
:w  a  MD  US   n	U	R!                  5       n
["        /U
Q[$        P Vs/ s H#  nU R                  R'                  U[(        5      PM%     nnUS   nUR!                  5       nU Vs/ s H"  oR                  R'                  U[(        5      PM$     nn[        U5      S:  d  [        U5      S:  a  M  / UQU R                  [$           PnU R                  ["           /UQnU R                   R+                  U5        U R                  R+                  U5        U R                  R+                  U5        GM{     GM     S S S 5        g s  snf s  snf s  snf s  snf s  snf ! , (       d  f       g = f)Nc                    0 n[        U 5       H-  u  p4X1:  a"  X2UR                  5       R                  5       '   M,    U$    U$ N)	enumeratestripdecode)fdsizeout_dict
line_countlines        r'   	__to_dict#WMT14._load_data.<locals>.__to_dict   sD    H$-bM 
$6@TZZ\0023O %2
 Or*   r)r   zsrc.dict   r   ztrg.dict/	r   P   )r2   r   returnr   )r   r   r   tarfileopenr   nameendswithlenextractfiler   r   r   r   r0   r/   splitSTARTENDgetUNK_IDXappend)r%   _WMT14__to_dictf	each_itemnames	file_namer@   r5   
line_splitsrc_seq	src_wordswr   trg_seq	trg_wordsr   r   s                    r'   r$   WMT14._load_data   s   	 \\$..s3q "#!"I>>**:6 	!"  
 u:?"?%ammE!H&=t~~NDM "#!"I>>**:6 	!"  
 u:?"?%ammE!H&=t~~NDM99+Qtyyk2I "#!"I>>**95 	!"  
 MM$/D;;=D!%!3!3D!9J:!+ (mG 'I #(!9)!9S!9!9A ))!W5!9  
 )mG 'IFOPi}}00G<iGP 7|b(CL2,= #AW#AdmmC.@#AL#}}U3>g>GLL''0LL''0%%,,\:1 0 - 43 QK 43sb   M$L('L(7AM8$L- L-0AM$L24L2BM*L7
1M)L<
5B*M(M
Mc                    [         R                  " U R                  U   5      [         R                  " U R                  U   5      [         R                  " U R                  U   5      4$ r-   )nparrayr   r   r   )r%   idxs     r'   __getitem__WMT14.__getitem__   sO     HHT\\#&'HHT\\#&'HHT&&s+,
 	
r*   c                ,    [        U R                  5      $ r-   )rB   r   )r%   s    r'   __len__WMT14.__len__   s    4<<  r*   c                    g r-    r%   reverses     r'   get_dictWMT14.get_dict        14r*   c                    g r-   r`   ra   s     r'   rc   rd      re   r*   c                    g r-   r`   ra   s     r'   rc   rd      s     r*   c                    U R                   U R                  p2U(       aH  UR                  5        VVs0 s H  u  pEXT_M	     nnnUR                  5        VVs0 s H  u  pEXT_M	     nnnX#4$ s  snnf s  snnf )a  
Get the source and target dictionary.

Args:
    reverse (bool): whether to reverse key and value in dictionary,
        i.e. key: value to value: key.

Returns:
    Two dictionaries, the source and target dictionary.

Examples:

    .. code-block:: python

        >>> from paddle.text.datasets import WMT14
        >>> wmt14 = WMT14(mode='train', dict_size=50)
        >>> src_dict, trg_dict = wmt14.get_dict()

)r   r   items)r%   rb   r   r   kvs         r'   rc   rd      si    ( "]]DMM()1)9:)9)9H:)1)9:)9)9H:!! ;:s   A*A0)r   r   r   r   r   r   r   r   )Nr
   T)
r   r   r   r   r   r   r&   boolr=   None)r=   rn   )rY   r   r=   zGtuple[npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_]])r=   r   ).)rb   zLiteral[True]r=   z%tuple[dict[int, str], dict[int, str]])rb   zLiteral[False]r=   z%tuple[dict[str, int], dict[str, int]])rb   rm   r=   zMtuple[dict[str, int], dict[str, int]] | tuple[dict[int, str], dict[int, str]])F)__name__
__module____qualname____firstlineno____doc____annotations__r(   r$   rZ   r]   r   rc   __static_attributes__r`   r*   r'   r   r   -   s   7r N!! !%")   	
  
8<;|



! '*4$4	.4 4 (+4%4	.4 4 !	0 "r*   r   )
__future__r   r>   typingr   r   r   numpyrW   paddle.dataset.commonr   	paddle.ior   numpy.typingnptr   __all__URL_DEV_TESTMD5_DEV_TESTr"   r#   rE   rF   UNKrH   r   r`   r*   r'   <module>r      ss    #  3 3  <  67
 Q  2 >	.	
W"G W"r*   