
    |-j                     8   U d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	 ddl
mZmZ  ej        e          Z e            rd dlZerddlmZ d Z	 	 	 	 d"d	ed
         ded         dedz  dedz  dedef         f
dZ	 	 	 	 	 d#d	ed
         ded         dedz  dedz  dededef         fdZ	 	 	 	 d"d	ed
         ded         dedz  dedz  dedef         f
dZ	 	 	 d$d	d
ded         dedz  dedz  dedef         f
dZ	 	 	 d$d	d
ded         dedz  dedz  dedef         f
dZ	 	 	 d$d	d
ded         dedz  dedz  dedef         f
dZeeeeeedZeeededef         f         f         e d<    G d de	          Z! G d d          Z"d%d	e"d e#dz  fd!Z$dS )&    N)Callablewraps)TYPE_CHECKINGOptional	TypedDict   )is_torch_availablelogging)PreTrainedConfigc                 V     ddddt                     d fd	            }|S )ad  
    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

    Args:
        rope_forward (Callable):
            The forward pass of the RoPE implementation.

    Returns:
        The decorated forward pass.
    Nc                 b   t          j        |          dz   }|#| j        }| j        }d}| j        j        d         }n=| j        |         }t          | | d          }| d}| j        j        |         d         }||k    rkt          | | d          s't          |         }	 |	| j        ||dz   |          \  }
}| 	                    | d	|
d
           t          | | d|
           dS |                    |          }| 	                    | d	|d
           t          | | d|           dS )zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.r	   N  original_max_position_embeddings_original_inv_freq__long_inv_freqseq_len
layer_typeinv_freqF
persistentlong_inv_freqoriginal_inv_freq)torchmax	rope_typer   configrope_parametersgetattrhasattrROPE_INIT_FUNCTIONSregister_buffersetattrto)selfposition_idsdevicer   r   r   r   prefixr   rope_init_fnr   r   s               `/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_updatez6dynamic_rope_update.<locals>.longrope_frequency_update/   s   )L))A-I $ 6F/3{/JKm/n,,z2I '.O.O.O P P"%%%F/3{/J:/V20, 5554J!>!>!>?? 29=#/<K<q@)	$ $ $ q   F!4!4!4mPU VVVDV222MBBBBB !2 4 4V < <  F!4!4!46GTY ZZZDV6668IJJJJJ    c                    t          j        |          dz   }|| j        }| j        }| j        }d}n>| j        |         }t          | | d| j                  }t          | | d          }| d}||k    rXt          |         }	 |	| j        |||          \  }
| _        | 	                    | d|
d	
           t          | | d|           || j        k     rj|| j        k    ra|                    |          }| 	                    | d|d	
           t          | | d|           t          | | d| j                   dS dS dS )a  
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        r	   Nr   _max_seq_len_cachedr   r   r   r   Fr   r   )r   r   r   max_seq_len_cachedr   r!   r#   r   attention_scalingr$   r%   original_max_seq_lenr&   )r'   r(   r)   r   r   r   r1   r   r*   r+   r   s              r,   dynamic_frequency_updatez5dynamic_rope_update.<locals>.dynamic_frequency_updateR   s    )L))A-I!%!8 $ 6FFz2I!(*/Q/Q/QSWSj!k!k '.O.O.O P P"%%%F'''.y9L/;|%	0 0 0,Hd,   F!4!4!4h5 QQQDZ<<<gFFFT...3EHa3a3a !2 4 4V < <  F!4!4!46GTY ZZZDV6668IJJJDZ<<<d>WXXXXX /.3a3ar.   c                     || j         n| j         |         }|d|ini }d|v r | |fd|j        i| n|dk    r | |fd|j        i|  | ||fi |S )Nr   dynamicr)   longrope)r   r)   )	r'   xr(   r   r   kwargsr4   r-   rope_forwards	         r,   wrapperz$dynamic_rope_update.<locals>.wrapperx   s    &0&8DNNdnZ>X	/9/E,
++2	!!$$T<SSSFSSSS*$$%%dLTTTVTTT|D!\<<V<<<r.   Nr   )r:   r;   r4   r-   s   ` @@r,   dynamic_rope_updater=   "   s{    !K !K !K !KF$Y $Y $Y $YL <= = = = = = = = Nr.   r   r   r)   ztorch.devicer   r   returnztorch.Tensorc                    |                                   || j        |         n| j        }|d         }|d         }|                    dd          }t          | dd          p| j        | j        z  }t          ||z            }	d}
d|t          j        d|	dt          j	        	          
                    |t          j        
          |	z  z  z  }||z  }||
fS )aX  
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Nfactor
rope_thetapartial_rotary_factor      ?head_dimr      dtyper)   rG   )standardize_rope_paramsr    getr!   hidden_sizenum_attention_headsintr   arangeint64r&   float)r   r)   r   r   rope_parameters_dictr@   baserB   rD   dimattention_factorr   s               r,   '_compute_linear_scaling_rope_parametersrU      s    B ""$$$AKAW61*==]c]s!(+F  -D0445LcRRvz400dF4F&Jd4dH
h..
/
/C du|AsAU[IIILLTZbgbmLnnqttuvH
 H%%%r.   rD   head_dim_keyc                 h   |                                   || j        |         n| j        }t          | |d          p| j        | j        z  }|d         }|                    dd          }|                    dd          }	d}
t          |	|z  dz            }d|t          j        dd|z  dt          j	                  
                    |t          j        	          |z  z  z  }|dz  |z
  }|dk    r8t          j        |t          j        |t          j        |
          fd          }n|}||z  }||
fS )a  
    Computes the inverse frequencies with proportional RoPE.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): The proportion of the embedding dimension
                to apply rotary positional encoding, e.g., [0.0, 0.25, 0.5, 0.75, 1.0]. Unlike other RoPE functions
                that use this parameter, proportional RoPE will always return an encoding that is the size of
                `head_dim`.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    NrA   r@   rC   rB   rE   r   rF   rH   rG   r)   )rS   )rI   r    r!   rK   rL   rJ   rM   r   rN   rO   r&   rP   catzerosfloat32)r   r)   r   r   rV   rQ   rD   rR   r@   rope_proportionrT   rope_anglesinv_freq_rotatednope_anglesr   s                  r,   %_compute_proportional_rope_parametersr`      sd   J ""$$$AKAW61*==]c]sv|T22ff6HFLf6fH-D!%%h44F*../FLLOo0A566KLAOQekBBBEEV[`[fEggjrr	t
 a-+-KQ9 Ku}VLLL 
 
 
 $H%%%r.   c                    |                                   || j        |         n| j        }|d         }|                    dd          }t          | d| j        | j        z            }t          ||z            }|d         }	d}
|| j        }nit          |t          j
                  r:t          j        |t          j        | j        |j        |j                            }nt          || j                  }||	|z  | j        z  |	dz
  z
  ||d	z
  z  z  z  }d|t          j        d
|d	t          j                                      |t          j                  |z  z  z  }||
fS )a
  
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at
                inference time
            *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor`
                will be accessed. The value of `factor` is used to determine the new base frequency, along with the
                current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the
                computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this
                factor has no effect. If seq_len <= max_position_embeddings, this factor effectively stretches the
                context window using an exponent derived from `dim`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time. If `None` or shorter than
            max_position_embeddings, this value will be overridden by max_position_embeddings.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    NrA   rB   rC   rD   r@   rX   r	   rE   r   rF   rH   )rI   r    rJ   r!   rK   rL   rM   max_position_embeddings
isinstancer   TensormaximumtensorrG   r)   r   rN   rO   r&   rP   )r   r)   r   r   rQ   rR   rB   rD   rS   r@   rT   r   s               r,   _compute_dynamic_ntk_parametersrg     s   V ""$$$AKAW61*==]c]s-D0445LcRRvz6+=A[+[\\H
h..
/
/C!(+F 0	GU\	*	* ?-L7w}U\Ucddd
 

 gv=>> FW$v'EE&ST*U[^behibi[jkkDdu|AsAU[IIILLTZbgbmLnnqttuvH%%%r.   c                    |                                   || j        |         n| j        }|d         }|                    dd          }t          | d| j        | j        z            }t          ||z            }|d         }	|                    d          }
|                    d          }|                    d	          }|d
         }|	
| j        |z  }	dd}|
6|r)|r't           ||	|           ||	|          z            }
n ||	          }
|                    d          pd}|                    d          pd}d fd}d }|t          j
        d|d                              |t          j                  |z  z  }d|z  }d|	|z  z  }| j                            dd          } |||||||          \  }}d ||||dz                                |t          j                  z
  }|d|z
  z  ||z  z   }||
fS )aD  
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://huggingface.co/papers/2309.00071)

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin.
                    If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as available.
                *   `beta_fast` (`float`, *optional*, defaults to 32): Parameter to set the boundary for extrapolation
                    (only) in the linear ramp function.
                *   `beta_slow` (`float`, *optional*, defaults to 1): Parameter to set the boundary for interpolation
                    (only) in the linear ramp function.
                *   `factor` (`float`, *optional*): The scaling factor applied when interpolating the position IDs to
                    extend the possible context length. Additionally, if `attention_factor` is None, the log of this
                    value is used to compute a value for `attention_factor`, possibly in conjunciton with `mscale` and
                    `mscale_all_dim`, if provided.
                *   `mscale` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale` acts scalar augmenting `log(factor)` when computing the
                    numerator for the inferred value of `attention_factor`. If not provided, `attention_factor` will be
                    calculated based on `factor` only.
                *   `mscale_all_dim` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale_all_dim` acts scalar augmenting `log(factor)` when computing
                    the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor`
                    will be calculated based on `factor` only.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used during pretraining.
                *   `truncate` (`bool`, *optional*): Whether to truncate the correction range.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    NrA   rB   rC   rD   r@   rT   mscalemscale_all_dimr   r	   c                 L    | dk    rdS d|z  t          j        |           z  dz   S )Nr	   rC   g?)mathlog)scaleri   s     r,   
get_mscalez,_compute_yarn_parameters.<locals>.get_mscale  s,    A::3V|dhuoo-33r.   	beta_fast    	beta_slowc                     |t          j        || dz  t           j        z  z            z  dt          j        |          z  z  S )zPInverse dimension formula to find the dimension based on the number of rotationsrE   )rl   rm   pi)num_rotationsrS   rR   rb   s       r,   find_correction_dimz5_compute_yarn_parameters.<locals>.find_correction_dim  sA    dh6-!:Kdg:UVWWW\]`d`him`n`n\noor.   c                      | |||          } ||||          }|r(t          j        |          }t          j        |          }t          |d          t	          ||dz
            fS )z.Find dimension range bounds based on rotationsr   r	   )rl   floorceilr   min)	low_rothigh_rotrS   rR   rb   truncatelowhighrv   s	           r,   find_correction_rangez7_compute_yarn_parameters.<locals>.find_correction_range  st    !!'36MNN""8S$8OPP 	#*S//C9T??D3{{CcAg....r.   c                     | |k    r|dz  }t          j        |t           j                  | z
  || z
  z  }t          j        |dd          }|S )NgMbP?rF   r   r	   )r   rN   r[   clamp)rz   r   rS   linear_func	ramp_funcs        r,   linear_ramp_factorz4_compute_yarn_parameters.<locals>.linear_ramp_factor  sQ    #::5LC|Cu}===Cc	RKQ22	r.   r   rE   rH   r}   T)r	   )rI   r    rJ   r!   rK   rL   rM   rb   rP   r   rN   r&   )r   r)   r   r   rQ   rR   rB   rD   rS   r@   rT   ri   rj   r   ro   rp   rr   r   r   	pos_freqsinv_freq_extrapolationinv_freq_interpolationr}   r~   r   inv_freq_extrapolation_factorr   rv   s                              @r,   _compute_yarn_parametersr   G  s   t ""$$$AKAW61*==]c]s-D0445LcRRvz6+=A[+[\\H
h..
/
/C!(+F+//0BCC!%%h//F)--.>??N';<^'_$
 ~/2RR4 4 4 4  	2n 	2$ZZ%?%?**VUcBdBd%dee)z&11 %((55;I$((55:Ip p p/ / / / /   aa003363UUX[[\I 9_ FY$67%))*d;;H%%iCGgiqrrIC %&(:(:3cQh(O(O(R(RZ`hmhs(R(t(t$t!!&C"CD
 #@
@	A  %%%r.   c                 D   |                                   || j        |         n| j        }|d         }|                    dd          }t          | d| j        | j        z            }t          ||z            }|d         }	|d         }
|                    d          }|                    d	          }|d
         }|
| j        |z  }|G|dk    rd}n>t          j	        dt          j
        |          t          j
        |          z  z             }|r(||k    r"t          j        |	t          j        |          }n!t          j        |
t          j        |          }t          j        d|dt          j        |                                          |z  }d|||z  z  z  }||fS )a  
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during
                pretraining. If not provided, defaults to `max_position_embeddings`.
            *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys
                will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, inferred from
                    the value of `factor`.
                *   `factor` (`float`, *optional*): The scaling factor to apply to the RoPE embeddings. If both
                    `max_position_embeddings` and `original_max_position_embeddings` are provided, this value will be
                    overridden s the ratio between those values.
                *   `long_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is provided and greater than `original_max_position_embeddings`.
                *   `short_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    NrA   rB   rC   rD   long_factorshort_factorr@   rT   r   r	   rX   r   rE   )rI   r    rJ   r!   rK   rL   rM   rb   rl   sqrtrm   r   rf   r[   rN   rO   rP   )r   r)   r   r   rQ   rR   rB   rD   rS   r   r   r@   rT   r   ext_factorsinv_freq_shaper   s                    r,   _compute_longrope_parametersr     s   d ""$$$AKAW61*==]c]s-D0445LcRRvz6+=A[+[\\H
h..
/
/C&}5K'7L!%%h//F+//0BCC';<^'_$
 ~/2RR S=="#yTXf-=-=Ii@j@j-j)jkk  U7===l;emFSSSl<u}VTTT\!S!5;vNNNTTVVY\\NkD.$889H%%%r.   c                    |                                   || j        |         n| j        }|d         }|                    dd          }t          | dd          p| j        | j        z  }t          ||z            }d}	d|t          j        d|dt          j	                  
                    |t          j        	          |z  z  z  }
|d
         }|d         }|d         }|d         }||z  }||z  }dt          j        z  |
z  }t          j        ||k    |
|z  |
          }||z  |z
  ||z
  z  }d|z
  |z  |z  ||z  z   }||k      ||k     z  }t          j        |||          }||	fS )a
  
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the
                    wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies
                    during smoothing.
                *   `high_freq_factor` (`float`): The scale factor used to compute `high_freq_wavelen` and
                    the value for the denominator of the smoothing factor prior to the `low_freq_factor` shift.
                *   `low_freq_factor` (`float`): The scale factor used to compute `low_freq_wavelen` and
                    the shift applied to the numerator and denominator of the smoothing factor.
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used
                    during pretraining. If not provided, the function falls back to `max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    NrA   rB   rC   rD   r   rE   rF   rH   r@   low_freq_factorhigh_freq_factorr   r	   )rI   r    rJ   r!   rK   rL   rM   r   rN   rO   r&   rP   rl   rt   where)r   r)   r   r   rQ   rR   rB   rD   rS   rT   r   r@   r   r   old_context_lenlow_freq_wavelenhigh_freq_wavelenwaveleninv_freq_llamasmooth_factorsmoothed_inv_freqis_medium_freqs                         r,   _compute_llama3_parametersr   &  s   Z ""$$$AKAW61*==]c]s  -D0445LcRRvz400dF4F&Jd4dH
h..
/
/C du|AsAU[IIILLTZbgbmLnnqttuvH!(+F*+<=O+,>?*+MNO&8'*::$'kH$G [+;!;X=NPXYYN$w.@EUXgEghM]*n<vEXfHff!223BR8R6SSN[1BNSSN+++r.   )linearr6   yarnr7   llama3proportional.r#   c                       e Zd ZU dZedz  ed<   edz  ed<   edz  ed<   edz  ed<   edz  ed<   edz  ed<   edz  ed	<   edz  ed
<   ee         dz  ed<   ee         dz  ed<   edz  ed<   edz  ed<   dS )RopeParametersu  
    Args:
        rope_theta (`float`, *optional*, defaults to `RotaryEmbeddingConfigMixin.default_theta`):
            The base period of the RoPE embeddings. Optional in serialized configs — if omitted,
            the model's `default_theta` (typically 10000.0) is used.
        rope_type (`str`, *optional*, defaults to "default"):
            The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
            'llama3'], with 'default' being the original RoPE implementation.
        partial_rotary_factor (`float`, *optional*):
            The percentage of the query and key head embedding on which RoPE will be applied.
        factor (`float`, *optional*):
            Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
            most scaling types, a `factor` of x will enable the model to handle sequences of length x *
            original maximum pre-trained length.
        original_max_position_embeddings (`int`, *optional*):
            Used with 'yarn', 'longrope' and 'llama3'. The original max position embeddings used during
            pretraining.
        attention_factor (`float`, *optional*):
            Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
            computation. If unspecified, it defaults to value recommended by the implementation, using the
            `factor` field to infer the suggested value.
        beta_fast (`float`, *optional*):
            Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
            ramp function. If unspecified, it defaults to 32.
        beta_slow (`float`, *optional*):
            Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
            ramp function. If unspecified, it defaults to 1.
        short_factor (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to short contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2
        long_factor (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to long contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2
        low_freq_factor (`float`, *optional*):
            Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
        high_freq_factor (`float`, *optional*):
            Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    NrA   r   rB   r@   r   rT   rp   rr   r   r   r   r   )	__name__
__module____qualname____doc__rP   __annotations__strrM   list r.   r,   r   r     s         ' 'R Tz 4<'''DL&)Dj000dl"""t|t|u+$$$$et####T\!!!dl"""""r.   r   c                   $   e Zd ZdZdZ e            Zd Zd ZddZ	dd	e
d
edz  fdZdd	e
d
edz  fdZdd	e
d
edz  fdZdd	e
d
edz  fdZdd	e
d
edz  fdZdd	e
d
edz  fdZdd	e
d
edz  fdZe	 	 ddededededz  d
edz  f
d            ZdS )RotaryEmbeddingConfigMixinz[
    A Mixin containing the functionality to standardize and validate RoPE parameters.
    g     @c                    |                     dd           }|p| j        | _        | j        | j        ni | _        |                     dt          | d| j                            }| j                            d|           |                    dt          | dd                     }|+| j                            d|           | j        dhz  | _        |                                  |S )Nrope_scalingrA   rB   )popr    r!   default_theta
setdefaultrJ   ignore_keys_at_rope_validationrI   )r'   r9   r   rA   rB   s        r,   convert_rope_params_to_dictz6RotaryEmbeddingConfigMixin.convert_rope_params_to_dict  s    zz.$77+Ct/C7;7K7Wt33]_ ZZgdL$J\.].]^^
''jAAA &

+BGDRikoDpDp q q , ++,CEZ[[[262UYpXq2qD/$$&&&r.   c                    t          | dd          }t          | dd          }t          | dd          pi }t          | dd          }|s|st                              d           dS |:|i k    s4t          |                                                              |          s|                    d|                    dd	                     |                    d|           |||d<   |d         d
v r@t          | d          r| j	        | j
        d<   n| j
                            d| j                   nt          |          D ]}||                             d||                             dd	                     ||                             d|           ||||         d<   ||         d         d
v r&| j
        |                             d| j                   || _
        dS )z
        Helper to standardize the config's rope params field by ensuring the params are defined for each
        later type. For old model the fn will duplicate a single rope param in each layer type (backward compatibility)
        rA   NrB   r    layer_typeszG`standardize_rope_params` was called but no RoPE parameters were found.r   typedefault)r   r   r7   r   )r!   loggerwarningsetkeysissubsetr   rJ   r"   r   r    rb   )r'   rA   rB   r    r   r   s         r,   rI   z2RotaryEmbeddingConfigMixin.standardize_rope_params  s)    T<66
 '.Et L L!$(94@@FBdM488    	:  	NNdeeeF Or$9$9_EYEYE[E[A\A\AeAefqArAr$9&&{O4G4GPY4Z4Z[[[&&|Z@@@$0;P 78 {+/MMM4!CDD v PTOtD()KLL(334VX\Xtuuu "+.. 	 	

+66{OT^D_DcDcdjluDvDvwww
+66|ZPPP(4K`OJ/0GH":.{;?]]](4??:D<X    /r.   r'   r   c                    t          | dd          }|sdS t          | dd          :t          |                                                              | j                  rnd|i}|                                D ]y}|                    d|                    dd                    }t          | d| d	d          }||d<   | ||| j        
           [t          	                    d| d           zdS )zY
        Validate the RoPE config arguments, given a `"PreTrainedConfig"` object
        r    Nr   full_attentionr   r   r   
_validate__rope_parametersignore_keyszMMissing validation function in 'RotaryEmbeddingConfigMixin' for 'rope_type'='')
r!   r   r   r   r   valuesrJ   r   r   r   )r'   rQ   r    r   validation_fns        r,   validate_ropez(RotaryEmbeddingConfigMixin.validate_rope  s5     't->EE# 	F4--9cBVB[B[B]B]>^>^>g>g?
 ?
9 $46J#K 3::<< 
	 
	O'++K9L9LVU^9_9_``I#D*Ry*R*R*RTXYYM+4OK((o4;^_____pdmppp   
	 
	r.   Nr    r   c                     dh}dh}t          |                                          }|d         }|                     |||||           d S )Nr   rA   optional_keysr   )r   r   _check_received_keys)r'   r    r   required_keysr   received_keysr   s          r,   !_validate_default_rope_parametersz<RotaryEmbeddingConfigMixin._validate_default_rope_parameters$  sf    $%O002233#K0	!!}m=^i 	" 	
 	
 	
 	
 	
r.   c                 0   ddh}dh}t          |                                          }|d         }|                     |||||           |d         }|"t          |t          t
          f          r|dk     rt                              d|            d S d S Nr   r@   rA   r   rC   B`rope_parameters`'s factor field must be a float or int >= 1, got r   r   r   rc   rP   rM   r   r   r'   r    r   r   r   r   r   r@   s           r,    _validate_linear_rope_parametersz;RotaryEmbeddingConfigMixin._validate_linear_rope_parameters-      $h/%O002233#K0	!!}m=^i 	" 	
 	
 	
 !*>FUCL!A!A>Vc\\NNh`fhhiiiii FR\r.   c                 0   ddh}dh}t          |                                          }|d         }|                     |||||           |d         }|"t          |t          t
          f          r|dk     rt                              d|            d S d S r   r   r   s           r,   !_validate_dynamic_rope_parametersz<RotaryEmbeddingConfigMixin._validate_dynamic_rope_parameters:  r   r.   c           	         h d}h d}t          |                                          }|d         }|                     |||||           |d         }|"t          |t          t
          f          r|dk     rt                              d|            |                    d          }|8t          |t                    r|d	k     rt                              d
|            |                    d          }	|	9t          |	t          t
          f          st                              d|	            |                    d          }
|
9t          |
t          t
          f          st                              d|
            |	pd|
pdk     r!t                              d|	 d|
 d           |d         }| j	        |z  }||k    r,|dk    r(t          
                    d| d| d| d           d S d S d S )N>   r@   r   r   >   ri   r}   rp   rr   rA   rj   rT   r   r   r@   rC   r   rT   r   zO`rope_parameters`'s attention_factor field must be a float greater than 0, got rp   z@`rope_parameters`'s beta_fast field must be a float or int, got rr   z@`rope_parameters`'s beta_slow field must be a float or int, got rq   r	   zR`rope_parameters`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)r   zKThe explicitly set RoPE scaling factor (config.rope_parameters['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'original_max_position_embeddings' fields in the model config.)r   r   r   rc   rP   rM   r   r   rJ   rb   warning_once)r'   r    r   r   r   r   r   r@   rT   rp   rr   r   implicit_factors                r,   _validate_yarn_rope_parametersz9RotaryEmbeddingConfigMixin._validate_yarn_rope_parametersG  s   SSS
 
 
 O002233#K0	!!)]M=fq!rrr *>FUCL!A!A>Vc\\NNh`fhhiii*../ABB'<Le1T1T'XhklXlXlNNtbrtt   $''44	 Is|)L)L NNi^giijjj#''44	 Is|)L)L NNi^giijjjO	Q//NN^en ^ ^:C^ ^ ^   ,;;]+^(69YYf$$A)=)=~^d ~ ~ #	~ ~ CI	~ ~ ~     %$)=)=r.   c                    h d}h d}t          |                                          }|d         }|                     |||||           |                    dd          }t	          | d| j        | j        z            }t          ||z            }	|                    d          }
t          |
t                    rt          d	 |
D                       st                              d
|
            t          |
          |	dz  k    r0t                              d|	dz   dt          |
                      |                    d          }t          |t                    rt          d |D                       st                              d|            t          |          |	dz  k    r0t                              d|	dz   dt          |                      |                    d          }|d         }||t                              d           n^||t                              d           n?t          |t          t          f          r|dk     rt                              d|            |                    d          }|At          |t          t          f          r|dk     r!t                              d|            d S d S d S )N>   r   r   r   r   >   r@   rA   rT   r   r   rB   rC   rD   r   c              3   N   K   | ] }t          |t          t          f          V  !d S r<   rc   rM   rP   .0r8   s     r,   	<genexpr>zPRotaryEmbeddingConfigMixin._validate_longrope_rope_parameters.<locals>.<genexpr>  s1      6i6iWXz!c5\7R7R6i6i6i6i6i6ir.   zF`rope_parameters`'s short_factor field must be a list of numbers, got rE   z8`rope_parameters`'s short_factor field must have length z, got r   c              3   N   K   | ] }t          |t          t          f          V  !d S r<   r   r   s     r,   r   zPRotaryEmbeddingConfigMixin._validate_longrope_rope_parameters.<locals>.<genexpr>  s1      5g5gVWjS%L6Q6Q5g5g5g5g5g5gr.   zE`rope_parameters`'s long_factor field must be a list of numbers, got z7`rope_parameters`'s long_factor field must have length r@   r   av  This model config has set a `rope_parameters['original_max_position_embeddings']` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.z4Missing required keys in `rope_parameters`: 'factor'r   rT   g        zV`rope_parameters`'s attention_factor field must be a float or int greater than 0, got )r   r   r   rJ   r!   rK   rL   rM   rc   r   allr   r   lenr   rP   )r'   r    r   r   r   r   r   rB   rD   rS   r   r   r@   r   rT   s                  r,   "_validate_longrope_rope_parametersz=RotaryEmbeddingConfigMixin._validate_longrope_rope_parameters{  s*   hhhDDDO002233#K0	!!)]M=fq!rrr / 3 34KS Q Q4T-=AY-YZZ(2233&**>::<.. 	t36i6i\h6i6i6i3i3i 	tNNrdprrsss|q((NNn3RS8nn[^_k[l[lnn   &))-88;-- 	r#5g5g[f5g5g5g2g2g 	rNNpcnppqqq{sax''NNl#QR(llZ]^iZjZjll   !$$X..+:;]+^( >>JE    ^ @ HNNQRRRRFUCL11 	jVc\\NNh`fhhiii*../ABB'<LuVYl1[1['_oru_u_uNN{iy{{     ('_u_ur.   c                 X   h d}|d         }t          |                                          }|                     ||||           |d         }|"t          |t          t
          f          r|dk     rt                              d|            |d         }|d         }|t          |t          t
          f          st                              d	|            |t          |t          t
          f          st                              d
|            ||k    r t                              d| d|            |d         }	|	t          |	t
                    st                              d|	            |	| j        k    r't                              d|	 d| j                    d S d S )N>   r@   r   rA   r   r   r   r   r   r@   rC   r   r   r   zF`rope_parameters`'s low_freq_factor field must be a float, or int got zG`rope_parameters`'s high_freq_factor field must be a float or int, got zf`rope_parameters`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=r   zS`rope_parameters`'s original_max_position_embeddings field must be an integer, got zj`rope_parameters`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=)	r   r   r   rc   rP   rM   r   r   rb   )
r'   r    r   r   r   r   r@   r   r   r   s
             r,    _validate_llama3_rope_parametersz;RotaryEmbeddingConfigMixin._validate_llama3_rope_parameters  s   
 
 
 $K0	O002233!!)]MWb!ccc *>FUCL!A!A>Vc\\NNh`fhhiii)*;<*+=>"*_ucl*S*S"NNudsuuvvv#:6FPS+U+U#NNlZjll   ..NNL#L L:IL L  
 ,;;]+^(+3:Ffhk;l;l3NN636 6   ,t/KKKNNq3q qRVRnq q     LKr.   c                     ddh}|d         }t          |                                          }|                     ||||           |                    d          }|t                              d           d S d S )Nr   rA   r   rB   z`rope_parameters`'s partial_rotary_factor is None. This will default to 1.0 in the computation, making this equivalent to the linear_scaling RoPE type. Provide a value in the range [0.0, 1.0) to make use of the proportional RoPE funcitonality.)r   r   r   rJ   r   r   )r'   r    r   r   r   r   rB   s          r,   &_validate_proportional_rope_parameterszARotaryEmbeddingConfigMixin._validate_proportional_rope_parameters  s    $l3#K0	O002233!!)]MWb!ccc / 3 34K L L (NNC     )(r.   r   r   r   r   c                 N   d|v r|dhz  }|                     d           |pt                      }d|vr|                     d           ||t          |          z  }||z
  }|rt          d|  d|           ||z
  |z
  }|r"t                              d|  d|            dS dS )z\Compare the received keys in `config.rope_parameters` against the expected and optional keysr   r   rB   Nz<Missing required keys in `rope_parameters` for 'rope_type'='z': z8Unrecognized keys in `rope_parameters` for 'rope_type'=')addr   KeyErrorr   r   )r   r   r   r   r   missing_keysunused_keyss          r,   r   z/RotaryEmbeddingConfigMixin._check_received_keys  s     ]""fX%Mk***%."-775666 "S---M$}4 	xvZcvvhtvvwww#m3mC 	sNNqV_qqdoqqrrrrr	s 	sr.   )r'   r   r<   )NN)r   r   r   r   r   r   r   r   rI   r   dictr   r   r   r   r   r   r   staticmethodr   r   r   r.   r,   r   r     s&         M%(SUU"  *./ ./ ./`   :
 
 
TWZ^T^ 
 
 
 
j j jSVY]S] j j j jj j jTWZ^T^ j j j j2 2d 2QTW[Q[ 2 2 2 2h0 0$ 0UX[_U_ 0 0 0 0d) ) )SVY]S] ) ) ) )V d Y\_cYc     
 %)"&s sss s Tz	s
 4Zs s s \s s sr.   r   r   c                     t          j        dt                     |                                  |                                  dS )zq
    This is a deprecated function.
    It has been kept for backward compatibility with custom code models.
    aX  `rope_config_validation` is deprecated and has been removed. Its functionality has been moved to RotaryEmbeddingConfigMixin.validate_rope method. PreTrainedConfig inherits this class, so please call self.validate_rope() instead. Also, make sure to use the new rope_parameters syntax. You can call self.standardize_rope_params() in the meantime.N)warningswarnFutureWarningrI   r   )r   r   s     r,   rope_config_validationr     sO    
 M	G
 	   ""$$$
r.   )NNNN)NNNNrD   )NNNr<   )%rl   r   collections.abcr   	functoolsr   typingr   r   r   utilsr
   r   
get_loggerr   r   r   configuration_utilsr   r=   rM   r   tuplerP   rU   r`   rg   r   r   r   r#   r   r   r   r   r   r   r   r.   r,   <module>r      s     $ $ $ $ $ $       5 5 5 5 5 5 5 5 5 5 . . . . . . . . 
	H	%	%  LLL 6555555` ` `H ,0'+!	3& 3&'(3&^$3& 4Z3& d
	3&
 >5 !3& 3& 3& 3&n ,0'+!"C& C&'(C&^$C& 4ZC& d
	C&
 C& >5 !C& C& C& C&N ,0'+!	C& C&'(C&^$C& 4ZC& d
	C&
 >5 !C& C& C& C&P (,!	D& D&D&^$D& 4ZD& d
	D&
 >5 !D& D& D& D&R (,!	U& U&U&^$U& 4ZU& d
	U&
 >5 !U& U& U& U&t (,!	L, L,L,^$L, 4ZL, d
	L,
 >5 !L, L, L, L,f 6.$,(9O O T#xU>53H-I(IJJK   5# 5# 5# 5# 5#Y 5# 5# 5#pHs Hs Hs Hs Hs Hs Hs HsV
 #= CRVJ      r.   