
    RЦi7                         S r SSKrSSKJrJrJr  SSKrSSKJr  SSK	Js  J
r  SSKJr  SSKJr  SSKJr  \R$                  " \5      r " S S	\R*                  5      r " S
 S\5      rg)zJToken-based distillation training task for models with distillation heads.    N)DictOptionalUnion)create_model)unwrap_model   )TrainingTaskc                   l  ^  \ rS rSrSr     SS\\\R                  4   S\	\
   S\
S\	\   S\	\R                     S\	\R                     4U 4S	 jjjrS
\R                  S\R                  4S jr  SS
\R                  S\	\R                     S\	\R                     S\R                  4S jjrSrU =r$ )TokenDistillationTeacher   a  Wrapper for a teacher model used in token-based distillation.

Creates and manages a pre-trained teacher model for token distillation,
handling model creation and normalization differences between teacher and student.

Can be created from:
- A model name string (creates the model internally)
- An existing nn.Module (wraps it with the necessary interface)

Args:
    model_name_or_module: Either a model name string or an nn.Module
    num_classes: Number of output classes (required if model_name_or_module is a string)
    in_chans: Number of input channels (used if model_name_or_module is a string)
    pretrained_path: Optional path to pretrained weights (used if model_name_or_module is a string)
    device: Device to place the model on
    dtype: Model dtype (uses float32 if None)
model_name_or_modulenum_classesin_chanspretrained_pathdevicedtypec           	        > [         TU ]  5         [        U[        5      (       aC  [        R                  SU S35        SS0nU(       a  [        UUS9US'   [        SUUUUUS.UD6nOC[        U[        R                  5      (       a  UnO![        S[        U5      R                   35      eUR                  5         Xl        [        U5      n	[!        U	S	5      (       a9  U	R"                  R%                  S
S5      n
U	R"                  R%                  SS5      nOSn
Sn[&        R(                  " XUS9R+                  SSSS5      n[&        R(                  " XUS9R+                  SSSS5      nU R-                  SUSS9  U R-                  SUSS9  g )Nz,Creating token distillation teacher model: ''
pretrainedT)filer   pretrained_cfg_overlay)
model_namer   r   r   r   z8model_name_or_module must be a string or nn.Module, got pretrained_cfgmean)g
ףp=
?gv/?gCl?std)gZd;O?gy&1?g?r   r   r   mean_kdF
persistentstd_kd )super__init__
isinstancestr_loggerinfodictr   nnModule	TypeErrortype__name__evalmodelr   hasattrr   gettorchtensorviewregister_buffer)selfr   r   r   r   r   r   pretrained_kwargsr0   model_unwrappedr   r   r   r!   	__class__s                 [/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/task/token_distillation.pyr$   !TokenDistillationTeacher.__init__$   s    	*C00LLGH\G]]^_`!-t 4>B( +?!":;
 ! /'! $E ,bii88(EJ4PdKeKnKnJop  	


 'u-?$455"1155f>STD!0044U<QRC(D'C,,t%@EEaQPQRc>CCAr1aPYEBXv%@    inputreturnc                 $    U R                  U5      $ )zForward pass through teacher model.

Args:
    input: Input tensor (should already be normalized for teacher)

Returns:
    Teacher logits
r0   )r7   r>   s     r;   forward TokenDistillationTeacher.forwardY   s     zz%  r=   student_meanstudent_stdc                     Ub  Uc  U$ [         R                  " X R                  5      (       a'  [         R                  " X0R                  5      (       a  U$ X-  U-   U R                  -
  U R                  -  $ )a7  Normalize input to match teacher's expected normalization.

Args:
    input: Input tensor (already normalized for student)
    student_mean: Student normalization mean buffer [1, 3, 1, 1]
    student_std: Student normalization std buffer [1, 3, 1, 1]

Returns:
    Input tensor normalized for the teacher model
)r3   equalr   r!   )r7   r>   rD   rE   s       r;   normalize_input(TokenDistillationTeacher.normalize_inputd   s`      ;#6L;;|\\22u{{;P[P[7\7\L#l2T\\AT[[PPr=   rA   )N   NNN)NN)r.   
__module____qualname____firstlineno____doc__r   r&   r*   r+   r   intr3   r   r   r$   TensorrB   rH   __static_attributes____classcell__r:   s   @r;   r   r      s    * *.-1-1+/3A"'RYY"73A "#3A 	3A
 &c]3A U\\*3A EKK(3A 3Aj	!U\\ 	!ell 	! 4826	Q<<Q #5<<0Q "%,,/	Q
 
Q Qr=   r   c                     ^  \ rS rSrSr         SS\R                  S\\\R                  \	4   S\
\R                     S\
\   S\S\
\   S	\
\   S
\S\
\R                     S\
\R                     S\4U 4S jjjr SS\
\   SS 4S jjrS\R&                  S\R&                  S\\\R&                  4   4S jrSrU =r$ )TokenDistillationTask{   a,  Token-based distillation task for models with distillation heads.

For models like DeiT that have a dedicated distillation token/head that returns
a tuple (main_logits, dist_logits) when distilled_training is enabled. The main
head is trained against ground truth labels while the distillation head matches
teacher outputs.

Supports two distillation modes:
- 'soft': KL divergence with temperature scaling (default)
- 'hard': Cross-entropy with teacher's hard predictions (argmax)

Loss weighting supports two modes:
1. Independent weights: loss = task_loss_weight * task_loss + distill_loss_weight * distill_loss
2. Complementary mode: loss = task_loss_weight * task_loss + (1 - task_loss_weight) * distill_loss
   (used when only task_loss_weight is specified)

Args:
    student_model: Student model with set_distilled_training() method
    teacher_model: Teacher model - can be a model name string, nn.Module, or TokenDistillationTeacher
    criterion: Task loss function for main head (default: CrossEntropyLoss)
    teacher_pretrained_path: Path to teacher pretrained weights (used when teacher_model is a string)
    distill_type: 'soft' for KL-div or 'hard' for CE with teacher argmax
    distill_loss_weight: Weight for distillation loss
    task_loss_weight: Weight for task loss
    temperature: Softmax temperature for soft distillation (ignored for hard)
    device: Device for task tensors/buffers
    dtype: Dtype for task tensors/buffers
    verbose: Enable info logging

Example:
    >>> # With model name string (num_classes/in_chans inferred from student)
    >>> task = TokenDistillationTask(
    ...     student_model=model, teacher_model='deit_base_patch16_224',
    ...     criterion=nn.CrossEntropyLoss(),
    ...     distill_type='soft', temperature=3.0, task_loss_weight=0.5,
    ...     device=torch.device('cuda'),
    ... )
    >>> # With raw model
    >>> task = TokenDistillationTask(
    ...     student_model=model, teacher_model=my_teacher_model,
    ...     criterion=nn.CrossEntropyLoss(),
    ...     distill_type='hard', task_loss_weight=0.5,
    ... )
student_modelteacher_model	criterionteacher_pretrained_pathdistill_typedistill_loss_weighttask_loss_weighttemperaturer   r   verbosec           	      4  > [         TU ]  XUS9  [        U5      n[        US5      (       d#  [	        SUR
                  R                   S35      eUR                  S5        [        U[        5      (       a  UnO[        U[        5      (       d  [        U[        R                  5      (       a;  UR                  nUR                  n[        UUUUU R                  U R                   S9nO![#        S[%        U5      R                   35      eXl        Xl        Ub  UO[        R*                  " 5       U l        XPl        Xl        US;  a  [	        S	U S
35      e[2        R4                  " UR6                  S   U R                  U R                   S9R9                  SSSS5      n[2        R4                  " UR6                  S   U R                  U R                   S9R9                  SSSS5      nU R;                  SUSS9  U R;                  SUSS9  UbI  X`l        Ub  UOSU l        U R@                  (       a%  [B        RE                  SU R>                   SU 35        OUbG  Xpl        SU-
  U l        U R@                  (       a%  [B        RE                  SU SU R<                   35        ONSU l        SU l        U R@                  (       a/  [B        RE                  SU R>                   SU R<                   35        U R@                  (       a  [B        RE                  SU SU 35        g g )N)r   r   r_   set_distilled_trainingzModel z does not have 'set_distilled_training' method. TokenDistillationTask requires a model with a distillation head (e.g., DeiT distilled variants).T)r   r   r   r   r   r   zWteacher_model must be a model name string, nn.Module, or TokenDistillationTeacher, got )softhardzUnsupported distill_type 'z'. Must be 'soft' or 'hard'.r   r   r   r   r   rD   Fr   rE         ?z9TokenDistillationTask: Independent weights - task_weight=z, distill_weight=z8TokenDistillationTask: Complementary mode - task_weight=z;TokenDistillationTask: Default equal weights - task_weight=z$TokenDistillationTask: distill_type=z, temperature=)#r#   r$   r   r1   
ValueErrorr:   r.   ra   r%   r   r&   r*   r+   r   r   r   r   r,   r-   studentteacherCrossEntropyLossrY   r[   r^   r3   r4   r   r5   r6   r\   r]   r_   r'   r(   )r7   rW   rX   rY   rZ   r[   r\   r]   r^   r   r   r_   student_unwrappedrg   r   r   rD   rE   r:   s                     r;   r$   TokenDistillationTask.__init__   s    	WE )7(*BCC*44==> ?s s  	006 m%=>>#Gs++z-/S/S+77K(11H.%2'! 7{{jjG M*3346 
 %&/&;ATATAV(&//9,Gcdee ||,,V4;;**
 $q"a
	 	
 ll,,U3;;**
 $q"a
	 	
 	^\eL]KEJ *':$8H8T$4Z]D!||##'#8#8"99JK^J_a )$4!'*-='=D$||##3"44EdF^F^E_a (+D$$'D!||##'#8#8"99J4KcKcJdf
 <<LL6|nNS^R_` r=   
device_idsr?   c                     SSK Jn  U R                  R                  5        H
  nSUl        M     U" U R
                  4SU0UD6U l        U $ )aL  Prepare task for distributed training.

Wraps the student model in DistributedDataParallel (DDP) while leaving
the frozen teacher model unwrapped.

Args:
    device_ids: List of device IDs for DDP (e.g., [local_rank])
    **ddp_kwargs: Additional arguments passed to DistributedDataParallel

Returns:
    self (for method chaining)
r   )DistributedDataParallelFrk   )torch.nn.parallelrm   rg   
parametersrequires_gradrf   )r7   rk   
ddp_kwargsDDPparams        r;   prepare_distributed)TokenDistillationTask.prepare_distributed  sH    " 	E\\,,.E"'E / 4<<MJM*Mr=   r>   targetc                    U R                  U5      nUu  pEU R                  XB5      n[        R                  " 5          U R                  R                  XR                  U R                  5      nU R	                  UR                  5       5      nSSS5        U R                  S:X  aj  [        R                  " XPR                  -  SS9n	[        R                  " WU R                  -  SS9n
[        R                  " XSSS9U R                  S-  -  nO%WR                  SS9n[        R                  " X\5      nU R                   U-  U R"                  U-  -   nUUUUS	.$ ! , (       d  f       N= f)
ar  Forward pass with token distillation.

Args:
    input: Input tensor [B, C, H, W]
    target: Target labels [B]

Returns:
    Dictionary containing:
        - 'loss': Combined training loss (task + distillation)
        - 'output': Main head logits (for metrics)
        - 'task_loss': Classification loss component
        - 'distill_loss': Distillation loss component
Nrb   r   )dim	batchmeanT)	reduction
log_target   )lossoutput	task_lossdistill_loss)rf   rY   r3   no_gradrg   rH   rD   rE   detachr[   Flog_softmaxr^   kl_divargmaxcross_entropyr]   r\   )r7   r>   rv   student_outputmain_logitsdist_logitsr   input_kdteacher_logitsprob_sprob_tr   teacher_hard
total_losss                 r;   rB   TokenDistillationTask.forward+  s<   & e,#1  NN;7	 ]]_||33E;L;LdN^N^_H!\\(//*;<N 
 &]];1A1A#ArJF]]>D4D4D#D"MF88FkVZ[_c_o_ost_tuL)00R08L??;EL**Y69Q9QT`9``
 !"(	
 	
 _s   AE
E')rY   r\   r[   rf   r]   rg   r^   )	NNrb   NNrd   NNT)N)r.   rK   rL   rM   rN   r*   r+   r   r&   r   r   floatr3   r   r   boolr$   listrt   rP   r   rB   rQ   rR   rS   s   @r;   rU   rU   {   sB   +b .259 &3704!$-1+/ g99g !bii1I!IJg  		*	g
 &.c]g g "*%g 'uog g U\\*g EKK(g g gV *.  
!	2.
<<.
 LL.
 
c5<<	 	.
 .
r=   rU   )rN   loggingtypingr   r   r   r3   torch.nnr*   torch.nn.functional
functionalr   timm.modelsr   
timm.utilsr   taskr	   	getLoggerr.   r'   r+   r   rU   r"   r=   r;   <module>r      sY    P  ( (     $ # 


H
%gQryy gQT^
L ^
r=   