
    RЦi_                        S r SSKrSSKJrJrJrJr  SSKrSSKJ	r	  SSK
J	s  Jr  SSKJr  SSKJr  SSKJr  \R&                  " \5      r " S S	\	R,                  5      rS
\\\	R,                  \4   S\	R,                  S\\   S\\R2                     S\\R4                     S\4S jr " S S\5      r " S S\	R,                  5      r " S S\5      rg)z5Knowledge distillation training tasks and components.    N)DictOptionalTupleUnion)create_model)unwrap_model   )TrainingTaskc                   v  ^  \ rS rSrSr     SS\\\R                  4   S\	\
   S\
S\	\   S\	\R                     S\	\R                     4U 4S	 jjjr SS
\R                  S\S\R                  4S jjr  SS
\R                  S\	\R                     S\	\R                     S\R                  4S jjrSrU =r$ )DistillationTeacher   a1  Wrapper for a teacher model used in knowledge distillation.

Creates and manages a pre-trained teacher model for knowledge distillation,
handling model creation and normalization differences between teacher and student.

Can be created from:
- A model name string (creates the model internally with pretrained weights)
- An existing nn.Module (wraps it with the necessary interface)

Args:
    model_name_or_module: Either a model name string or an nn.Module
    num_classes: Number of output classes (required if model_name_or_module is a string)
    in_chans: Number of input channels (used if model_name_or_module is a string)
    pretrained_path: Optional path to pretrained weights (used if model_name_or_module is a string)
    device: Device to place the model on
    dtype: Model dtype (uses float32 if None)
model_name_or_modulenum_classesin_chanspretrained_pathdevicedtypec           	        > [         TU ]  5         [        U[        5      (       aC  [        R                  SU S35        SS0nU(       a  [        UUS9US'   [        SUUUUUS.UD6nOC[        U[        R                  5      (       a  UnO![        S[        U5      R                   35      eUR                  5         Xl        [        U5      n	[!        U	S	5      (       a9  U	R"                  R%                  S
S5      n
U	R"                  R%                  SS5      nOSn
Sn[&        R(                  " XUS9R+                  SSSS5      n[&        R(                  " XUS9R+                  SSSS5      nU R-                  SUSS9  U R-                  SUSS9  g )NzCreating KD teacher model: ''
pretrainedT)filer   pretrained_cfg_overlay)
model_namer   r   r   r   z8model_name_or_module must be a string or nn.Module, got pretrained_cfgmean)g
ףp=
?gv/?gCl?std)gZd;O?gy&1?g?r   r   r	   mean_kdF
persistentstd_kd )super__init__
isinstancestr_loggerinfodictr   nnModule	TypeErrortype__name__evalmodelr   hasattrr   gettorchtensorviewregister_buffer)selfr   r   r   r   r   r   pretrained_kwargsr1   model_unwrappedr   r   r   r"   	__class__s                 U/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/task/distillation.pyr%   DistillationTeacher.__init__%   s    	*C00LL78L7MQOP!-t 4>B( +?!":;
 ! /'! $E ,bii88(EJ4PdKeKnKnJop  	


 'u-?$455"1155f>STD!0044U<QRC(D'C,,t%@EEaQPQRc>CCAr1aPYEBXv%@    inputreturn_featuresreturnc                 b   U(       a  [        U R                  S5      (       a  [        U R                  S5      (       d-  [        SU R                  R                  R                   S35      eU R                  R                  U5      nU R                  R                  USS9$ U R                  U5      $ )a  Forward pass through teacher model.

Args:
    input: Input tensor (should already be normalized for teacher)
    return_features: Whether to return pooled pre-logits features instead of logits

Returns:
    Logits or pooled pre-logits features depending on return_features flag
forward_featuresforward_headzModel zi does not support feature extraction. Ensure the model has 'forward_features' and 'forward_head' methods.T
pre_logits)r2   r1   
ValueErrorr;   r/   rC   rD   )r8   r?   r@   feature_maps       r<   forwardDistillationTeacher.forwardZ   s     4::'9::'$**VdBeBe TZZ11::; <Z Z  **55e<K::**;4*HH::e$$r>   student_meanstudent_stdc                     Ub  Uc  U$ [         R                  " X R                  5      (       a'  [         R                  " X0R                  5      (       a  U$ X-  U-   U R                  -
  U R                  -  $ )a7  Normalize input to match teacher's expected normalization.

Args:
    input: Input tensor (already normalized for student)
    student_mean: Student normalization mean buffer [1, 3, 1, 1]
    student_std: Student normalization std buffer [1, 3, 1, 1]

Returns:
    Input tensor normalized for the teacher model
)r4   equalr   r"   )r8   r?   rK   rL   s       r<   normalize_input#DistillationTeacher.normalize_inputs   s`      ;#6L;;|\\22u{{;P[P[7\7\L#l2T\\AT[[PPr>   r1   )N   NNN)F)NN)r/   
__module____qualname____firstlineno____doc__r   r'   r+   r,   r   intr4   r   r   r%   TensorboolrI   rO   __static_attributes____classcell__r;   s   @r<   r   r      s   * *.-1-1+/3A"'RYY"73A "#3A 	3A
 &c]3A U\\*3A EKK(3A 3Ap %*%<<% "% 
	%8 4826	Q<<Q #5<<0Q "%,,/	Q
 
Q Qr>   r   teacherstudent_modelr   r   r   rA   c           	          [        U [        5      (       a  U $ [        U5      nUR                  nUR                  n[        U UUUUUS9$ )an  Resolve teacher input to a DistillationTeacher instance.

Args:
    teacher: Model name string, nn.Module, or DistillationTeacher
    student_model: Student model to infer num_classes/in_chans from
    pretrained_path: Optional path to teacher pretrained weights
    device: Device for teacher
    dtype: Dtype for teacher

Returns:
    DistillationTeacher instance
)r   r   r   r   r   r   )r&   r   r   r   r   )r]   r^   r   r   r   student_unwrappedr   r   s           r<   _resolve_teacherra      sY    & '.// %]3#//K ))H$' r>   c                     ^  \ rS rSrSr         SS\R                  S\\\R                  \	4   S\
\R                     S\
\   S\S\
\   S	\
\   S
\S\
\R                     S\
\R                     S\4U 4S jjjr SS\
\   SS 4S jjrS\R&                  S\R&                  S\\\R&                  4   4S jrSrU =r$ )LogitDistillationTask   a  Logit-based knowledge distillation task.

Performs distillation by matching student and teacher output logits using
KL divergence with temperature scaling.

Loss weighting supports two modes:
1. Independent weights: loss = task_loss_weight * task_loss + distill_loss_weight * distill_loss
2. Complementary mode: loss = task_loss_weight * task_loss + (1 - task_loss_weight) * distill_loss
   (used when only task_loss_weight is specified)

Args:
    student_model: Student model to train
    teacher_model: Teacher model - can be a model name string, nn.Module, or DistillationTeacher
    criterion: Task loss function (default: CrossEntropyLoss)
    teacher_pretrained_path: Path to teacher pretrained weights (used when teacher_model is a string)
    loss_type: Type of distillation loss (currently only 'kl' supported)
    distill_loss_weight: Weight for distillation loss
    task_loss_weight: Weight for task loss
    temperature: Softmax temperature for distillation (typical values: 1-4)
    device: Device for task tensors/buffers
    dtype: Dtype for task tensors/buffers
    verbose: Enable info logging

Example:
    >>> # With model name string (num_classes/in_chans inferred from student)
    >>> task = LogitDistillationTask(
    ...     student_model=model, teacher_model='resnet50',
    ...     criterion=nn.CrossEntropyLoss(),
    ...     task_loss_weight=0.3, temperature=4.0,
    ...     device=torch.device('cuda'),
    ... )
    >>> # With raw model
    >>> task = LogitDistillationTask(
    ...     student_model=model, teacher_model=my_teacher_model,
    ...     criterion=nn.CrossEntropyLoss(),
    ...     task_loss_weight=0.3, temperature=4.0,
    ... )
r^   teacher_model	criterionteacher_pretrained_path	loss_typedistill_loss_weighttask_loss_weighttemperaturer   r   verbosec                   > [         TU ]  XUS9  [        UUUU R                  U R                  5      nXl        Xl        Ub  UO[        R                  " 5       U l	        XPl
        Xl        US:w  a  [        SU S35      e[        U5      n[        R                  " UR                   S   U R                  U R                  S9R#                  SSSS5      n[        R                  " UR                   S	   U R                  U R                  S9R#                  SSSS5      nU R%                  S
USS9  U R%                  SUSS9  UbI  X`l        Ub  UOSU l        U R*                  (       a%  [,        R/                  SU R(                   SU 35        OUbG  Xpl        SU-
  U l        U R*                  (       a%  [,        R/                  SU SU R&                   35        ONSU l        SU l        U R*                  (       a/  [,        R/                  SU R(                   SU R&                   35        U R*                  (       a  [,        R/                  SU SU 35        g g )Nr   r   rl   klzUnsupported loss_type 'z$'. Currently only 'kl' is supported.r   r   r	   r   r   rK   Fr    rL         ?z9LogitDistillationTask: Independent weights - task_weight=, distill_weight=z8LogitDistillationTask: Complementary mode - task_weight=z;LogitDistillationTask: Default equal weights - task_weight=z!LogitDistillationTask: loss_type=z, temperature=)r$   r%   ra   r   r   studentr]   r+   CrossEntropyLossrf   rh   rk   rG   r   r4   r5   r   r6   r7   ri   rj   rl   r(   r)   )r8   r^   re   rf   rg   rh   ri   rj   rk   r   r   rl   r]   r`   rK   rL   r;   s                   r<   r%   LogitDistillationTask.__init__   sU    	WE ##KKJJ
 %&/&;ATATAV"&6ykAefgg )7||,,V4;;**
 $q"a
	 	
 ll,,U3;;**
 $q"a
	 	
 	^\eL]KEJ *':$8H8T$4Z]D!||##'#8#8"99JK^J_a )$4!'*-='=D$||##3"44EdF^F^E_a (+D$$'D!||##'#8#8"99J4KcKcJdf
 <<LL3I;n[MZ r>   
device_idsrA   c                     SSK Jn  U R                  R                  5        H
  nSUl        M     U" U R
                  4SU0UD6U l        U $ )aL  Prepare task for distributed training.

Wraps the student model in DistributedDataParallel (DDP) while leaving
the frozen teacher model unwrapped.

Args:
    device_ids: List of device IDs for DDP (e.g., [local_rank])
    **ddp_kwargs: Additional arguments passed to DistributedDataParallel

Returns:
    self (for method chaining)
r   DistributedDataParallelFru   )torch.nn.parallelrx   r]   
parametersrequires_gradrr   r8   ru   
ddp_kwargsDDPparams        r<   prepare_distributed)LogitDistillationTask.prepare_distributed*  sH    " 	E\\,,.E"'E / 4<<MJM*Mr>   r?   targetc                 ^   U R                  U5      nU R                  X25      n[        R                  " 5          U R                  R                  XR                  U R                  5      nU R	                  UR                  5       SS9nSSS5        [        R                  " X0R                  -  SS9n[        R                  " WU R                  -  SS9n[        R                  " XxSSS9U R                  S	-  -  n	U R                  U-  U R                  U	-  -   n
U
UUU	S
.$ ! , (       d  f       N= f)aq  Forward pass with logit distillation.

Args:
    input: Input tensor [B, C, H, W]
    target: Target labels [B]

Returns:
    Dictionary containing:
        - 'loss': Combined training loss (task + distillation)
        - 'output': Student logits (for metrics)
        - 'task_loss': Classification loss component
        - 'kd_loss': Logit distillation loss component
Fr@   Nr   )dim	batchmeanT)	reduction
log_target   lossoutput	task_losskd_loss)rr   rf   r4   no_gradr]   rO   rK   rL   detachFlog_softmaxrk   kl_divrj   ri   )r8   r?   r   student_logitsr   input_kdteacher_logitsprob_sprob_tr   
total_losss              r<   rI   LogitDistillationTask.forwardC  s   $ e,NN>:	]]_||33E;L;LdN^N^_H!\\(//*;U\SN  ~0@0@@bI~0@0@@bI((6[TRVZVfVfjkVkl**Y69Q9QT[9[[
 $"	
 	
 _s   AD
D,)rf   ri   rh   rr   rj   r]   rk   )	NNro   NNrp   NNTN)r/   rS   rT   rU   rV   r+   r,   r   r'   r   r   floatr4   r   r   rY   r%   listr   rX   r   rI   rZ   r[   r\   s   @r<   rc   rc      sB   %V .259!3704!$-1+/ Q99Q !bii1D!DEQ  		*	Q
 &.c]Q Q "*%Q 'uoQ Q U\\*Q EKK(Q Q Qj *.  
!	2$
<<$
 LL$
 
c5<<	 	$
 $
r>   rc   c                      ^  \ rS rSrSr S
S\R                  S\\R                     4U 4S jjjrS\	R                  S\\	R                  \	R                  4   4S jrS	rU =r$ )"FeatureDistillationTrainableModuleij  a  Trainable module for feature distillation.

Wraps student model and projection layer into a single module where all
trainable forward operations happen inside forward(). This ensures proper
DDP wrapping when the module is used with DistributedDataParallel.
r^   
projectionc                 :   > [         TU ]  5         Xl        X l        g)zCreate trainable module wrapper for feature distillation.

Args:
    student_model: Student model to train
    projection: Optional projection layer (Linear layer or None)
N)r$   r%   rr   r   )r8   r^   r   r;   s      r<   r%   +FeatureDistillationTrainableModule.__init__r  s     	$$r>   r?   rA   c                     U R                   R                  U5      nU R                   R                  U5      nU R                   R                  USS9nU R                  b  U R                  U5      nX44$ )zForward pass through student and projection.

Args:
    input: Input tensor [B, C, H, W]

Returns:
    Tuple of (student_logits, student_features) where features are
    optionally projected to match teacher dimension.
TrE   )rr   rC   rD   r   )r8   r?   rH   r   student_featuress        r<   rI   *FeatureDistillationTrainableModule.forward  si     ll33E:22;?<<44[T4R??&#/?@//r>   )r   rr   r   )r/   rS   rT   rU   rV   r+   r,   r   r%   r4   rX   r   rI   rZ   r[   r\   s   @r<   r   r   j  sd     /3%99% !+% %0U\\ 0eELL%,,4N.O 0 0r>   r   c                     ^  \ rS rSrSr         SS\R                  S\\\R                  \	4   S\
\R                     S\
\   S\
\   S\
\   S	\
\   S
\
\   S\
\R                     S\
\R                     S\4U 4S jjjr\S\R                  S\4S j5       r SS\
\   SS 4S jjrS\R,                  S\R,                  S\\\R,                  4   4S jrSrU =r$ )FeatureDistillationTaski  a  Feature-based knowledge distillation task.

Performs distillation by matching student and teacher intermediate features
(pooled pre-logits) using MSE loss. Automatically creates a projection layer
if student and teacher feature dimensions differ.

Loss weighting supports two modes:
1. Independent weights: loss = task_loss_weight * task_loss + distill_loss_weight * distill_loss
2. Complementary mode: loss = task_loss_weight * task_loss + (1 - task_loss_weight) * distill_loss
   (used when only task_loss_weight is specified)

Args:
    student_model: Student model to train
    teacher_model: Teacher model - can be a model name string, nn.Module, or DistillationTeacher
    criterion: Task loss function (default: CrossEntropyLoss)
    teacher_pretrained_path: Path to teacher pretrained weights (used when teacher_model is a string)
    distill_loss_weight: Weight for distillation loss
    task_loss_weight: Weight for task loss
    student_feature_dim: Student pre-logits dimension (auto-detected if None)
    teacher_feature_dim: Teacher pre-logits dimension (auto-detected if None)
    device: Device for task tensors/buffers
    dtype: Dtype for task tensors/buffers
    verbose: Enable info logging

Example:
    >>> # With model name string (num_classes/in_chans inferred from student)
    >>> task = FeatureDistillationTask(
    ...     student_model=model, teacher_model='resnet50',
    ...     criterion=nn.CrossEntropyLoss(),
    ...     distill_loss_weight=5.0, task_loss_weight=1.0,
    ...     device=torch.device('cuda'),
    ... )
r^   re   rf   rg   ri   rj   student_feature_dimteacher_feature_dimr   r   rl   c                   > [         TU ]  XUS9  [        UUUU R                  U R                  5      nXl        Ub  UO[        R                  " 5       U l        UbI  XPl	        Ub  UOSU l
        U R                  (       a%  [        R                  SU R                   SU 35        OUbG  X`l
        SU-
  U l	        U R                  (       a%  [        R                  SU SU R                   35        ONSU l	        SU l
        U R                  (       a/  [        R                  SU R                   SU R                   35        Uc  U R                  U5      nUc  U R                  UR                  5      nS nXx:w  aW  U R                  (       a  [        R                  SU SU 35        [        R                   " XxU R                  U R                  S	9nO&U R                  (       a  [        R                  S
5        [#        X5      U l        ['        U5      n[(        R*                  " UR,                  S   U R                  U R                  S	9R/                  SSSS5      n[(        R*                  " UR,                  S   U R                  U R                  S	9R/                  SSSS5      nU R1                  SUSS9  U R1                  SUSS9  U R                  (       a  [        R                  SU SU 35        g g )Nrn   rp   z;FeatureDistillationTask: Independent weights - task_weight=rq   z:FeatureDistillationTask: Complementary mode - task_weight=z=FeatureDistillationTask: Default equal weights - task_weight=zCreating projection layer: z -> r   z.Feature dimensions match, no projection neededr   r	   r   r   rK   Fr    rL   z%FeatureDistillationTask: student_dim=z, teacher_dim=)r$   r%   ra   r   r   r]   r+   rs   rf   ri   rj   rl   r(   r)   _detect_feature_dimr1   Linearr   trainable_moduler   r4   r5   r   r6   r7   )r8   r^   re   rf   rg   ri   rj   r   r   r   r   rl   r]   r   r`   rK   rL   r;   s                    r<   r%    FeatureDistillationTask.__init__  s    	WE ##KKJJ
 &/&;ATATAV *':$8H8T$4Z]D!||##'#8#8"99JK^J_a )$4!'*-='=D$||##3"44EdF^F^E_a (+D$$'D!||##'#8#8"99J4KcKcJdf &"&":":="I&"&":":7=="I 
5||12E1FdK^J_` #6TXT_T_gkgqgqrJ||MN B= ] )7||,,V4;;**
 $q"a
	 	
 ll,,U3;;**
 $q"a
	 	
 	^\eL]KEJ<<LL23>BUAVX r>   r1   rA   c                     [        U 5      n [        U S5      (       a  U R                  $ [        U S5      (       a  U R                  $ [	        S5      e)z)Auto-detect feature dimension from model.head_hidden_sizenum_featureszCannot auto-detect feature dimension. Model must have 'head_hidden_size' or 'num_features' attribute, or you must specify student_feature_dim and teacher_feature_dim explicitly.)r   r2   r   r   rG   rQ   s    r<   r   +FeatureDistillationTask._detect_feature_dim  sT     U#5,--)))UN++%%%R r>   ru   c                     SSK Jn  U R                  R                  5        H
  nSUl        M     U" U R
                  4SU0UD6U l        U $ )af  Prepare task for distributed training.

Wraps the trainable module (student + projection) in DistributedDataParallel
(DDP) while leaving the frozen teacher model unwrapped.

Args:
    device_ids: List of device IDs for DDP (e.g., [local_rank])
    **ddp_kwargs: Additional arguments passed to DistributedDataParallel

Returns:
    self (for method chaining)
r   rw   Fru   )ry   rx   r]   rz   r{   r   r|   s        r<   r   +FeatureDistillationTask.prepare_distributed*  sL    " 	E\\,,.E"'E / !$D$9$9 _j _T^ _r>   r?   r   c                    U R                  U5      u  p4U R                  X25      n[        R                  " 5          U R                  R                  XR                  U R                  5      nU R	                  UR                  5       SS9nSSS5        [        R                  " UW5      nU R                  U-  U R                  U-  -   n	U	UUUS.$ ! , (       d  f       NK= f)au  Forward pass with feature distillation.

Args:
    input: Input tensor [B, C, H, W]
    target: Target labels [B]

Returns:
    Dictionary containing:
        - 'loss': Combined training loss (task + distillation)
        - 'output': Student logits (for metrics)
        - 'task_loss': Classification loss component
        - 'kd_loss': Feature distillation loss component
Tr   Nr   )r   rf   r4   r   r]   rO   rK   rL   r   r   mse_lossrj   ri   )
r8   r?   r   r   r   r   r   teacher_featuresr   r   s
             r<   rI   FeatureDistillationTask.forwardC  s    $ ,0+@+@+G(NN>:	]]_||33E;L;LdN^N^_H#||HOO,=t|T  **-/?@**Y69Q9QT[9[[
 $"	
 	
 _s   AC
C)rf   ri   rj   r]   r   )	NNNNNNNNTr   )r/   rS   rT   rU   rV   r+   r,   r   r'   r   r   r   rW   r4   r   r   rY   r%   staticmethodr   r   r   rX   r   rI   rZ   r[   r\   s   @r<   r   r     sp    L .25937041515-1+/ `99` !bii1D!DE`  		*	`
 &.c]` "*%` 'uo` "*#` "*#` U\\*` EKK(` ` `D 299   " *.  
#	2!
<<!
 LL!
 
c5<<	 	!
 !
r>   r   )rV   loggingtypingr   r   r   r   r4   torch.nnr+   torch.nn.functional
functionalr   timm.modelsr   
timm.utilsr   taskr
   	getLoggerr/   r(   r,   r   r'   r   r   ra   rc   r   r   r#   r>   r<   <module>r      s    ;  / /     $ #  

H
%uQ")) uQp"sBII'::;"yy" "#" &	"
 $" "Jx
L x
v(0 (0VO
l O
r>   