
    <Цi)n                     2   % S SK r S SKJrJrJr  SSKJr  S SKJrJ	r	J
r
JrJrJrJr  S SKJr  S SKJr  S SKJr  S S	KJr  S SKrS
S/r\ R2                  R4                  rS r0 r\
\	\	4   \S'   S rS5S jr\" \R@                  5      SS.S\!4S jj5       r"\" \RF                  5      S6S\!4S jj5       r$\" \RJ                  5      S6S\!4S jj5       r&\" \RN                  5      S6S\!4S jj5       r( S5S\\!   S\\!   S\\!   S\)S\!4
S jjr*\" \RV                  \RX                  /5      SS.S\!4S jj5       r-\" \R\                  5      S\!4S j5       r/S r0\" \Rb                  \Rd                  \Rf                  /5      SS.S\!4S jj5       r4S r5SS .S\\\\!S!4   \\!S!4   \\!S!4   \\\!S!4      4      4S" jjr6SS .S\\\\!S!4   \\!S!4   \\!S!4   \\\!S!4      4      4S# jjr7\" \Rp                  S$S%9SS.S\!4S& jj5       r9\" \Rt                  S$S%9S\!4S' j5       r;S( r<\" \Rz                  \R|                  \R~                  /5      SS.S\!4S) jj5       r@\" \R                  S$S%9S\!4S* j5       rB\" \R                  S$S%9S\!4S+ j5       rD0 \R@                  \"_\RF                  \$_\RJ                  \&_\RN                  \(_\RV                  \-_\RX                  \-_\R\                  \/_\Rb                  \4_\Rd                  \4_\Rf                  \4_\Rz                  \@_\R|                  \@_\R~                  \@_\Rp                  \9_\Rt                  \;_\R                  \B_\R                  \D_rS, rE/ S-QrFS. rGS/ rHS0 rIS1 rJ " S2 S
5      rK " S3 S4\5      rLg)7    N)tree_maptree_flattentree_unflatten   )ModuleTracker)ListAnyDictOptionalUnionTupleIterator)defaultdict)TorchDispatchModeprodwrapsFlopCounterModeregister_flop_formulac                 \    [        U [        R                  5      (       a  U R                  $ U $ N)
isinstancetorchTensorshape)is    W/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torch/utils/flop_counter.py	get_shaper      s!    !U\\""wwH    flop_registryc                 8   ^  [        T 5      S S.U 4S jj5       nU$ )N)out_valc                 B   > [        [        XU 45      u  pnT" USU0UD6$ )N	out_shape)r   r   )r#   argskwargsr%   fs       r   nfshape_wrapper.<locals>.nf   s.    "*9tW6M"Ni$6)6v66r    r   r(   r)   s   ` r   shape_wrapperr,      s#    
1X 7 7 Ir    c                    ^ ^ UU 4S jnU$ )Nc                    >^  T(       d  [        T 5      m U 4S jn[        R                  R                  R	                  UT5        T $ )Nc                    > [        U [        R                  R                  5      (       d  [	        SU  S[        U 5       35      eU [        ;   a  [        SU  35      eT[        U '   g )Nzlregister_flop_formula(targets): expected each target to be OpOverloadPacket (i.e. torch.ops.mylib.foo), got z which is of type zduplicate registrations for )r   r   _opsOpOverloadPacket
ValueErrortyper!   RuntimeError)targetflop_formulas    r   register=register_flop_formula.<locals>.register_fun.<locals>.register&   sl    fejj&A&ABB Hh0f@A A &"%A&#JKK$0M&!r    )r,   r   utils_pytree	tree_map_)r6   r7   get_rawtargetss   ` r   register_fun+register_flop_formula.<locals>.register_fun"   s7    (6L	1 	%%h8r     )r=   r<   r>   s   `` r   r   r   !   s    & r    )r%   returnc                4    U u  pVUu  pxXg:X  d   eXX-  S-  U-  $ )zCount flops for matmul.   r@   )	a_shapeb_shaper%   r&   r'   mkk2ns	            r   mm_floprJ   7   s+    
 DAEB7N7519q=r    c                     [        X5      $ )zCount flops for addmm.)rJ   
self_shaperD   rE   r%   r'   s        r   
addmm_floprN   B   s     7$$r    c                 P    U u  pEnUu  pxn	XG:X  d   eXh:X  d   eXE-  U	-  S-  U-  n
U
$ )z"Count flops for the bmm operation.rC   r@   )rD   rE   r%   r'   brF   rG   b2rH   rI   flops              r   bmm_floprS   G   sA    
 GA!IBA7N77N7519q=1DKr    c                     [        X5      $ )z&Count flops for the baddbmm operation.rS   rL   s        r   baddbmm_floprV   T   s    
 G%%r    x_shapew_shaper%   
transposedc                 |    U S   nU(       a  U OUSS nUtpgn [        U5      [        U5      -  U-  U-  U-  S-  n	U	$ )a  Count flops for convolution.

Note only multiplication is
counted. Computation for bias are ignored.
Flops for a transposed convolution are calculated as
flops = (x_shape[2:] * prod(w_shape) * batch_size).
Args:
    x_shape (list(int)): The input shape before convolution.
    w_shape (list(int)): The filter shape.
    out_shape (list(int)): The output shape after convolution.
    transposed (bool): is the convolution transposed
Returns:
    int: the number of flops
r   rC   Nr   )
rW   rX   r%   rY   
batch_size
conv_shapec_outc_infilter_sizerR   s
             r   conv_flop_countr`   \   s[    * J''Y;J 'E+ 
d;//*<uDtKaODKr    c                    [        XXvS9$ )zCount flops for convolution.rY   )r`   )
rW   rX   _bias_stride_padding	_dilationrY   r%   r&   r'   s
             r   	conv_floprg      s     7YNNr    c                 0   S nSn U
S   (       a"  [        US   5      nU[        XX(       + 5      -  nU
S   (       aY  [        US   5      nU(       a#  U[        U" U 5      U" U5      U" U5      SS9-  nU$ U[        U" U5      U" U 5      U" U5      SS9-  nU$ )Nc                 4    U S   U S   /[        U SS  5      -   $ )Nr   r   rC   )list)r   s    r   tconv_backward_flop.<locals>.t   s$    a%(#d59o55r    r   r   Frb   )r   r`   )grad_out_shaperW   rX   rc   rd   re   rf   rY   _output_padding_groupsoutput_maskr%   rk   
flop_countgrad_input_shapegrad_weight_shapes                   r   conv_backward_floprt      s    6JDL 1~$Yq\2on?OQ_``
1~%il3/!N*;QwZK\I]joppJ
  /!G*a6GK\I]joppJr    c                     U u  p4pVUu  pxpUu  ppX7s=:X  a  U:X  a#  O   eXHs=:X  a  U:X  a  O   eXj:X  a
  X:X  a  Xj:X  d   eSnU[        X4-  XV4X4-  Xi45      -  nU[        X4-  XY4X4-  X45      -  nU$ )zR
Count flops for self-attention.

NB: We can assume that value_shape == key_shape
r   rU   )query_shape	key_shapevalue_shaperP   hs_qd_q_b2_h2s_k_d2_b3_h3_s3d_vtotal_flopss                   r   sdpa_flop_countr      s     !NA#"Cc$Cc?s?[[q3[[3:#*QTQ[[[K8QUC-s/@AAK8QUC-s/@AAKr    c                    [        XU5      $ )Count flops for self-attention.r   )rv   rw   rx   r%   r&   r'   s         r   	sdpa_flopr     s     ;;??r    c                     SSK Jn  SSKJn  [	        XU45      (       d  U R                  5       R                  5       $ U/U R                  S5      S-
  -  $ )z
If the offsets tensor is fake, then we don't know the actual lengths.
In that case, we can just assume the worst case; each batch has max length.
r   )
FakeTensor)FunctionalTensorr   )torch._subclasses.fake_tensorr   #torch._subclasses.functional_tensorr   r   difftolistsize)offsetsmax_lenr   r   s       r   _offsets_to_lengthsr     sJ    
 9Dg,<=>>||~$$&&9Q!+,,r    )grad_out.c              #   h  #    Ub  [        UR                  5      S:X  d   e[        UR                  5      S:X  d   eUb  UR                  U R                  :X  d   eU R                  u  pn
UR                  u  pnUR                  u  pnUc   eUc   eUR                  UR                  :X  d   e[        XF5      n[        XW5      n[        UU5       H'  u  nnSU	UU
4nSUUU4nSUUU4nUb  UOSnUUUU4v   M)     gU R                  UR                  UR                  Ub  UR                  OS4v   g7f)a'  
Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
each batch element.

In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
N   r   lenr   r   zip)querykeyvaluer   	cum_seq_q	cum_seq_kmax_qmax_k_h_qr{   h_kd_kh_vr   seq_q_lengthsseq_k_lengths	seq_q_len	seq_k_lennew_query_shapenew_key_shapenew_value_shapenew_grad_out_shapes                          r   %_unpack_flash_attention_nested_shapesr     sJ    $  399~"""5;;1$$$8>>U[[#@@@kkiikk$$$$$$)//111+I=+I=&)-&G"Y	 #y#6OY4M #y#6O4<4Hd!=/CUUU 'H 	
++syy%++AUx~~[_
__s   D0D2c              #   n  #    Ub  [        UR                  5      S:X  d   e[        UR                  5      S:X  d   eUb  UR                  U R                  :X  d   eU R                  u    pn
UR                  u    pnUR                  u    pnUc   eUc   eUR                  UR                  :X  d   e[        XF5      n[        XW5      n[        UU5       H'  u  nnSU	UU
4nSUUU4nSUUU4nUb  UOSnUUUU4v   M)     gU R                  UR                  UR                  Ub  UR                  OS4v   g7f)a+  
Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
each batch element.

In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
N   r   r   )r   r   r   r   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr   r   r{   r   r   r   r   	seqlens_q	seqlens_klen_qlen_kr   r   r   r   s                          r   )_unpack_efficient_attention_nested_shapesr   F  sR    $  399~"""5;;1$$$8>>U[[#@@@131313''''''!!\%7%7777'C	'C		95LE5 #uc2OUC0M #uc2O4<4Hd!=/CUUU 6 	
++syy%++AUx~~[_
__s   D3D5T)r<   c          
      D    [        U UUUUUUS9n
[        S U
 5       5      $ )r   )r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XU5      v   M     g 7fr   r   .0rv   rw   rx   r   s        r   	<genexpr>0_flash_attention_forward_flop.<locals>.<genexpr>  &      6;2KK 	<<6;   r   sum)r   r   r   r   r   r   r   r%   r&   r'   sizess              r   _flash_attention_forward_flopr   v  s?    " 2E  6;  r    c           
      D    [        U UUUUUUS9n
[        S U
 5       5      $ )r   )r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XU5      v   M     g 7fr   r   r   s        r   r   4_efficient_attention_forward_flop.<locals>.<genexpr>  r   r   r   r   )r   r   r   biasr   r   r   r   r&   r'   r   s              r   !_efficient_attention_forward_flopr     s?    " 6!!!!E  6;  r    c                    SnUu  pVpxUu  ppUu  pnnU u  nnnnXYs=:X  a  Us=:X  a  U:X  a   O   eXjs=:X  a  Us=:X  a  U:X  a	  O   eX:X  d   eUU:X  a  X:X  a  UU:X  d   eSnU[        XV-  Xx4XV-  X45      -  nU[        XV-  UU4XV-  UU45      -  nU[        XV-  X4XV-  UU45      -  nU[        XV-  X{4XV-  X45      -  nU[        XV-  X4XV-  X{45      -  nU$ )Nr   rU   )rm   rv   rw   rx   r   rP   ry   rz   r{   r|   r}   r~   r   r   r   r   r   _b4_h4_s4_d4s                        r   sdpa_backward_flop_countr     s2   K NA#"Cc$Cc3'Cc3!s!c!KKa&<#&<&<KKKK#:#*33K 8QUC-s/@AAK 8QUC-sC/@AAK8QUC-sC/@AAK 8QUC-s/@AAK8QUC-s/@AAKr    c                    [        XX#5      $ )z(Count flops for self-attention backward.r   )rm   rv   rw   rx   r%   r&   r'   s          r   sdpa_backward_flopr     s    
 $NXXr    c
                 F    [        UUUU UUUU	S9n[        S U 5       5      $ )N)r   r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XAX#5      v   M     g 7fr   r   r   rv   rw   rx   rm   s        r   r   1_flash_attention_backward_flop.<locals>.<genexpr>  &      CI?KK 	!iUUCIr   r   )r   r   r   r   out	logsumexpr   r   r   r   r&   r'   shapess                r   _flash_attention_backward_flopr     sB    " 3	F  CI  r    c
                 F    [        UUUU UUUU	S9n[        S U 5       5      $ )N)r   r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XAX#5      v   M     g 7fr   r   r   s        r   r   5_efficient_attention_backward_flop.<locals>.<genexpr>  r   r   r   )r   r   r   r   r   r   r   r   r   r   r&   r'   r   s                r   "_efficient_attention_backward_flopr     sB    " 7!!!!	F  CI  r    c                 6    [        U [        5      (       d  U 4$ U $ r   )r   tuple)xs    r   normalize_tupler   .  s    atHr    ) KMBTc                     [        S[        [        [        5      S-
  [        [	        U 5      5      S-
  S-  5      5      n[        U   $ )Nr   r   rC   r   )maxminr   suffixesstr)numberindexs     r   get_suffix_strr   7  s=     3s8}q(3s6{+;a+?A*EFGEE?r    c                 X    [         R                  U5      nU SU-  -  S nU[         U   -   $ )Ni  z.3f)r   r   )r   suffixr   r   s       r   convert_num_with_suffixr   >  s2    NN6"E%c*E8E?""r    c                     US:X  a  gX-  S $ )Nr   0%z.2%r@   )numdenoms     r   convert_to_percent_strr   E  s    zk#r    c                 0   ^  [        T 5      U 4S j5       nU$ )Nc                 >   > [        U 5      u  pT" U6 n[        X25      $ r   )r   r   )r&   	flat_argsspecr   r(   s       r   r)   )_pytreeify_preserve_structure.<locals>.nfK  s#    &t,	mc((r    r   r+   s   ` r   _pytreeify_preserve_structurer  J  s     
1X) )
 Ir    c                     ^  \ rS rSrSr    SS\\\R                  R                  \
\R                  R                     4      S\S\S\\\\4      4U 4S jjjrS\4S	 jrS\\\\\4   4   4S
 jrSS jrS rS rS rSrU =r$ )r   iT  a  
``FlopCounterMode`` is a context manager that counts the number of flops within its context.

It does this using a ``TorchDispatchMode``.

It also supports hierarchical output by passing a module (or list of
modules) to FlopCounterMode on construction. If you do not need hierarchical
output, you do not need to use it with a module.

Example usage

.. code-block:: python

    mod = ...
    with FlopCounterMode(mod) as flop_counter:
        mod.sum().backward()

modsdepthdisplaycustom_mappingc                 n  > [         TU ]  5         [        S 5      U l        X l        X0l        S U l        Uc  0 nUb  [        R                  " SSS9  0 [        EUR                  5        VVs0 s H%  u  pVU[        USS5      (       a  UO
[        U5      _M'     snnEU l	        [        5       U l        g s  snnf )Nc                       [        [        5      $ r   )r   intr@   r    r   <lambda>*FlopCounterMode.__init__.<locals>.<lambda>o  s
    +VYJZr    z<mods argument is not needed anymore, you can stop passing itrC   )
stacklevel_get_rawF)super__init__r   flop_countsr  r  modewarningswarnr!   itemsgetattrr,   r   mod_tracker)selfr  r  r  r  rG   v	__class__s          r   r  FlopCounterMode.__init__h  s     	6ABZ6[
04	!NMMXefg

WeWkWkWmnWmtqqwq*e44!-:JJWmn
 )? os   +,B1rA   c                 N    [        U R                  S   R                  5       5      $ )NGlobal)r   r  valuesr  s    r   get_total_flopsFlopCounterMode.get_total_flops}  s!    4##H-44677r    c                     U R                   R                  5        VVs0 s H  u  pU[        U5      _M     snn$ s  snnf )zReturn the flop counts as a dictionary of dictionaries.

The outer
dictionary is keyed by module name, and the inner dictionary is keyed by
operation name.

Returns:
    Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
)r  r  dict)r  rG   r  s      r   get_flop_countsFlopCounterMode.get_flop_counts  s7     (,'7'7'='='?@'?tq47
'?@@@s   :c                 (  ^ ^
^^ Uc  T R                   nUc  SnSS KnSUl        / SQn/ nT R                  5       m
[	        T
5      mSmU
UUU 4S jn[        T R                  R                  5       5       HB  nUS:X  a  M  UR                  S5      S	-   nXq:  a  M&  U" XgS	-
  5      nUR                  U5        MD     ST R                  ;   a'  T(       d   U H  n	S
U	S   -   U	S'   M     U" SS5      U-   n[        U5      S:X  a  / SQ/nUR                  XCSS9$ )Ni?B r   T)ModuleFLOPz% TotalFc           	        > [        T
R                  U    R                  5       5      nT	UT:  -  m	SU-  n/ nUR                  X0-   [	        UT5      [        UT5      /5        T
R                  U    R                  5        H<  u  pVUR                  US-   [        U5      -   [	        UT5      [        UT5      /5        M>     U$ )N z - )r   r  r  appendr   r   r  r   )mod_namer  r   paddingr  rG   r  global_flopsglobal_suffixis_global_subsumedr  s          r   process_mod.FlopCounterMode.get_table.<locals>.process_mod  s     d..x8??ABK+"==EkGFMM"']C&{LA 
 ((288:eOc!f,+A}=*1l;  ; Mr    r  .r   r)  )r  0r   )leftrightr5  )headerscolalign)r  tabulatePRESERVE_WHITESPACEr  r   sortedr  keyscountextendr   )r  r  r8  headerr  r0  mod	mod_depth
cur_valuesr   r-  r.  r/  s   `         @@@r   	get_tableFlopCounterMode.get_table  s#   =JJE=E'+$.++-&|4"	 	, $**//12Ch		#*I $Sa-8JMM*% 3 t'''0Bq>a   !1-6Fv;!+,F  B\ ]]r    c                     U R                   R                  5         U R                  R                  5         [	        U 5      U l        U R
                  R                  5         U $ r   )r  clearr  	__enter___FlopCounterModer  r  s    r   rF  FlopCounterMode.__enter__  sG     ""$$T*			r    c                    U R                   c   eU R                   R                  " U6 nS U l         U R                  R                  5         U R                  (       a$  [	        U R                  U R                  5      5        U$ r   )r  __exit__r  r  printrB  r  )r  r&   rP   s      r   rJ  FlopCounterMode.__exit__  s`    yy$$$II%	!!#<<$..,-r    c                     XR                   ;   a[  U R                   U   nU" U0 UDSU0D6n[        U R                  R                  5       H  nU R                  U   U==   U-  ss'   M     U$ )Nr#   )r!   setr  parentsr  )r  func_packetr   r&   r'   flop_count_funcrq   pars           r   _count_flopsFlopCounterMode._count_flops  so    ,,,"00=O($F&F#FJ4++334  %k2j@2 5 
r    )r  r  r  r!   r  r  )NrC   TNr   )__name__
__module____qualname____firstlineno____doc__r   r   r   nnr&  r   r	  boolr
   r	   r  r  r   r#  rB  rF  rJ  rS  __static_attributes____classcell__)r  s   @r   r   r   T  s    * MQ 7;+5$uxx2G!GHI+ + 	+
 %T#s(^4+ +*8 8
Ac4S>&9!: 
A:^z r    c                   ,    \ rS rSrS\4S jrSS jrSrg)rG  i  counterc                     Xl         g r   r_  )r  r_  s     r   r  _FlopCounterMode.__init__  s    r    Nc                    U(       a  UO0 nU[         R                  R                  R                  R                  [         R                  R                  R                  R
                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                   R                  [         R                  R"                  R$                  R                  1;   a  [&        $ XR(                  R*                  ;  ac  U[         R                  R"                  R,                  R                  La2  U    UR.                  " U0 UD6nU[&        La  UsS S S 5        $  S S S 5        U" U0 UD6nU R(                  R1                  UR2                  XcU5      $ ! , (       d  f       N== fr   )r   opsatenis_contiguousdefaultmemory_formatis_strides_like_formatis_non_overlapping_and_denser   sym_sizestride
sym_stridestorage_offsetsym_storage_offsetnumel	sym_numeldimprimlayoutNotImplementedr_  r!   device	decomposerS  _overloadpacket)r  functypesr&   r'   rr   s          r   __torch_dispatch__#_FlopCounterMode.__torch_dispatch__  s   !r EIINN0088IINN00>>IINN99AAIINN??GGIINN''//IINN++33IINN))11IINN--55IINN1199IINN55==IINN((00IINN,,44IINN&&..IINN))113 3 "! ||111d%))..BWBWB_B_6_NND3F3N* *  D#F#||(()=)=s&QQ s   L99
Mra  )r@   N)rU  rV  rW  rX  r   r  r|  r\  r@   r    r   rG  rG    s     Rr    rG  )Fr   )Mr   torch.utils._pytreer   r   r   module_trackerr   typingr   r	   r
   r   r   r   r   collectionsr   torch.utils._python_dispatchr   mathr   	functoolsr   r  __all__rd  re  r   r!   __annotations__r,   r   mmr	  rJ   addmmrN   bmmrS   baddbmmrV   r[  r`   convolution_convolutionrg   convolution_backwardrt   r   '_scaled_dot_product_efficient_attention#_scaled_dot_product_flash_attention#_scaled_dot_product_cudnn_attentionr   r   r   r   _flash_attention_forwardr   _efficient_attention_forwardr   r   0_scaled_dot_product_efficient_attention_backward,_scaled_dot_product_flash_attention_backward,_scaled_dot_product_cudnn_attention_backwardr   _flash_attention_backwardr   _efficient_attention_backwardr   r   r   r   r   r   r  r   rG  r@   r    r   <module>r     s    F F ) D D D # :    5
6yy~~
 !#tCH~ ", tww/3 #    tzz"%# % #% txx 
C 
 !
 t||$&C & %& 	%#Y%#Y% Cy% 	%
 	%N (($*;*;<=bf Oux O >O
 t001e e 2eN$ DD@@@@B C EI @WZ @C@	-" +` eE#s(OU38_eCHoxPUVY[^V^P_G``ab+`f -` eE#s(OU38_eCHoxPUVY[^V^P_G``ab-`` t44dC  	 D> t88$G 	 H>6 MMIIIIK L ^b Yps YLY t55tD 	 E@ t994H 	 I@GGWJJ
 	HHh 	LL,	
 	i 	y 	1 	00) 	,,i 	,,i 	99;M 	557I 	557I 	!!#@ 	%%'H  	""$B!" 	&&(J#( $# 
L L^"R( "Rr    