
    Gj                        U d dl mZ d dlZd dlZd dlmZmZmZ ddlm	Z	 d dl
mZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlZddgZ ed          Z ed          Z ej        e          Z 	 d dl!m"Z# n7# e$$ r/  e%d dD                       re &                    d           eZ#Y nw xY wej'        j(        Z(d Z)i Z*e+eef         e,d<   d Z-dEdeeeef         geeef         f         fdZ. e.e(j/                  ddde0fd            Z1 e.e(j2                  dFde0fd            Z3 e.e(j4                  dFde0fd            Z5 e.e(j6                  dFde0fd             Z7 e.e(j8                  	 	 	 	 	 dGde0fd!            Z9	 dEd"e:e0         d#e:e0         d$e:e0         d%e;de0f
d&Z< e.e(j=        e(j>        e(j?        e(j@        e(jA        g          ddde0fd'            ZB e.e(jC                  de0fd(            ZDd) ZE e.e(jF        e(jG        e(jH        g          ddde0fd*            ZId+ ZJdd,deeKeKe0d-f         eKe0d-f         eKe0d-f         eKe0d-f         dz  f                  fd.ZLdd,deeKeKe0d-f         eKe0d-f         eKe0d-f         eKe0d-f         dz  f                  fd/ZM e.e(jN        d01          ddde0fd2            ZO e.e(jP        d01          de0fd3            ZQd4 ZR e.e(jS        e(jT        e(jU        g          ddde0fd5            ZV e.e(jW        d01          de0fd6            ZX e.e(jY        d01          de0fd7            ZZdd8de0fd9Z[dd8de0fd:Z\dd8de0fd;Z]i e(j/        e1e(j2        e3e(j4        e5e(j6        e7e(j8        e9e(j=        eBe(j>        eBe(j?        eBe(jA        eBe(j@        eBe(jC        eDe(jF        eIe(jG        eIe(jH        eIe(jS        eVe(jT        eVe(jU        eVe(jN        eOe(jP        eQe(jW        eXe(jY        eZiZ*d< Z^g d=Z_d> Z`d? Zadebfd@ZcdA Zd G dB d          Ze G dC dDe          ZfdS )H    )NoneTypeN)tree_maptree_flattentree_unflatten   )ModuleTracker)AnyTypeVar)Callable)Iterator)	ParamSpec)defaultdict)TorchDispatchModeprodwrapsFlopCounterModeregister_flop_formula_T_PJITFunctionc              #   P   K   | ]!}t          t          j        |d           d uV  "d S N)getattrtorchversion).0attrs     [/var/www/html/Carbon-Document/venv/lib/python3.11/site-packages/torch/utils/flop_counter.py	<genexpr>r"      s5      
]
]d75=$--T9
]
]
]
]
]
]    )cudahipxpuz@triton not found; flop counting will not work for triton kernelsc                 H    t          | t          j                  r| j        S | S r   )
isinstancer   Tensorshape)is    r!   	get_shaper,   #   s"    !U\"" wHr#   flop_registryc                 B     t                     d d fd
            }|S )Nout_valc                 P    t          t          ||| f          \  }}} |d|i|S )N	out_shape)r   r,   )r0   argskwargsr2   fs       r!   nfzshape_wrapper.<locals>.nf+   s:    "*9tVW6M"N"Nfiq$6)6v666r#   r   r5   r6   s   ` r!   shape_wrapperr8   *   s@    
1XX 7 7 7 7 7 7 X7 Ir#   Freturnc                 |     dt           t          t          f         dt           t          t          f         f fd}|S )Nflop_formular9   c                      st                      d fd}t          j        j                            |            S )Nr9   c                     t          | t          j        j        t          f          s"t          d|  dt          |                      | t          v rt          d|            t          | <   d S )Nz|register_flop_formula(targets): expected each target to be OpOverloadPacket (i.e. torch.ops.mylib.foo), or JitFunction, got z which is of type zduplicate registrations for )	r(   r   _opsOpOverloadPacket_JITFunction
ValueErrortyper-   RuntimeError)targetr;   s    r!   registerz=register_flop_formula.<locals>.register_fun.<locals>.register7   s    v
(C\'RSS G F#F F7;F||F FG G G &&"#J&#J#JKKK$0M&!!!r#   )r9   N)r8   r   utils_pytree	tree_map_)r;   rE   get_rawtargetss   ` r!   register_funz+register_flop_formula.<locals>.register_fun3   sZ     	7(66L	1 	1 	1 	1 	1 	1 	%%h888r#   )r   r   r   )rJ   rI   rK   s   `` r!   r   r   1   sO    8BF#3 R8H       & r#   )r2   c                b    | \  }}|\  }}||k    rt          d| d|           ||z  dz  |z  S )zCount flops for matmul.z3matmul: inner dimensions must match (k == k2), got  and    AssertionError)	a_shapeb_shaper2   r3   r4   mkk2ns	            r!   mm_floprW   H   sR    
 DAqEBBww_ST__[]__```q519q=r#   c                 "    t          ||          S )zCount flops for addmm.rW   
self_shaperQ   rR   r2   r4   s        r!   
addmm_flopr\   T   s     7G$$$r#   c                     | \  }}}|\  }}}	||k    rt          d| d|           ||k    rt          d| d|           ||z  |	z  dz  |z  }
|
S )z"Count flops for the bmm operation.z0bmm: batch dimensions must match (b == b2), got rM   z0bmm: inner dimensions must match (k == k2), got rN   rO   )rQ   rR   r2   r4   brS   rT   b2rU   rV   flops              r!   bmm_flopra   Y   s    
 GAq!IBABww\PQ\\XZ\\]]]Bww\PQ\\XZ\\]]]q519q=1DKr#   c                 "    t          ||          S )z&Count flops for the baddbmm operation.)ra   rZ   s        r!   baddbmm_floprc   h   s    
 GW%%%r#   c	                 "    t          | |          S )zCount flops for _scaled_mm.rY   )
rQ   rR   scale_a_shapescale_b_shape
bias_shapescale_result_shape	out_dtypeuse_fast_accumr2   r4   s
             r!   _scaled_mm_floprk   o   s     7G$$$r#   x_shapew_shaper2   
transposedc                     | d         }|r| n|dd         }|^}}}	 t          |          t          |          z  |z  |z  |z  dz  }	|	S )a  Count flops for convolution.

    Note only multiplication is
    counted. Computation for bias are ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).
    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    r   rN   Nr   )
rl   rm   r2   rn   
batch_size
conv_shapec_outc_infilter_sizer`   s
             r!   conv_flop_countru      sj    ( J'6''Y;J 'E4+ 
d;///*<uDtKaODKr#   c                (    t          | |||          S )zCount flops for convolution.rn   )ru   )
rl   rm   _bias_stride_padding	_dilationrn   r2   r3   r4   s
             r!   	conv_flopr|      s     7GY:NNNNr#   c                 |   d }d}	 |
d         r+t          |d                   }|t          | |||           z  }|
d         rzt          |d                   }|r2|t           ||            ||           ||          d          z  }n1|t           ||           ||            ||          d          z  }|S )Nc                 R    | d         | d         gt          | dd                    z   S )Nr   r   rN   )list)r*   s    r!   tzconv_backward_flop.<locals>.t   s(    a%(#d59oo55r#   r   r   Frw   )r,   ru   )grad_out_shaperl   rm   rx   ry   rz   r{   rn   _output_padding_groupsoutput_maskr2   r   
flop_countgrad_input_shapegrad_weight_shapes                   r!   conv_backward_flopr      s    6 6 6JDL 1~ a$Yq\22ong?OU_Q_```
1~ q%il33 	q/!!N*;*;QQwZZK\I]I]joppppJJ /!!G**aa6G6GK\I]I]joppppJr#   c                 n   | \  }}}}|\  }}}	}
|\  }}}}||cxk    r|k    rn n||k    r||
k    r|	|k    st          d|  d| d|           ||k     s	||z  dk    rt          d| d| d          d}|t          ||z  ||f||z  ||	f          z  }|t          ||z  ||	f||z  |	|f          z  }|S )z
    Count flops for self-attention.

    Supports GQA (grouped-query attention) where key/value have fewer heads
    than the query. The kernel broadcasts KV heads to match query heads.
    z<sdpa_flop_count: query/key/value shapes are incompatible: q=z, k=z, v=r   zsdpa_flop_count: query heads ()) must be a multiple of key/value heads ()rP   ra   )query_shape	key_shapevalue_shaper^   h_qs_qd_q_b2h_kvs_k_d2_b3_h3_s3d_vtotal_flopss                   r!   sdpa_flop_countr     sJ    #AsC#CsC$Cc3OOOOOOOOOs

sczz?? ?"+? ?1<? ?
 
 	
 TzzS4Z1__(S ( ( $( ( (
 
 	
 K8QWc3/!c'31DEEEK8QWc3/!c'31DEEEKr#   c                $    t          | ||          S )Count flops for self-attention.r   )r   r   r   r2   r3   r4   s         r!   	sdpa_flopr   5  s     ;	;???r#   c                     ddl m} ddlm} t	          | ||f          s6| j        j        dk    r&|                                                                 S |g| 	                    d          dz
  z  S )z
    If the offsets tensor is fake, then we don't know the actual lengths.
    In that case, we can just assume the worst case; each batch has max length.
    r   )
FakeTensor)FunctionalTensormetar   )
torch._subclasses.fake_tensorr   #torch._subclasses.functional_tensorr   r(   devicerB   difftolistsize)offsetsmax_lenr   r   s       r!   _offsets_to_lengthsr   >  s    
 988888DDDDDDg
,<=>> '7>CVZ`C`C`||~~$$&&&9Q!+,,r#   )grad_out.c              #     K   |+t          |j                  dk    rt          d          t          |j                  dk    rt          d          ||j        | j        k    rt          d          | j        \  }}	}
|j        \  }}}|j        \  }}}|t          d          |t          d          |j        |j        k    rt          d          t          ||          }t          ||          }t	          ||d	
          D ]%\  }}d|	||
f}d|||f}d|||f}||nd}||||fV  &dS | j        |j        |j        ||j        ndfV  dS )a;  
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   z7sdpa_flop_count: expected key.shape to be 3-dimensionalz9sdpa_flop_count: expected value.shape to be 3-dimensionalzDsdpa_flop_count: grad_out.shape must match query.shape when providedz+sdpa_flop_count: cum_seq_q must not be Nonez+sdpa_flop_count: cum_seq_k must not be NonezAsdpa_flop_count: cum_seq_q and cum_seq_k must have the same shapeTstrictr   lenr*   rP   r   zip)querykeyvaluer   	cum_seq_q	cum_seq_kmax_qmax_k_r   r   h_kd_kh_vr   seq_q_lengthsseq_k_lengths	seq_q_len	seq_k_lennew_query_shapenew_key_shapenew_value_shapenew_grad_out_shapes                          r!   %_unpack_flash_attention_nested_shapesr   J  s     $  sy>>Q !Z[[[u{q   !\]]]HNek$A$A !ghhhk3i3k3 !NOOO !NOOO?io-- !deee+Iu==+Iu==&)-t&T&T&T 	V 	V"Y	 #y#6OY4M #y#6O4<4Hd!=/CUUUUUU
+sy%+AUx~~[_
______r#   c              #     K   |.t          |j                  dk    rt          d          t          |j                  dk    rt          d          ||j        | j        k    rt          d          | j        \  }}}	}
|j        \  }}}}|j        \  }}}}|t          d          |t          d          |j        |j        k    rt          d          t          ||          }t          ||          }t	          ||d	
          D ]%\  }}d|	||
f}d|||f}d|||f}||nd}||||fV  &dS | j        |j        |j        ||j        ndfV  dS )a?  
    Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   zQ_unpack_efficient_attention_nested_shapes: expected key.shape to be 4-dimensionalzS_unpack_efficient_attention_nested_shapes: expected value.shape to be 4-dimensionalz^_unpack_efficient_attention_nested_shapes: grad_out.shape must match query.shape when providedzH_unpack_efficient_attention_nested_shapes: cu_seqlens_q must not be NonezH_unpack_efficient_attention_nested_shapes: cu_seqlens_k must not be Noneza_unpack_efficient_attention_nested_shapes: cu_seqlens_q and cu_seqlens_k must have the same shapeTr   r   r   )r   r   r   r   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr   r   r   r   r   r   r   	seqlens_q	seqlens_klen_qlen_kr   r   r   r   s                          r!   )_unpack_efficient_attention_nested_shapesr   ~  s     $  sy>>Q !tuuuu{q   !vwwwHNek$A$A   "B  C  C  C1c31c31c3 !klll !klll!333  "Z [ [ ['lCC	'lCC		9TBBB 	V 	VLE5 #uc2OUC0M #uc2O4<4Hd!=/CUUUUUU
+sy%+AUx~~[_
______r#   T)rI   c          	      `    t          | ||||||          }
t          d |
D                       S )r   r   r   r   r   r   r   r   c              3   B   K   | ]\  }}}}t          |||          V  d S r   r   r   r   r   r   r   s        r!   r"   z0_flash_attention_forward_flop.<locals>.<genexpr>  J        2KK 	Y<<     r#   r   sum)r   r   r   r   r   r   r   r2   r3   r4   sizess              r!   _flash_attention_forward_flopr     s]    " 2  E   6;     r#   c           	      `    t          | ||||||          }
t          d |
D                       S )r   )r   r   r   r   r   r   r   c              3   B   K   | ]\  }}}}t          |||          V  d S r   r   r   s        r!   r"   z4_efficient_attention_forward_flop.<locals>.<genexpr>  r   r#   r   r   )r   r   r   biasr   r   r   r   r3   r4   r   s              r!   !_efficient_attention_forward_flopr     s]    " 6!!!!  E   6;     r#   c                 t   |\  }}}}|\  }}	}
}|\  }}}}| \  }}}}||cxk    r|cxk    r|k    rn n|	|k    r||k    st          d          ||	k     s	||	z  dk    rt          d| d|	 d          ||k    r||k    r|
|k    r||k    st          d          d}|t          ||z  ||f||z  ||
f          z  }|t          ||z  ||f||z  ||
f          z  }|t          ||z  |
|f||z  ||f          z  }|t          ||z  ||
f||z  |
|f          z  }|t          ||z  ||f||z  ||
f          z  }|S )Nz<sdpa_backward_flop_count: batch/heads mismatch among tensorsr   z'sdpa_backward_flop_count: query heads (r   r   zJsdpa_backward_flop_count: grad_out/value/key/query shapes are incompatibler   )r   r   r   r   r^   r   r   r   r   r   r   r   r   r   r   r   _b4_h4_s4_d4r   s                        r!   sdpa_backward_flop_countr     s   "AsC#CsC$Cc3'Cc3""""""""s"""""ts{{sczzJ
 
 	
 TzzS4Z1__(c ( ( $( ( (
 
 	
 3JJ3#::#**X
 
 	
 K 8QWc3/!c'31DEEEK 8QWc3/!c'31DEEEK8QWc3/!c'31DEEEK 8QWc3/!c'31DEEEK8QWc3/!c'31DEEEKr#   c                &    t          | |||          S )z(Count flops for self-attention backward.r   )r   r   r   r   r2   r3   r4   s          r!   sdpa_backward_flopr     s    
 $NKKXXXr#   c
           
      b    t          |||| ||||	          }t          d |D                       S )Nr   r   r   r   r   r   r   r   c              3   D   K   | ]\  }}}}t          ||||          V  d S r   r   r   r   r   r   r   s        r!   r"   z1_flash_attention_backward_flop.<locals>.<genexpr><  L        ?KK 	!iUU     r#   r   )r   r   r   r   out	logsumexpr   r   r   r   r3   r4   shapess                r!   _flash_attention_backward_flopr   !  s`    " 3	 	 	F   CI     r#   c
           
      b    t          |||| ||||	          }t          d |D                       S )N)r   r   r   r   r   r   r   r   c              3   D   K   | ]\  }}}}t          ||||          V  d S r   r   r   s        r!   r"   z5_efficient_attention_backward_flop.<locals>.<genexpr>]  r   r#   r   )r   r   r   r   r   r   r   r   r   r   r3   r4   r   s                r!   "_efficient_attention_backward_flopr   B  s`    " 7!!!!	 	 	F   CI     r#   r/   c          	      h    t          | |||||n|||          }
t          d |
D                       S )z$Count flops for varlen_attn forward.Nr   c              3   B   K   | ]\  }}}}t          |||          V  d S r   r   r   s        r!   r"   z,_varlen_attn_forward_flop.<locals>.<genexpr>y  r   r#   r   )r   r   r   cu_seq_qcu_seq_kr   r   r0   r3   r4   r   s              r!   _varlen_attn_forward_flopr   c  sf     2&2((  E   6;     r#   c          	      ,    t          |||||||          S )z(Count flops for varlen_attn_out forward.)r   )r   r   r   r   r   r   r   r   r0   r3   r4   s              r!   _varlen_attn_out_flopr     s%     %sE8Xue  r#   c
          
      b    t          |||| ||||	          }t          d |D                       S )z%Count flops for varlen_attn backward.r   c              3   D   K   | ]\  }}}}t          ||||          V  d S r   r   r   s        r!   r"   z-_varlen_attn_backward_flop.<locals>.<genexpr>  r   r#   r   )r   r   r   r   r   lser   r   r   r   r0   r3   r4   r   s                 r!   _varlen_attn_backward_flopr    s`      2	 	 	E   CH     r#   c                 6    t          | t                    s| fS | S r   )r(   tuple)xs    r!   normalize_tupler    s     a tHr#   ) KMBTc                     t          dt          t          t                    dz
  t          t	          |                     dz
  dz                      }t          |         S )Nr   r   rN   r   )maxminr   suffixesstr)numberindexs     r!   get_suffix_strr    sJ     3s8}}q(3s6{{+;+;a+?A*EFFGGEE?r#   c                 j    t                               |          }| d|z  z  d}|t           |         z   S )Ni  z.3f)r  r  )r  suffixr  r   s       r!   convert_num_with_suffixr    s6    NN6""E%++E8E?""r#   c                      |dk    rdS | |z  dS )Nr   0%z.2% )numdenoms     r!   convert_to_percent_strr    s     zztEkr#   c                 <     t                      fd            }|S )Nc                 R    t          |           \  }} | }t          ||          S r   )r   r   )r3   	flat_argsspecr   r5   s       r!   r6   z)_pytreeify_preserve_structure.<locals>.nf  s/    &t,,	4amc4(((r#   r   r7   s   ` r!   _pytreeify_preserve_structurer!    s3    
1XX) ) ) ) X)
 Ir#   c                        e Zd ZdZ	 	 	 	 ddej        j        eej        j                 z  dz  dede	de
eef         dz  d	df
 fd
Zd	efdZd	e
ee
eef         f         fdZddZd Zd Zd Z xZS )r   a  
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    NrN   Tmodsdepthdisplaycustom_mappingr9   c                 R   t                                                       t          d           | _        || _        || _        d | _        |i }|t          j        dd           i t          d |
                                D             | _	        t                      | _        d S )Nc                  *    t          t                    S r   )r   intr  r#   r!   <lambda>z*FlopCounterMode.__init__.<locals>.<lambda>  s    +VYJZJZ r#   z<mods argument is not needed anymore, you can stop passing itrN   )
stacklevelc                 Z    i | ](\  }}|t          |d d          r|nt          |          )S )_get_rawF)r   r8   r   rT   vs      r!   
<dictcomp>z,FlopCounterMode.__init__.<locals>.<dictcomp>  s<    nnntqRSqwq*e44J!!-:J:Jnnnr#   )super__init__r   flop_countsr$  r%  modewarningswarnr-   itemsr   mod_tracker)selfr#  r$  r%  r&  	__class__s        r!   r2  zFlopCounterMode.__init__  s     	6ABZBZ6[6[
-1	!NMXefgggg

nnWeWkWkWmWmnnn
 )??r#   c                 Z    t          | j        d                                                   S )NGlobal)r   r3  valuesr9  s    r!   get_total_flopszFlopCounterMode.get_total_flops  s$    4#H-4466777r#   c                 H    d | j                                         D             S )a  Return the flop counts as a dictionary of dictionaries.

        The outer
        dictionary is keyed by module name, and the inner dictionary is keyed by
        operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        c                 4    i | ]\  }}|t          |          S r  )dictr.  s      r!   r0  z3FlopCounterMode.get_flop_counts.<locals>.<dictcomp>&  s$    @@@tq!477@@@r#   )r3  r7  r>  s    r!   get_flop_countszFlopCounterMode.get_flop_counts  s(     A@t'7'='='?'?@@@@r#   c                 @   
 | j         }|d}dd l}d|_        g d}g }                                 
t	          
          d
 fd}t           j                                                  D ]L}|dk    r	|                    d          d	z   }||k    r( |||d	z
            }|	                    |           Md j        v r$s"|D ]}	d
|	d         z   |	d<    |dd          |z   }t          |          dk    rg dg}|                    ||d          S )Ni?B r   T)ModuleFLOPz% TotalFc           	         t          
j        |                                                    }	|k    z  	d|z  }g }|                    || z   t	          |          t          |          g           
j        |                                          D ]L\  }}|                    |dz   t          |          z   t	          |          t          |          g           M|S )N z - )r   r3  r=  appendr  r  r7  r  )mod_namer$  r   paddingr=  rT   r/  global_flopsglobal_suffixis_global_subsumedr9  s          r!   process_modz.FlopCounterMode.get_table.<locals>.process_mod8  s     d.x8??AABBK+"==EkGFMM("']CC&{LAA   
 (288::  1eOc!ff,+A}==*1l;;    
 Mr#   r<  .r   rH  )r<  0r  )leftrightrS  )headerscolalign)r$  tabulatePRESERVE_WHITESPACEr?  r  sortedr3  keyscountextendr   )r9  r$  rV  headerr=  rO  mod	mod_depth
cur_valuesr   rL  rM  rN  s   `         @@@r!   	get_tablezFlopCounterMode.get_table(  s   =JE=E 	'+$...++--&|44"	 	 	 	 	 	 	 	, $*//1122 	& 	&Ch		#*I5  $S)a-88JMM*%%%%
 t'''0B' * *q>a [1--6Fv;;!+++,F  B\ ]]]r#   c                     | j                                          | j                                         t	          |           | _        | j                                         | S r   )r3  clearr8  	__enter___FlopCounterModer4  r>  s    r!   rc  zFlopCounterMode.__enter__g  sT       ""$$$$T**		r#   c                     | j         t          d           | j         j        | }d | _         | j                                         | j        r't          |                     | j                             |S )Nz<Internal error: FlopCounter.__exit__ called but mode is None)r4  rP   __exit__r8  r%  printr`  r$  )r9  r3   r^   s      r!   rf  zFlopCounterMode.__exit__n  sq    9 !_```DI%	!!###< 	.$..,,---r#   c                     || j         v rP| j         |         } ||i |d|i}t          | j        j                  D ]}| j        |         |xx         |z  cc<   |S )Nr0   )r-   setr8  parentsr3  )r9  func_packetr   r3   r4   flop_count_funcr   pars           r!   _count_flopszFlopCounterMode._count_flopsx  s    $,,,"0=O($F&FF#FFFJ4+344 A A %k222j@2222
r#   )NrN   TNr   )__name__
__module____qualname____doc__r   nnrE  r   r)  boolrB  r	   r2  r?  r  rC  r`  rc  rf  rn  __classcell__)r:  s   @r!   r   r     sD        * DH 48+ +(/D$99D@+ + 	+
 !cNT1+
 >B+ + + + + +*8 8 8 8 8
Ac4S>&9!: 
A 
A 
A 
A<^ <^ <^ <^~          r#   c                   6    e Zd ZdZdeddfdZd Zd Zd
d	ZdS )rd  Tcounterr9   Nc                     || _         d S r   )rw  )r9  rw  s     r!   r2  z_FlopCounterMode.__init__  s    r#   c                     ddl }|                     | j        j                  }| 5   || }ddd           n# 1 swxY w Y   |                     | j        j                  }|| j        _        ||fS )a  Execute a branch function and capture its FLOP counts without
        affecting self.counter.flop_counts

        Args:
            branch_fn: The branch function to execute
            operands: Arguments to pass to the branch function

        Returns:
            Tuple of (result, flop_counts) where result is the branch output
            and flop_counts is a copy of the FLOP counts after execution
        r   N)copyrw  r3  )r9  	branch_fnoperandsrz  checkpointed_flop_countsresultr3  s          r!   $_execute_with_isolated_flop_countingz5_FlopCounterMode._execute_with_isolated_flop_counting  s     	#'99T\-E#F#F  	* 	*Y)F	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	*ii 899#; {""s   8<<c                    |t           j        j        j        t           j        j        j        hv }|rsddlm} ddlm}  ||d                   }t          ||          s)t          |d          r|j        }nnt          ||          )| j                            |d ||          S |t           j        j        j        u r|\  }	}
}}|                     |
|          \  }}|t           u rt           S |                     ||          \  }}|t           u rt           S t#          |                                          t#          |                                          z  }i }|D ]}||         }||         }i }t#          |                                          t#          |                                          z  }|D ]A}|                    |d          }|                    |d          }t)          ||          ||<   B|||<   |                                D ]*\  }}| j        j        |                             |           +|S t           S )Nr   )
get_kernelr   
kernel_idxfn)r   opshigher_ordertriton_kernel_wrapper_mutation triton_kernel_wrapper_functional*torch._higher_order_ops.triton_kernel_wrapr  triton.runtime.jitr   r(   hasattrr  rw  rn  condr  NotImplementedri  rY  getr  r7  r3  update)r9  functypesr3   r4   	is_tritonr  r   kernel_namepredtrue_branchfalse_branchr|  true_outtrue_flop_counts	false_outfalse_flop_countsall_mod_keysmerged_flop_counts	outer_keytrue_func_countsfalse_func_countsmerged_func_countsall_func_keysfunc_keytrue_val	false_val
inner_dicts                               r!   _handle_higher_order_opsz)_FlopCounterMode._handle_higher_order_ops  s   UY3R"Y3TV V	 8	"MMMMMM666666$*VL%9::K k:: ;-- "-.KK	 !k:: 
 <,,[$fMMMUY+000
 9=5D+|X)-)R)RX* *&H& >))%%+/+T+Th, ,(I( N**%% /446677#>O>T>T>V>V:W:WWL!#) C C	#3I#> $5i$@!%'" #$4$9$9$;$; < <sCTCYCYC[C[?\?\ \ - L LH/33Ha@@H 1 5 5h B BI36x3K3K&x000B"9-- *<)A)A)C)C G G%	:(3:::FFFF O!!r#   r  c                    |r|ni }|t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j	        j        t           j        j        j
        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        hv rt(          S t+          |t           j        j                  r|                     ||||          S || j        j        vr\|t           j        j        j        j        ur?| 5   |j        |i |}|t(          ur|cd d d            S 	 d d d            n# 1 swxY w Y    ||i |}| j                            |j        |||          S r   )r   r  atensym_is_contiguousdefaultis_contiguousmemory_formatis_strides_like_formatis_non_overlapping_and_denser   sym_sizestride
sym_stridestorage_offsetsym_storage_offsetnumel	sym_numeldimprimlayoutr  r(   r>   HigherOrderOperatorr  rw  r-   r   	decomposern  _overloadpacket)r9  r  r  r3   r4   rr   s          r!   __torch_dispatch__z#_FlopCounterMode.__torch_dispatch__  s%   !)r EIN4<IN08IN0>IN9AIN?GIN'/IN+3IN)1IN-5IN19IN5=IN(0IN,4IN&.IN)13 3 3  "!dEJ:;; 	L00udFKKK t|111d%).BWB_6_6_  "DND3F33N**       *               dD#F##|(()=sD&QQQs   <H..H25H2)r  N)	ro  rp  rq  supports_higher_order_operatorsr   r2  r  r  r  r  r#   r!   rd  rd    su        &*# D    # # #(;" ;" ;"z"R "R "R "R "R "Rr#   rd  )Fr   )NNNFN)gr  r   loggingr   torch.utils._pytreer   r   r   module_trackerr   typingr	   r
   collections.abcr   r   typing_extensionsr   collectionsr   torch.utils._python_dispatchr   mathr   	functoolsr   r5  __all__r   r   	getLoggerro  logr  r   r@   ImportErroranywarningr  r  r,   r-   rB  __annotations__r8   r   mmr)  rW   addmmr\   bmmra   baddbmmrc   
_scaled_mmrk   r   rt  ru   convolution_convolutioncudnn_convolution_slow_conv2d_forwardconvolution_overrideabler|   convolution_backwardr   r   '_scaled_dot_product_efficient_attention#_scaled_dot_product_flash_attention#_scaled_dot_product_cudnn_attentionr   r   r  r   r   _flash_attention_forwardr   _efficient_attention_forwardr   r   0_scaled_dot_product_efficient_attention_backward,_scaled_dot_product_flash_attention_backward,_scaled_dot_product_cudnn_attention_backwardr   _flash_attention_backwardr   _efficient_attention_backwardr   r   r   r  r  r  r  r  r  r  r!  r   rd  r  r#   r!   <module>r     s	            F F F F F F F F F F ) ) ) ) ) )         $ $ $ $ $ $ $ $ $ $ $ $ ' ' ' ' ' ' # # # # # # : : : : : :             5
6WT]]Yt__g!!>>>>>>>   
s
]
]F\
]
]
]]] XVWWWLLL y~  
 !#tCH~ " " "   XxB?O>PRZ[]_a[aRb>b5c    . tw/3 	 	 	# 	 	 	  	 tz""% %# % % % #"% tx   C    !  t|$$& &C & & & %$& t'' % % 	% % % ('%( 	$ $#Y$#Y$ Cy$ 	$
 	$ $ $ $L ().15	7 8 8
 cg O O Oux O O O8 8
O t011e e e e 21eN  8 D@@B C C EI @ @ @WZ @ @ @C C@	- 	- 	-" 1` 1` 1` eE#s(OU38_eCHouSRUXY]G]]^_1` 1` 1` 1`r 4` 4` 4` eE#s(OU38_eCHouSRUXY]G]]^_4` 4` 4` 4`n t4dCCC    	   DC> t8$GGG 	   HG>" " "J MIIK L L ^b Y Y Yps Y Y YL LY t5tDDD 	   ED@ t94HHH 	   IHR    	   L    	   >    	   @GWJ
 	Hh 	L,	
 	O_ 	i 	y 	I 	!9 	y 	1 	0) 	,i 	,i 	9;M  	57I!" 	57I#$ 	!#@%'H"$B&(J+ 0   $##  # # # #        
  N N N N N N N N`yR yR yR yR yR( yR yR yR yR yRs   B 1B<;B<