
    IЦi                      % S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKJr  S SKJr  S SKJrJrJrJrJrJrJrJ r J!r!J"r"J#r#J$r$J%r%J&r&J'r'J(r(  S SK)J*r*J+r+J,r,J-r-  S SKJ.r.  S SK/r/S SK0r0S SK1J2r2  \%(       a  S S	K3J4r4  S S
K5J6r6  SS/r7\Rp                  " S5      S 5       r9S SK:J;r;  S SK<J=r=  S SK>J?r?  S SK@JArA  S SKBJCrC  S SKDJErE  S SKFJGrGJHrHJIrIJJrJJKrK  S SKLJMrMJNrN  S SKOJPrPJQrQ  SSKRJSrS  SSKTJUrV  \R                  S:H  rW\
R                  " \Y5      rZ\&" S5      r[\\/R                  \/R                  4   r]\ \'\0R                  \_\0R                  4      raSSS.rbSrcSrdS re\e\eS-
  -  S :X  a  \eS!:  d   S"5       eS# rfSS$ jrg " S% S&\/R                  5      riSSS' jjrj\Rp                  " S5      SS( j5       rkSS) jrlSS* jrmSS+ jrnSS, jro      SS- jrUS. rp    SS/ jrq    SS0 jrrSS1 jrs S   SS2 jjrtS3 ruSSS4 jjrv S       SS5 jjrw S SS6 jjrxSS7 jrySS8 jrzSS9 jr{S: r|SS; jr}\," S<5      r~\&" S=S>S?9r " S@ SA\!\\~\4   5      rSSB jrSC rSD rSE r S   SSF jjrSG rSSH jrSI rSSJ jrSK rSSL jrSSM jrSSN jrSSO jr    SSP jrSSQ jr/ rSR\SS'   SST jrSU r\GR*                  SSV j5       rSSW jr    SSX jr\Rp                  " S!5      SY 5       r " SZ S[\5      r " S\ S]5      r " S^ S_\5      r\GR*                  S` 5       r " Sa Sb5      r " Sc Sd\5      r\Rp                  " S5      SSSe jj5       rSSf jrSSg jrSSh jrSSi jrSjSjSk.Sl jrSm r\Rp                  " S5      Sn 5       r\Rp                  " S5      So 5       rSp rSq rSr rSs rSt r SSu jrSv r " Sw Sx5      rSSy jrSz rS{ rS| rS} rS~ r\GR*                  S 5       rSS jrS rS rS rS rS rSS jr\GR*                  S 5       rS r\Rp                  " S5      S 5       r\Rp                  " S5      S 5       r\Rp                  " S5      SS j5       rSS jrSS jrSS jrSS jrSS jrSS jrSS jr " S S\GR                  5      rS rSS jrS rS rS rS rS r SS jr SS jrSS jrSS jr\GR                   " S S5      5       r\GR*                  S 5       rSS jrSS jrSS jrS r  GS S jrS rGSS jrGSS jrS rS rGSS jr      GSS jrGSS jr      GSS jr      GSS jrGSS jrS rS rGS	S jrSSSSSS.r\GR                  5        V Vs0 s H  u  pX_M	     snn r\GR                  " S5      rGS
S jrGSS jrGSS jrGSS jr\Rp                  " S5      S 5       r\GR                   " S S5      5       r0 rS\S'       GSS jrGSS jr\+" S>S9SS>S.GSS jjj5       rGSS jrgs  snn f (      )annotationsN)datetime)StringIO)AnyCallableDictGenericIterableList
NamedTupleOptionalProtocolSequenceSetTupleTYPE_CHECKINGTypeVarUnion
ValuesView)Concatenatedataclass_transform	ParamSpec	TypeGuard)mock)DeviceProperties)ELEMENTWISE_TYPE_PROMOTION_KIND)tree_map_onlycudaxpuc                     [          V s/ s H*  n [        [        U 5      R                  5       (       d  M(  U PM,     nn [	        U5      S::  d   e[	        U5      S:X  a  SnU$ UR                  5       nU$ s  sn f )N   r   r   )	GPU_TYPESgetattrtorchis_availablelenpop)x
avail_gpusgpu_types      T/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torch/_inductor/utils.pyget_gpu_typer,   @   sh    &KY'%*;*H*H*J!YJKz?aZA-vHO 4>>>3CHO Ls
   'A2A2)get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesr!   )config)ceildivwin32_Tz.cubinz.spv)r   r      @      zmust be power of 2c                *    U [         -   S-
  [         * -  $ )z/Round up to the nearest multiple of ALIGN_BYTESr!   )ALIGN_BYTES)nbytess    r+   _alignrF   m   s    [ 1$44    c                   [        U [        R                  [        R                  45      (       a#  [	        [        [        U R                  5      5      $ [        U [        5      =(       d"    [        R                  " U [        5      [        :H  $ )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrD   )vs    r+   rO   rO   r   sT    !eii+,,3{AFF+,,aK599Q#<#KKrG   c                  4    \ rS rSrSrSrSr\SS j5       rSr	g)	rQ   y   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr!   Tc                    [        U[        [        R                  45      (       a  [	        [        U5      5      $ [        U5      (       a  U$ g N)rI   intrJ   IntegerrF   rO   )clsvalues     r+   eval
align.eval   s<    ec5==122#e*%%uL rG    N)r\   
sympy.ExprreturnzOptional[sympy.Expr])
__name__
__module____qualname____firstlineno____doc__nargs
is_integerclassmethodr]   __static_attributes__r_   rG   r+   rQ   rQ   y   s!    FEJ rG   rQ   c                   U " 5         [         R                  R                  5         [         R                  " [	        S5      [         R                  SS9n[         R                  R                  SS9n[         R                  R                  SS9nUR                  5         [        S5       H  nUR                  5         U " 5         M     UR                  5         [         R                  R                  5         UR                  U5      S-  n[        S[	        X-  5      5      n[        S[	        X'-  5      5      n	[        U5       H
  nU " 5         M     [         R                  R                  [         R                  R                  R                  /S9 n
[        U	5       H  nUR                  5         U " 5         M     [         R                  R                  5         S	S	S	5        [        R!                  S
5        [        R!                  W
R#                  5       R%                  SSS95        ['        U
R)                  5        Vs/ s H7  nUR*                  [,        R                  :X  d  M#  UR.                  S:w  d  M5  UPM9     sn5      n[1        U5      U	-  S:w  a  [3        S[1        U5      U	5      e[1        U5      U	-  n['        [5        U5       VVs/ s H  u  pX-  S:w  d  M  UPM     snn5      nUR7                  5         UR#                  5       n[        R!                  S5        [        R!                  UR%                  SS95        [9        S U 5       5      S-  U	-  n[        R!                  SU5        U$ ! , (       d  f       GN= fs  snf s  snnf )a:  
Returns benchmark results by examining torch profiler events.
This could be more accurate as it doesn't count CPU side overhead.
However, this also requires manually excluding irrelevant event, e.g.
vectorized_elementwise_kernel which is used to fill L2 cache,
various CUDA events, etc, so could also be fragile.
g    Ar   )dtypedeviceT)enable_timing   r!   )
activitiesNz
raw eventsself_device_time_total)sort_by	row_limitzContext Syncr   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %szprofiling time breakdown)rt   c              3  8   #    U  H  oR                   v   M     g 7frX   )device_time_total).0events     r+   	<genexpr>+do_bench_using_profiling.<locals>.<genexpr>   s     A=%%%=   g     @@zprofiling results: %s ms)r$   r   synchronizeemptyrY   Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAlogdebugkey_averagestabler0   eventsdevice_typer/   namer&   RuntimeError	enumerate_build_treesum)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpirx   filtered_eventsnum_event_per_groupactual_eventsress                    r+   do_bench_using_profilingr      s    D	JJKKJuyyHE **"""6K

  t 4I1X
  	JJ**959K 1c&./0H1c#+,-H 8_
  
		NN++00
 
  
 
xAKKMD	 ! 	

 
 IIlIIann$$-EQS$TU 	
#  JOO3 8=

n8T #	
O ?h&!+- 	
 	
 o.9 &o6	
6&!+ 6	
M !..0MII()IIm!!B!/0
A=A
AF
JX
UCII(#.J_
 
$	
	
s+   AM1<"N"N4N
N
N
1
N c                     SSK Jn   [        R                  R	                  SS5        U S L=(       a%    [        [        [        R                  SS 5      S5      $ ! [         a     g[         a  nS[        U5      ;   d   e S nAgS nAff = f)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr   Fztorchvision::nms does not exist)torchvision.opsr   r$   _C%_dispatch_has_kernel_for_dispatch_keyhasattrr#   opsImportErrorr   str)r   es     r+   has_torchvision_roi_alignr      s|    -667I6R$ 
EII}d3[*
 	
   0CF:::s   AA 
B$	B-BBc                t   U c   [         R                  " S5      R                  $ [        U [        5      (       a  [         R                  " U 5      n U R
                  S;  aY  U R                  cL  [        U R
                  5      n[         R                  " U R
                  UR                  R                  5       S9$ U $ )Ng        )cpumeta)index)
r$   tensorrm   rI   r   typer   r-   Workercurrent_devicerm   device_interfaces     r+   decode_devicer      s    ~||C '''&#f%{{/)fll.B3FKK@||FKK/?/F/F/U/U/WXXMrG   c                ~    [         R                  " [        R                  U [        R
                  R                  5      $ rX   )	functoolsreduceoperatormulrJ   SOne)its    r+   sympy_productr      s#    HLL"eggkk::rG   c           	         [        U 5      [        U5      :X  d   e[        R                  " [        S [	        X5       5       5      5      $ )Nc              3  .   #    U  H  u  pX-  v   M     g 7frX   r_   )rw   abs      r+   ry   sympy_dot.<locals>.<genexpr>   s     >odaAEos   )r&   rJ   expandr   zip)seq1seq2s     r+   	sympy_dotr      s6    t9D	!!!<<>c$o>>??rG   c                b    U  Vs0 s H  n[        U5      U_M     snR                  5       $ s  snf rX   )idvalues)r   r(   s     r+   uniquer      s+     !bBqE1Hb!((**!s   ,c           
        [        U [        R                  5      (       d  [        U[        R                  5      (       a4  [        [        R                  " U 5      [        R                  " U5      5      $ [        U [
        5      (       a  [        U[
        5      (       d$   U  S[        U 5       SU S[        U5       35       e[        X5      $ )Nz: , )rI   rJ   Exprr3   sympifyrY   r   runtime_ceildiv)numerdenoms     r+   r=   r=     s     %$$
5%**(E(Eu}}U+U]]5-ABB eS!!js' ' 9
4;-r%4;-89  5((rG   c                <   U c  g[        U 5      R                  S5      S   n0 SS_SS_SS	_S
S_SS_SS_SS	_SS_SS_SS_SS_SS_SS_SS_SS_S S!_S"S#_S$S%S&.En[        UR                  5       5       H  nX2U'   M	     [	        U [         5      (       a  U $ S'X!    3$ )(Nz*i8.rr   booli1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float16fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64uint8u8uint16u16u32u64)uint32uint64*)r   splitlistr   rI   )key	dtype_strtysrS   s       r+   _type_ofr    s>   
 {Cs#B'Ii 	G 	z	
 	 	 	w 	6 	F 	6 	6 	 	 	 	  	!" 	%#$ 'C, #**,A  S#&&3@a/?,@@rG   c                Z    U  Vs/ s H  n[         R                  " U5      PM     sn$ s  snf )z
Gets the shape and stride of a tensor. For non-symbolic tensors, this is
trivial. But for symbolic tensors, we need to map from SymIntNode into
sympy.Expr.
)rJ   r   )lstr   s     r+   convert_shape_to_inductorr	  2  s%     '**cEMM!c***s    (c                   SSK Jn  U  Vs/ s Hr  n[        U[        5      (       a  UOW[        U[        R
                  5      (       a  [        U5      O-UR                  R                  R                  R                  USS9PMt     sn$ s  snf )zn
Takes a list of shapes from Inductor and converts them into symints (or just
ints if all shapes are static).
r!   VN)hint)
virtualizedr  rI   rY   rJ   rZ   graphsizevars	shape_envcreate_symintnode)r  r  r   s      r+   convert_shape_to_symintr  =  s       A !S!!  a// AWW%%//AA!$AO   s   A9Bc                    [        U [        R                  R                  5      (       d   e[	        S U R
                  R                   5       5      $ )z%
Does this op overload have aliasing
c              3  <   #    U  H  oR                   S Lv   M     g 7frX   )
alias_inforw   r   s     r+   ry   is_view.<locals>.<genexpr>Y  s     F1EA||4'1Es   )rI   r$   _ops
OpOverloadany_schema	arguments)ops    r+   is_viewr  T  s<     b%**//0000F1E1EFFFrG   c                  ^ U R                   S:X  d  g[        U R                  [        R                  R
                  5      (       d  U R                  [        R                  L d  gU R                  [        R                  L d  [        U R                  5      (       a  [        U4S jU R                   5       5      $ [        R                  R                  U R                  R                  ;   =(       d    TSL=(       a    T" U R                  5      $ )z
Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

Uses in views ops will follow the views uses
call_functionFc              3  <   >#    U  H  n[        UT5      v   M     g 7frX   )is_pointwise_use)rw   uis_pointwise_fns     r+   ry   #is_pointwise_use.<locals>.<genexpr>n  s     KA#A77s   N)r  rI   targetr$   r  r  r   getitemr  rM   usersTag	pointwisetags)user%  s    `r+   r#  r#  \  s     66_$ 	3::uzz4455xGWGW9W
zzX%%%)<)<KKKK99#**//1 t#C

(CrG   c           	       ^^ [         R                  R                  5       m/ mUU4S jnTR                  " U /[	        [         R
                  X1U45      Q76 n[        U R                  R                  5      S:X  a3  [        U R                  R                  S   R                  5      S:X  a  U4nTR                  U5        [         R                  R                  0 T5      nUT4$ )Nc                `   > TR                  U 5        TR                  S[        T5       35      $ )Narg)appendplaceholderr&   )r0  g
graph_argss    r+   add_tensor_arg)gen_gm_and_inputs.<locals>.add_tensor_argy  s,    #}}s3z?"3455rG   r!   r   Tensor)r$   fxGraphr!  r   r7  r&   r  returnsr   r   outputGraphModule)r'  rP   kwargsr5  nodegmr3  r4  s         @@r+   gen_gm_and_inputsr@  u  s    AJ6 ??u||^F^LD 	FNN""#q(&&q)../8;wHHTN			b!	$Bz>rG   c                t    U S:X  a  g [        U 5      nUR                  5       (       a  UR                  5         g g Nr   )r-   r%   r|   r   s     r+   r|   r|     s7    /7$$&&$$& 'rG   c                    [        U5        [        R                  " S5        [        R                  " 5       n[        U5       H  nU " U6 n[        U5        M     [        R                  " 5       nWc   eXt-
  $ )Ni9  )r|   r$   manual_seedtimeperf_counterr   )modelexample_inputstimesrm   t0r   resultt1s           r+   timedrM    sk     	d				B5\'F  
			B7NrG   c                    [         R                  " [        U5       Vs/ s H  n[        XX%5      PM     sn5      n[         R                  " U5      U-  n[        X-  S 5        U$ s  snf )Nz.6f)r$   r   r   rM  medianprint)	r   rP   rI  repeatbaselinerm   r   timingstooks	            r+   print_performancerU    sY     llE&MRMqE"E:MRSG<< 5(D	T_S!#K Ss   A%c                F   ^ [        X5      " 5       m[        XU4S j5        g)zKReplace obj.method() with a new method that returns a precomputed constant.c                    > T $ rX   r_   )rK  s   r+   <lambda>#precompute_method.<locals>.<lambda>  s    rG   N)r#   setattr)objmethodrK  s     @r+   precompute_methodr]    s    S!#FC(rG   c                ,    U H  n[        X5        M     g)zFReplace methods with new methods that returns a precomputed constants.N)r]  )r[  methodsr\  s      r+   precompute_methodsr`    s    #& rG   c                8    [        X:  5      [        X:  5      -
  $ rX   )rY   )r   r   s     r+   cmprb    s    qu:AE
""rG   c                T    [        U 5      S:X  a  [        U 5      " U S   /5      U-  $ U $ )Nr!   r   )r&   r   )r(   sizes     r+   pad_listlikere    s+    
1v{Aw!v%%rG   c                <    [        U 5      S:X  a  / $ S n[        XS9$ )Nr   c                P    [        U [        5      (       a  U $ U R                  5       $ rX   )rI   r   get_name)elems    r+   	sort_functuple_sorted.<locals>.sort_func  s"    dC  K ==?"rG   r  )r&   sorted)r(   rj  s     r+   tuple_sortedrn    s$    
1v{	# !##rG   PRVT)	covariantc                  2    \ rS rSr\SS j5       rSS jrSrg)CachedMethodi  c                    g rX   r_   selfs    r+   clear_cacheCachedMethod.clear_cache  s    rG   c                    g rX   r_   rv  rP   r=  s      r+   __call__CachedMethod.__call__  s    rG   r_   Nra   None)rP   zP.argsr=  zP.kwargsra   rp  )rb   rc   rd   re   staticmethodrw  r{  rj   r_   rG   r+   rs  rs    s     rG   rs  c           	        ^ U R                   nSU S3mSU 0n[        SU ST ST S3R                  5       U5        [        R                  " U 5      " X! S3   5      nU4S	 jnXCl        U$ )
N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zl
            except AttributeError:
                rv = fn(self)
                object.__setattr__(self, "z)", rv)
                return rv
        _cache_on_selfc                B   > [        U T5      (       a  [        U T5        g g rX   )r   delattr)rv  r  s    r+   rw  "cache_on_self.<locals>.clear_cache  s    4D# rG   )rb   execlstripr   wrapsrw  )r   r   ctxwrapperrw  r  s        @r+   cache_on_selfr    s    ;;DtfF
C *CF  E "+ ,/% 0		 FH oob!#n&=">?G &NrG   c           
        SSK Jn  [        U [        5      (       ay  [        R
                  " [        R                  U  Vs/ s H?  n[        US5      (       d  M  UR                  (       d  M)  UR                  R                  PMA     sn[        5       5      $ [        XR                  5      (       a  U R                  $ [        5       $ s  snf )Nr!   irr>  ) r  rI   r  r   r   r   or_r   r>  originssetExternKernel)node_scheduler  r>  s      r+   aggregate_originsr    s    -&&LL *)D4( "-1YY "		!!)
 E
 	
 
M??	3	3$$$us   C
C
+C
c                &   [        U 5      nUS:X  a~  U Vs/ s H\  nUR                  S:X  d  M  SUR                  ;   d  M'  UR                  S   c  M9  UR                  S   R                  R                  PM^     nn[        [        U5      5      nOUS:X  a  / nU H  nUR                  S:X  d  M  SUR                  ;   d  M'  UR                  S   S   n[        US   [        5      (       a  UR                  US   5        Mg  UR                  US   R                  5        M     [        [        U5      5      nO:US:X  a.  U Vs/ s H   o3R                  S:X  d  M  UR                  PM"     nnO[        eUnSR                  S	/U-   5      $ s  snf s  snf )
Noriginal_atenr!  r$   source_fn_stackrr   r!   inductor_noder   fused)r  r  r   _overloadpacketrb   rm  r  rI   r   r1  r   NotImplementedErrorjoin)r  descriptive_namesall_originsoriginsources	source_fns         r+   get_fused_kernel_namer    s~   #M2KO+ &
%yyO+ B  6;;. B O,	 BFKK(88AA% 	 
 W&	g	%!FyyO+0AV[[0P"KK(9:2>	ilC00NN9Q<0NN9Q<#8#89 " W&	o	-&1
&1FYY/5QKFKKk 	 
 "!G88WI'((5
(
s"   F	F	 F	'F	FFc                  ^ [        U 5      nU Vs/ s H  o3R                  S:X  d  M  UPM     nn[        R                  " [        5      n[        R                  " [        5      nS m[        U5      (       a  U Vs1 s H  owR                  iM     nn[        U5      S:X  a^  US   R                  m[        TS5      (       d+  0 n	[        TR                  5       H	  u  pXU'   M     U	Tl
        UR                  U4S jS9  U H  nSUR                  ;   aO  UR                  S   b?  [        UR                  S   R                  5      nXl   R                  UR                   5        SUR                  ;   d  Mt  UR                  S   S   R                   nX\   R                  UR                   5        M     Tb  S	OS
nUR"                   SU SSR%                  UR'                  5       5       SSR%                  UR'                  5       5       S3nUR"                   S3/n[)        UR+                  5       5       HA  u  nnUR                  UR"                   SU SSR%                  [)        U5      5       35        MC     TbU  UR                  UR"                   S35        U H1  nUR                  UR"                   SUR-                  5        35        M3     USR%                  U5      4$ s  snf s  snf )Nr!  r!   r   )_inductor_kernel_metadata_node_to_idx_mapc                "   > TR                   U    $ rX   )r  )nsingle_graphs    r+   rX  %get_kernel_metadata.<locals>.<lambda>F  s    lTTUVWrG   rl  r  	from_nodezTopologically SortedUnsorted z Source Nodes: [r   z], Original ATen: []z" Source node to ATen node mapping:z   z => z Graph fragment:
)r  r  collectionsdefaultdictr  r&   r  r   r   nodesr  sortr   r   r  r1  r   commentr  keysrm  itemsformat_node)r  r  r  r  inductor_nodesfrom_node_dictoriginal_aten_dictr  unique_graphsnode_to_idx_mapidxr>  r  sort_strmetadatadetailed_metadataoriginal_noder  r  s                     @r+   get_kernel_metadatar  0  s   #M2K+6W;)):Vf;NW ,,T2N$006
 L
>*89.Q.9}")!,22L<)TUU"$'(:(:;FC),A& <IXFW    dii'DIIo,F,Rdii0@@AC#**4995$))#))K(+00C&&tyy1  *6)A%zH??
1XJ&6tyyATATAV7W6X Y99%7%<%<%>?@	C  $OO,,NOP &~';';'= >u  s=/diiu6N5OP	
 !?   GOO#44D!EFA $$'8AMMO;L%MN  
 TYY0111g X :s   K#K#:K(c                   [        U 5      n [        U 5      nU (       ak  U R                  5       nUR                   HB  nU(       a  U" U5      (       a  M  XB;  d  M   UR	                  U5        U R                  U5        MD     U (       a  Mk  U$ )zJReturns the set of nodes whose values depend on those within initial_queue)r  r  r'   r)  addr1  )initial_queueskip_filterdominated_setr>  users        r+   dominated_nodesr  h  sx     'M&M
  "JJD{400(!!$'$$T*  - rG   c                >  ^^ SS K nSSKJm  UU4S jmUR                  5        Vs/ s H  nT" U5      (       d  M  UR                  PM      nnU  Vs/ s H  nT" U5      (       d  M  UR                  PM      nn[        UR                  " / UQUQ76 5      $ s  snf s  snf )Nr   r!   r  c                  > [        U TR                  5      (       a  T" U R                  5      $ [        U TR                  5      (       a  T" U R                  5      $ [        U TR                  5      =(       a    [        U TR
                  5      $ rX   )rI   	TensorBoxdata
StorageBoxIRNode	Pointwise)r  r  is_unrealized_nodes    r+   r  *gather_origins.<locals>.is_unrealized_node  sd    a&&%aff--a''%aff--!RYY'GJq",,,GGrG   )	itertoolsr  r  r   r  r  chain)	rP   r=  r  valkwarg_originsr0  arg_originsr  r  s	          @@r+   gather_originsr  {  s    H -3MMOWOS?QRU?V[S[[OMW*.J$32DS2I;3;;$KJy<<m<== XJs   BBB&Bc                ^   [        U [        R                  5      (       a  U R                  $ [        U [        R                  5      (       a)  SR                  [        [        U R                  5      5      $ [        U [        R                  5      (       a)  SR                  [        [        U R                  5      5      $ [        U [        [        [        [        45      (       aC  U R                  R                   SSR                  [        [        U R                  5      5       S3$ [!        U 5      $ )z
Normal sympy str is very slow, this is a lot faster.  The result are
somewhat worse, as it doesn't do as much simplification.  So don't
use this for final codegen.
z + z * (r   ))rI   rJ   Symbolr   rK   r  rN   	sympy_strrP   Mulr7   r4   r5   r6   funcrb   r   )exprs    r+   r  r    s     $%%yy$		""zz#i344$		""zz#i344$(HhGHH))$$%QtyyY		1J'K&LANNt9rG   c                    SSK Jn  [        R                  (       a9  [	        UR
                  SS 5      =n(       a  UR                  S:w  a  [        U 5      $ [        R                  " 5       $ )Nr!   r  current_node
index_expr)
r  r  r<   compute_all_boundsr#   interpreterr'  r:   r;   unknown)r   r  fx_nodes      r+   get_bounds_index_exprr    sN     	!!~tDDWDNNl*5!!""$$rG   c                D    U [         R                  :w  d   e[        XSSS9$ )1
Used to generate an integer-nonnegative symbol.
Tintegernonnegative)r9   SIZEr8   )prefixr  s     r+   sympy_index_symbol_with_prefixr    s'     TYY vDdCCrG   c                b    U =(       d    [         R                  =(       a    [         R                  $ rX   )r<   debug_index_assertsassert_indirect_indexing)checks    r+   generate_assertr    s    /V//TV5T5TTrG   c                D    U S   S:w  d   e[         R                  " U SSS9$ )r  r   sTr  )rJ   r  r   s    r+   sympy_index_symbolr    s)     7c>> <<d==rG   c                    S n[         R                  " U 5      R                  UR                  5        VVs0 s H  u  p4X2" X45      _M     snn5      $ s  snnf )z
When the passed replacement symbol v is a string, it is converted to a symbol with name v that
have the same replaced expression integer and nonnegative properties.
c                    [        U [        R                  5      (       d   e[        U[        5      (       a*  [        R                  " UU R
                  U R                  S9$ U$ )Nr  )rI   rJ   r   r   r  rh   is_nonnegative)replacedreplacements     r+   	to_symbolsympy_subs.<locals>.to_symbol  sT    (EJJ////k3''<< ++$33  rG   )rJ   r   xreplacer  )r  replacementsr  krS   s        r+   
sympy_subsr    sP    	 ==''(4(:(:(<=(<IaO	(<= =s   A
c                   [        U [        R                  5      =(       dd    [        U [        R                  5      =(       aC    [	        S [
        R                  " U R                  5       U R                  5       5       5       5      $ )Nc              3  8   #    U  H  n[        U5      v   M     g 7frX   is_symbolicrw   r(   s     r+   ry   is_symbolic.<locals>.<genexpr>  s     N(M1A(Mr{   )	rI   r$   SymIntr7  r  r  r  rd  stride)r   s    r+   r  r    sS    a& 1ell# 	ON	!((*(MNNrG   c                 &    [        S U  5       5      $ )Nc              3  8   #    U  H  n[        U5      v   M     g 7frX   r  r  s     r+   ry   "any_is_symbolic.<locals>.<genexpr>  s     ,t!{1~~tr{   r  )rP   s    r+   any_is_symbolicr    s    ,t,,,rG   c                H   SSK Jn  1 Skn[        R                  " 5       (       a  UR	                  1 Sk5        U R
                  R                   HQ  n[        UR                  5      U;   a  Us  $ UR                  R                  S5      =nc  M@  U" U5      (       d  MO  Us  $    g )Nr   )free_unbacked_symbols>   run_with_rng_staterun_and_save_rng_stateaten._assert_scalaraten._local_scalar_densefbgemm.dense_to_jagged.default%fbgemm.jagged_to_padded_dense.default,aten._fused_moving_avg_obs_fq_helper.default7aten._fused_moving_avg_obs_fq_helper_functional.default>   aten.scatter.srcaten.scatter_add_aten.scatter.reduceaten.index_put.defaultaten.index_put_.defaultaten.scatter_reduce.twoaten.scatter_add.defaultaten.scatter_reduce_.twoaten.scatter.value_reduceaten.scatter_reduce.two_outaten._unsafe_index_put.default0aten._unsafe_masked_index_put_accumulate.defaultr  )%torch.fx.experimental.symbolic_shapesr  r$   $are_deterministic_algorithms_enabledupdater  r  r   r'  r   get)r?  r  forbidden_setr>  r  s        r+   %get_first_incompatible_cudagraph_noder-    s     LM 1133	
  t{{},K99==''C49Ns9S9SK	 
 rG   c                    [        [        [        U R                  R                  5      5      5      nUR
                  S:X  d   eU$ )z$Get the output node from an FX graphr;  )nextiterreversedr  r  r  )r?  	last_nodes     r+   output_noder3    s6    T(288>>234I<<8###rG   z	List[Any]_registered_cachesc                    [        U S5      (       a  [        U R                  5      (       d  [        U  S35      e[        R                  U 5        U $ )ze
Use this decorator to register any caches that should be cache_clear'd
with fresh_inductor_cache().
cache_clearz# does not have a cache_clear method)r   callabler6  AttributeErrorr4  r1  r[  s    r+   clear_on_fresh_inductor_cacher:     sE    
 3&&hs.G.Gu$GHIIc"JrG   c                 >    [          H  n U R                  5         M     g)z
Clear all registered caches.
N)r4  r6  r9  s    r+   clear_inductor_cachesr<  ,  s     " "rG   c              #  V  #    [        5         [        R                  " US9n [        R                  R                  [        R                  SU05         [        R                  SU5        [        R                  R                  US5      n[        R                  R                  [        R                  SU05         Sv   [        U [
        5      (       a  [        U 5      S:X  d   S5       e[        R                  R                  U5      (       a{  [        R                  " U5      nU R!                  U Vs0 s HH  nS	U;  d  M  U[        R                  R#                  [        R                  R                  XF5      5      _MJ     sn5        SSS5        SSS5        U(       a  [$        R&                  " U5        [        5         gs  snf ! , (       d  f       NC= f! , (       d  f       NL= f! [(         a&    [*        (       d   [        R-                  S
U5        e  Naf = f! [        5         f = f7f)z
Contextmanager that provides a clean tmp cachedir for inductor.

Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
generated with this cache instance.
)dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNr   z!expected empty cache_entries dictz.lockz(on error, temporary cache dir kept at %s)r<  tempfilemkdtempr   patchdictosenvironr   r   pathr  rI   r&   existslistdirr*  getsizeshutilrmtree	Exception_IS_WINDOWSwarning)cache_entriesr>  deleteinductor_cache_dirtriton_cache_dirfilesfs          r+   fresh_inductor_cacherW  4  s     !))c2 ZZ__JJ24FG
 II35GH!ww||,>I.@BR-STmT22}-2W4WW2ww~~&677 "

+; <%,, */).A#*!#3 !V277??277<<@P3T#U U). U
$ MM,- 	# UT
 
(  { KKBDVW  	s   H)0G' A'G9A9G2
G  AG GG%G' 5H) G
G	G
G$ G' '-HH HH H&&H)c           
     z    U R                   n[        [        U 5      5      n[        [	        [        X!SS95      5      $ )NT)r  reverse)__getitem__r   r&   r  r1  rm  )seqgettera_rs      r+   argsortr^  `  s/    __F
C/C>?@@rG   c           	     @  ^  U 4S jn[        U5       VVs/ s H>  u  p4U[        U[        R                  5      (       a  UR                  R
                  OU4PM@     nnn[        U[        R                  " U5      S9nU VVs/ s H  u  p6UPM	     nnnU$ s  snnf s  snnf )Nc                z   > U u  p#Uu  pEU4S jnU" X5:  5      (       a  gU" X5:  5      (       a  gX$:  a  gX$:  a  gg)Nc                R   > [        U [        5      (       a  U $ TR                  U SS9$ )NT)size_oblivious)rI   r   evaluate_expr)r  r  s    r+   evaluate*argsort_sym.<locals>.cmp.<locals>.evaluaten  s+    $%%**4*EErG   rr   r!   r   r_   )r   r   a_idxa_valb_idxb_valrd  r  s          r+   rb  argsort_sym.<locals>.cmpj  sN    	F
 EM""EM""
 ==rG   rl  )	r   rI   r$   r  r>  r  rm  r   
cmp_to_key)r  r[  rb  r  r  exprsr   rK  s   `       r+   argsort_symrm  g  s    4  n$FC 
Z5<<88affkka@$ 
  5i22378E %&fccF&M
 's   ABBc                H    [         R                  " SU S9R                  5       $ )Nr_   rl   )r$   r}   element_sizero  s    r+   get_dtype_sizerq    s    ;;r'4466rG   c                       \ rS rSr% S\S'   Srg)LineContexti  r   contextr_   Nrb   rc   rd   re   __annotations__rj   r_   rG   r+   rs  rs    s    LrG   rs  c                      \ rS rSrSrSS jrSS jrSS jrSS jrS r	S r
S	 rS
 rS rS rSS jrSS jrSS jrSS jrSS jrS rS rSrg)IndentedBufferi     c                    / U l         Xl        g rX   )_lines_indent)rv  initial_indents     r+   __init__IndentedBuffer.__init__  s    %rG   c                   [        5       nSn/ nU R                   H  n[        U[        5      (       a  U" 5       nUc  M$  O3[        U[        5      (       a  UR                  X$R                  45        MX  [        U[        5      (       d   eUR                  U5        UR                  S5        USUR                  S5      -   -  nM     UR                  5       U4$ )Nr!   r  )r   r{  rI   DeferredLineBasers  r1  rt  r   writecountgetvalue)rv  bufr   linemaplines        r+   getvaluewithlinemap"IndentedBuffer.getvaluewithlinemap  s    jKKD$ 011v<  D+..<<01dC((((IIdOIIdOTZZ%%%A   ||~w&&rG   c                *    U R                  5       u  pU$ rX   )r  )rv  rS   r   s      r+   r  IndentedBuffer.getvalue  s    '')rG   c                   [        5       nU R                   H  n[        U[        5      (       a  U" 5       nUc  M$  O[        U[        5      (       a  M<  [        U[
        5      (       d   eUR                  S5      (       a  UR                  US S 5        M  UR                  U5        UR                  S5        M     UR                  5       $ )N\rr   r  )	r   r{  rI   r  rs  r   endswithr  r  )rv  r  r  s      r+   getrawvalueIndentedBuffer.getrawvalue  s    jKKD$ 011v<  D+..dC((((}}T""		$s)$		$		$   ||~rG   c                8    U R                   R                  5         g rX   )r{  clearru  s    r+   r  IndentedBuffer.clear  s    rG   c                ,    [        U R                  5      $ rX   )r   r{  ru  s    r+   __bool__IndentedBuffer.__bool__  s    DKK  rG   c                :    SU R                   U R                  -  -  $ )Nr  )r|  tabwidthru  s    r+   r  IndentedBuffer.prefix  s    dllT]]233rG   c                &    U R                  S5        g )Nr  	writelineru  s    r+   newlineIndentedBuffer.newline  s    trG   c                   [        U[        5      (       a  U R                  R                  U5        g [        U[        5      (       a9  U R                  R                  UR                  U R                  5       5      5        g UR                  5       (       a.  U R                  R                  U R                  5        U 35        g U R                  R                  S5        g Nr  )rI   rs  r{  r1  r  with_prefixr  striprv  r  s     r+   r  IndentedBuffer.writeline  s    dK((KKt$.//KKt//>?ZZ\\KK$++-78KKr"rG   c                8    U H  nU R                  U5        M     g rX   r  )rv  linesr  s      r+   
writelinesIndentedBuffer.writelines  s    DNN4  rG   c                H   ^ ^ [         R                  UU 4S j5       nU" 5       $ )Nc               3     >#    T=R                   T -  sl          S v   T=R                   T -  sl         g ! T=R                   T -  sl         f = f7frX   r|  )offsetrv  s   r+   r  "IndentedBuffer.indent.<locals>.ctx  s8     LLF"L'&&s   A4 AAA)
contextlibcontextmanager)rv  r  r  s   `` r+   indentIndentedBuffer.indent  s$    		"	"	' 
#	' urG   c                .    U =R                   U-  sl         g rX   r  rv  r  s     r+   	do_indentIndentedBuffer.do_indent      rG   c                .    U =R                   U-  sl         g rX   r  r  s     r+   do_unindentIndentedBuffer.do_unindent  r  rG   c           	        [        U[        5      (       a  [        S5      nUR                   HR  n[        U[        5      (       a  M  U(       d  M#  [        U[        U5      [        UR                  5       5      -
  5      nMT     [        R                  " U5      (       a  SnUR                   HV  n[        U[        5      (       a  U R                  R                  U5        M5  [        R                  X[        U5      S  5        MX     g [        R                  " U5      nU(       a  UR                  5       nU(       d  g UR                  5       nUR!                  S5       H  nU R                  U5        M     g )Ninfr   r  )rI   rx  floatr{  rs  minr&   r  mathisinfr1  r  rY   textwrapdedentrstripr  )rv  
other_coder  r  r  s        r+   spliceIndentedBuffer.splice  s   j.115\F"))!$44 TS5G)GHF * zz&!!"))dK00KK&&t,",,TF3FG	 * "4J'..0
#**,J"((.t$ /rG   c                    [        U R                  S9nU R                   Vs/ s H
  o1" U5      PM     snUl        U$ s  snf N)r}  )rx  r|  r{  )rv  r  r   r  s       r+   rN   IndentedBuffer.map  s8    DLL9-1[[9[Td4j[9

 :s   =c                @    [        U 5       SU R                  5        S3$ )Nr  r  )r   r  ru  s    r+   __repr__IndentedBuffer.__repr__
  s     t*Qt}}/q11rG   c                    U R                   UR                   :X  d   e[        U R                   S9nUR                  U R                  5        UR                  UR                  5        U$ r  )r|  rx  r  r{  )rv  otherr   s      r+   __add__IndentedBuffer.__add__  sK    ||u}},,,DLL9t{{#u||$
rG   )r|  r{  Nr   )ra   z)tuple[str, list[tuple[int, LineContext]]]ra   r   rV   )F)r  zCallable[[Any], Any]ra   rx  )rb   rc   rd   re   r  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  rN   r  r  rj   r_   rG   r+   rx  rx    s\    H&'$$!4#!	%.
2rG   rx  c                  2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )FakeIndentedBufferi  c                "   > [         TU ]  5         g rX   )superr~  )rv  	__class__s    r+   r~  FakeIndentedBuffer.__init__  s    rG   c                V    US:X  a  [         R                  X5      $ [        SU S35      e)Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )rv  r   s     r+   r  #FakeIndentedBuffer.__getattribute__  s9    ;**466!$ (= =
 	
rG   r_   r}  )rb   rc   rd   re   r~  r  rj   __classcell__r  s   @r+   r  r    s    
 
rG   r  c              #  x   #     S v   U [         l        U[         l        g ! U [         l        U[         l        f = f7frX   )sysstdoutstderr)initial_stdoutinitial_stderrs     r+   restore_stdout_stderrr  $  s,     $#
#
 $
#
s   : :7:c                  P    \ rS rSrSrS rSS jrSS jrS rS r	S r
S	 rS
 rSrg)r  i-  z.A line that can be 'unwritten' at a later timec                >    UR                  5       (       d  SnXl        g r  )r  r  r  s     r+   r~  DeferredLineBase.__init__0  s    zz||D	rG   c                    [         e)zJReturns either self.line or None to indicate the line has been 'unwritten'r  ru  s    r+   r{  DeferredLineBase.__call__5      !!rG   c                    [         e)z3Returns a new deferred line with the same conditionr  r  s     r+   	_new_lineDeferredLineBase._new_line9  r  rG   c                @    U R                  U U R                   35      $ rX   r  r  )rv  r  s     r+   r  DeferredLineBase.with_prefix=  s    ~~455rG   c                T    U R                  U R                  R                  5       5      $ rX   )r  r  r  ru  s    r+   r  DeferredLineBase.lstrip@  s    ~~dii..011rG   c                >    U R                  U R                  U   5      $ rX   r  )rv  r   s     r+   rZ  DeferredLineBase.__getitem__C  s    ~~dii.//rG   c                ,    [        U R                  5      $ rX   )r   r  ru  s    r+   r  DeferredLineBase.__bool__F  s    DIIrG   c                ,    [        U R                  5      $ rX   )r&   r  ru  s    r+   __len__DeferredLineBase.__len__I  s    499~rG   )r  N)ra   Optional[str])r  r   ra   r  )rb   rc   rd   re   rf   r~  r{  r  r  r  rZ  r  r  rj   r_   rG   r+   r  r  -  s-    8
""620rG   r  c                  D   ^  \ rS rSrSrSU 4S jjrSS jrS	S jrSrU =r	$ )
DelayReplaceLineiM  z6At end of codegen call `line.replace(key, value_fn())`c                <   > [         TU ]  U5        Xl        X l        g rX   )r  r~  r  value_fn)rv  r  r  r  r  s       r+   r~  DelayReplaceLine.__init__P  s     rG   c                j    U R                   R                  U R                  U R                  5       5      $ rX   )r  replacer  r  ru  s    r+   r{  DelayReplaceLine.__call__U  s#    yy  4==?;;rG   c                D    [        U R                  U R                  U5      $ rX   )r  r  r  r  s     r+   r  DelayReplaceLine._new_lineX  s    $-->>rG   )r  r  )r  r   r  zCallable[[], str]r  r   r  )r  r   ra   r  )
rb   rc   rd   re   rf   r~  r{  r  rj   r  r  s   @r+   r  r  M  s    @!
<? ?rG   r  c                   [        U [        R                  5      (       a  U nO[        R                  " SU 5      n[        R                  " U5      n[        R
                  R                  (       aF  UR                  c   eUR                  S:  d  UR                  S:X  a  [        R                  S5        ggSnUR                  nXC:  a  [        R                  SX4S	.S
9  gg)Nr   	   
   z6GPU arch does not support max_autotune_gemm mode usageFTD   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)rI   r$   rm   r   createversionhipmajorr   rP  multi_processor_count)index_or_devicerm   propr
  r  s        r+   
is_big_gpur  \  s    /5<<00 fo6""6*D }}zz%%%::>TZZ2-KKPQG**I:%> 	 	
 rG   c                 ~    [         R                  =(       d'    [         R                  =(       d    [         R                  $ rX   )r<   max_autotunemax_autotune_gemmsearch_autotune_cacher_   rG   r+   use_max_autotuner  y  s&    Wv77W6;W;WrG   c                    U R                   R                  S:H  =(       a+    U R                  U;   =(       a    [        U R                   5      $ )Nr   )rm   r   rl   r  )layoutallowed_layout_dtypess     r+   _use_template_for_cudar    s<    f$ 	&LL11	&v}}%rG   c                    U R                  5       [        R                  R                  5       R                  S5       Vs/ s H  oR	                  5       PM     sn;   $ s  snf N,)upperr<   max_autotune_gemm_backendsr  r  backendr(   s     r+   _use_autotune_backendr%    P    ==?!<<BBDJJ3OOa	O      Ac                    U R                  5       [        R                  R                  5       R                  S5       Vs/ s H  oR	                  5       PM     sn;   $ s  snf r  )r!  r<   max_autotune_conv_backendsr  r  r#  s     r+   _use_conv_autotune_backendr*    r&  r'  F)enable_int32enable_float8c                  SSK JnJn  [        R                  [        R
                  [        R                  /nU(       a>  [        R                  [        R
                  [        R                  [        R                  /nU(       a/  UR                  [        R                  [        R                  /5        U R                  R                  S:H  =(       a    [        X5      =(       d/    U R                  R                  S:H  =(       a    U R                  U;   =(       a@    [        5       =(       a/    [!        S5      =(       a    U" U R                  UR"                  5      $ )Nr!   )BackendFeaturehas_backend_featurer   r   TRITON)codegen.commonr.  r/  r$   r   r   r   r   extendr   r   rm   r   r  rl   r  r%  TRITON_TEMPLATES)r  r+  r,  r.  r/  layout_dtypess         r+   use_triton_templater5    s    C]]ENNEMMBMu{{Se1153D3DEF ""f, B*6AO ""e+M0M		P 		P "(+		P  ~/N/NOrG   c                $   SSK Jn  UR                  R                  R	                  X-  U-  SS9nUS::  d  U[
        R                  R                  :  a  gSSKJ	n  [        R                  R                  (       a  g[        R                  [        R                  [        R                  [        R                   /n[#        X5      =(       a    [%        5       =(       a    ['        S5      nU(       a"  U" 5       (       d  [(        R+                  S	5        gU$ )
Nr!   r  rr   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir is set correctly. Skipping CUTLASS backend for now.)r  r  r  r  	size_hintr<   r   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsr9  r$   r  r  r   r   r   r   r  r  r%  r   rP  )	r  mr  r  r  	gemm_sizer9  r4  r   s	            r+   use_cutlass_templater@    s      **1519r*BIA~V[[%N%NN> }}]]ENNEMM5;;OMv5 	-	-!),  !##KK4
 JrG   c                T    [         R                  R                  U 5      R                  $ rX   )r$   r   get_device_propertiesgcnArchNamerm   s    r+   _rocm_native_device_arch_namerE    s    ::++F3???rG   c                      SS K n SSKJnJn  SSKJn  [        R                  R                  U R                  5      nXAX#4$ ! [         a    S nS n " S S5      nS n N"f = f)Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                     / $ rX   r_   r_   rG   r+   rG  *try_import_ck_lib.<locals>.gen_ops_library      IrG   c                     / $ rX   r_   r_   rG   r+   rH  .try_import_ck_lib.<locals>.gen_ops_preselected  rL  rG   c                      \ rS rSrSrg)*try_import_ck_lib.<locals>.CKGemmOperationi  r_   N)rb   rc   rd   re   rj   r_   rG   r+   rI  rP    s    rG   rI  )ck4inductor(ck4inductor.universal_gemm.gen_instancesrG  rH  ck4inductor.universal_gemm.oprI  rF  rH  dirname__file__r   )rQ  rG  rH  rI  package_dirnames        r+   try_import_ck_librW    sh    	
	
 ''//+*>*>? -@QQ  			 	 s   ;A A A c                   [        5       (       d  g[        R                  R                  (       d  gU R                  R
                  S:X  d  g[        U R                  5      n[        R                  R                   Vs0 s H  o"R                  S5      S   U_M     sn=(       d    UR                  S5      S   U0nUR                  5       [        R                  R                  -   Vs/ s H  nX2   PM	     nnU(       d  gU R                  [        R                  [        R                  [        R                   4;  a  g[#        5       u  n    nU(       d  [$        R'                  S5        g[        R(                  " 5       (       a  U[        R                  l        [        R                  R*                  (       d  [$        R'                  S5        gU[        R                  R*                  :w  a  [$        R'                  S5        ggs  snf s  snf )	NFr   :r   z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)r  r$   r  r  rm   r   rE  r<   rocmarchr  r  ck_supported_archrl   r   r   r   rW  r   rP  	is_fbcodeck_dir)r  native_archr  requested_archsrequested_supported_archsck_package_dirnamer   s          r+   use_ck_templaterc    s   ====' 0>K39;;3C3CD3Cawws|A)3CD #q!;IO
 !%%'&++*G*GG!GA 	G  ! %||EMM5>>5==II"3"51aBC/;;BCV[[///01= E!s   =H(Hc                    SSK Jn  [        S5      =(       a>    [        U 5      =(       a,    UR                  R
                  R                  X-  U-  SS9S:  $ )Nr!   r  CKrr   r7  r   )r  r  r%  rc  r  r  r;  )r  r>  r  r  r  s        r+   use_ck_gemm_templaterf    sP     	d# 	CF#	CGG&&quqy2&>BrG   c                <    [        S5      =(       a    [        U 5      $ )Nre  )r*  rc  r  s    r+   use_ck_conv_templateri     s    %d+G0GGrG   c                V    [        5       =(       a    U R                  R                  S:H  $ rB  )r  rm   r   rh  s    r+   _use_template_for_cpurk  $  s    =&--"4"4"==rG   c                Z    [        XUSS9=(       a    UR                  R                  5       $ )NF)require_constant_mat2)use_cpp_gemm_templater  is_contiguous)r  mat1mat2s      r+   use_cpp_bmm_templaterr  (  s(    fDN 	(KK%%'rG   c                   SSK Jn  SSKJn  SSKJn  SSKJn  [        U 5      (       a  [        S5      (       d  g[        R                  R                  (       d  gUR                  5       [        R                  :H  n	[        R                   [        R"                  [        R$                  [        R                  /n
U" UUU	(       a  U R&                  OS US9u  ppp[)        X45      (       a  g[+        X%R,                  5      (       a  UR/                  5       nU" UR                  5       5      u  pU" S	UUUUR                  5       UR                  5       U[1        5       S
9nS nU R&                  U
;   =(       aT    US L=(       aI    U" U5      =(       a:    [+        X%R2                  5      =(       a    UR5                  5       =(       d    U(       + $ )Nr!   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposed
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsc                N    U R                  5         U R                  5       S   S:H  $ )Nrr   r!   )freeze_layout
get_strider(   s    r+   is_last_dim_stride12use_cpp_gemm_template.<locals>.is_last_dim_stride1X  s"    	||~b!Q&&rG   )r  r  codegen.cpp_micro_gemmrt  codegen.cpp_utilsru  kernel.mm_commonrv  rk  r%  r<   cppweight_prepack	get_dtyper$   r   r   r   halfrl   has_free_symbolsrI   BaseViewunwrap_viewparallel_num_threadsr  is_module_buffer)r  rp  rq  ry  rm  r  rt  ru  rv  	int8_gemmr4  r>  r  r  r}  r   rz  r  s                     r+   rn  rn  /  sk    9M) ((0Ee0L0L::$$ EKK/I]]ENNEJJLM")"+&,,'	#A!T $$$!@AQROL"			NN$^^%!(*	J'
 	% 	Cd"	C%	C t]]+	C ""$A,A(ArG   c                 D    [        5       (       + =(       d    [        S5      $ )NATEN)r  r%  r_   rG   r+   use_aten_gemm_kernelsr  e  s    !!B%:6%BBrG   c                  Z    \ rS rSr% \R
                  " S5      rS\S'   S
S jrS r	S r
Srg	)DebugDirManagerii  r   r   prev_debug_namec                @    [        [        R                  5      U l        g rX   )r/  r  counterr   ru  s    r+   r~  DebugDirManager.__init__m  s    ../rG   c                    [         R                  R                  R                  U l        U R                   SU R
                   3U l        U R                  [         R                  R                  l        g )N_tmp_)r$   _dynamor<   debug_dir_rootr  r   new_nameru  s    r+   	__enter__DebugDirManager.__enter__p  sM    $}}33BB//0dggY?.2mm+rG   c                    [         R                  " U R                  5        U R                  [        R
                  R                  l        g rX   )rL  rM  r  r  r$   r  r<   r  )rv  rP   s     r+   __exit__DebugDirManager.__exit__u  s*    dmm$.2.B.B+rG   )r   r  r  Nr}  )rb   rc   rd   re   r  r  r  rv  r~  r  r  rj   r_   rG   r+   r  r  i  s&    ooa G0<
CrG   r  c                   ^ SSK Jn  / mSU4S jjn[        R                  R	                  USU5         [
        R                  R                  5         U " U0 UD6nS S S 5        UT4$ ! , (       d  f       WT4$ = f)Nr!   GraphLoweringc                (   > TR                  U 5        g rX   r1  codesource_codess    r+   save_output_code*run_and_get_code.<locals>.save_output_code      D!rG   r  r  r   r  r  r   rD  r  r$   r  reset)r   rP   r=  r  r  rK  r  s         @r+   run_and_get_coder  z  ss    $ L" 
		=*<>N	OT$V$ 
P < 
P	O <s   'A&&
A7c                &   ^  U 4S jn[        U5      $ )Nc                 R   > T" 5       n U R                  5       R                  5         U $ rX   )r   backward)rK  r   s    r+   run_with_backward1run_fw_bw_and_get_code.<locals>.run_with_backward  s!    

rG   )r  )r   r  s   ` r+   run_fw_bw_and_get_coder    s    
 -..rG   c                t  ^^ SSK Jn  / mSU4S jjmS	U4S jjn[        R                  R	                  USU5         [        R                  R	                  UST5         [
        R                  R                  5         U " U0 UD6nSSS5        SSS5        T$ ! , (       d  f       N= f! , (       d  f       T$ = f)
zLGet the inductor-generated code, but skip any actual compilation or running.r!   r  c                (   > TR                  U 5        g rX   r  r  s    r+   r  "get_code.<locals>.save_output_code  r  rG   c                   >  " S S5      nU R                   (       a  U R                  5       OU R                  5       u  p#T" U5        U" 5       $ )Nc                  (    \ rS rSrSrSS jrS rSrg)@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulei  z4This is empty to replace the generated triton modulec                    g rX   r_   ru  s    r+   r~  Iget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__  s    rG   c                    g rX   r_   rz  s      r+   callEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.call  s    rG   r_   Nr}  )rb   rc   rd   re   rf   r~  r  rj   r_   rG   r+   DummyModuler    s    FrG   r  )cpp_wrappercodegen_with_cpp_wrappercodegen)rv  r  r  r   r  s       r+   patched_compile_to_module+get_code.<locals>.patched_compile_to_module  sD    	 	 04/?/?D))+T\\^ 	
 	}rG   compile_to_moduler  Nr  )rv  r  r  )r   rP   r=  r  r  r   r  r  s         @@r+   get_coder    s    $ L"( 
		*,E
zz(:<LM N
  NM
 
 s#   "B('BB(
B%	!B((
B7c                    [        U /UQ70 UD6nS[        U5      s=::  a  S::  d  O   S[        U5       35       eUS   $ Nr!      z%expected one or two code outputs got r   )r  r&   )r   rP   r=  r  s       r+   get_triton_coder    sS    B000L 	
S#!#C	.s</@.ABC#?rG   c                    [        U /UQ70 UD6u  p4S[        U5      s=::  a  S::  d  O   S[        U5       35       eUS   $ r  )r  r&   )r   rP   r=  r   r  s        r+   run_and_get_triton_coder    sU    &r;D;F;OA 	
S#!#C	.s</@.ABC#?rG   c                   ^^^ SSK Jm  SSKJn  UR                  m/ mUUU4S jn[
        R                  R                  USU5         U " U0 UD6nS S S 5        UT4$ ! , (       d  f       WT4$ = f)Nr   r  )CompiledFxGraphc                 h   > T" U 0 UD6  U S   n[        UT5      (       d   eTR                  U5        g )Nr  )rI   r1  )rP   r=  r  r  graph_lowerings	real_inits      r+   	fake_init-run_and_get_graph_lowering.<locals>.fake_init  s:    4"6"Q%////u%rG   r~  )torch._inductor.graphr  torch._inductor.output_coder  r~  r   rD  r  )	r   rP   r=  r  r  rK  r  r  r  s	         @@@r+   run_and_get_graph_loweringr    so    3;((IO& 
		?J		BT$V$ 
C ?"" 
C	B ?""s   	A
A-c              #     #    SSK Jn  UR                  U    n [        R                  " X5      UR                  U '   Sv   X2R                  U '   g! X2R                  U '   f = f7f)zs
Override the lowering of aten_op with override_fn.
The first argument of override_fn is the original lowering fn.
r   )loweringN)torch._inductorr  	loweringsr   partial)aten_opoverride_fnr  orig_fns       r+   override_loweringr    sY      )  )G.&/&7&7&M7#&-7#g7#s   A"'A  A"AA"c                   ^ ^^ SSK Jn  UR                  mUUU 4S jn[        R                  R
                  R                  USU5      $ )zf
Add hook functions to be called at the beginning and end of Scheduler.__init__.
Used for unit tests.
r   )	Schedulerc                F   > T" X5        T" X5      nT(       a  T" X5        U$ rX   r_   )	schedulerr  outr  post_fnpre_fns      r+   r  (add_scheduler_init_hook.<locals>.wrapper  s%    y i'I%
rG   r~  )torch._inductor.schedulerr  r~  unittestr   rD  r  )r  r  r  r  r  s   ``  @r+   add_scheduler_init_hookr    s9    
 4  G ==%%iWEErG   c                    [         R                  (       a  [        R                  U 5        g[        R	                  U 5        g)z
Warnings that will be actionable for PyTorch developers, but not
end users.  Allows us to easily disable them in stable releases but
keep them on for nightly builds.
N)r<   developer_warningsr   rP  info)msgs    r+   developer_warningr     s$       CrG   c                     [         R                  R                  S5      n U S-   [        [         R                  5      :  aV  [        [         R                  U S-      5      S:  a3  [         R                  U S-      S   S:w  a  [         R                  U S-      $ [         R                   H)  nUR                  S5      (       d  M  U[        S5      S s  $    g! [         a     NJf = f)a  
An experimental API used only when config.benchmark_kernel is true.

The benchmark name is only available at codegen time. So we can not
directly call it in benchmark_all_kernels which is run after codegen.

The function assumes the argument after --only is the benchmark name.
It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
scripts, this function may return None.

There are 2 flavors of --only argument we need handle:
1. --only model_name
2. --only=model_name
z--onlyr!   r   -z--only=N)r  argvr   r&   
ValueError
startswith)r  r0  s     r+   get_benchmark_namer    s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx>>)$$s9~'((   s   BC 
C"!C"c                &    [        S U  5       5      $ )Nc              3  *   #    U  H	  oS :H  v   M     g7fr!   Nr_   r	  s     r+   ry   is_ones.<locals>.<genexpr>,       %u!Avu   rM   r  s    r+   is_onesr  +      %u%%%rG   c                &    [        S U  5       5      $ )Nc              3  *   #    U  H	  oS :H  v   M     g7f)r   Nr_   r	  s     r+   ry   is_zeros.<locals>.<genexpr>0  r   r  r  r  s    r+   is_zerosr	  /  r  rG   c                &    [        S U  5       5      $ )Nc              3     #    U  HI  n[        U[        R                  5      (       d  M$  UR                  [        R                  " S 5      :H  v   MK     g7f)r   N)rI   r$   r7  rm   )rw   items     r+   ry    is_cpu_device.<locals>.<genexpr>4  s9      DdELL) 	+u||E**s
   #A*Ar  )inputss    r+   is_cpu_devicer  3  s       rG   c                    [        U [        R                  5      (       d   S5       eU R                  (       a  [        R
                  $ [        R                  $ )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rI   rJ   r   rh   r$   r   r   )r  s    r+   get_sympy_Expr_dtyper  ;  sI    UZZ  BAB  ~~{{}}rG   c              /     #    U (       a.  [         R                  R                  " U0 UD6 nUv   S S S 5        g S v   g ! , (       d  f       g = f7frX   )r$   r   r   )should_profilerP   r=  r   s       r+   maybe_profiler  E  s;     ^^##T4V4G 54 	 54s   (A=A
AAc                 p    [         R                  R                  n U S:  a  [        R                  " 5       n U $ )Nr!   )r<   r  threadsr$   get_num_threads)r  s    r+   r  r  N  s+    jj  G{'')NrG   c                     SSK Jn   U " 5       nUR                  S[        R                  R
                  (       a  S5      $ S5      $ )Nr!   )get_backend_options
num_stagesr     )runtime.triton_helpersr  r+  r$   r  r  )r  optionss     r+   get_backend_num_stagesr  U  s2    ;!#G;;|%--*;*;QCCCCrG   c                .   SSK JnJn  U [        R                  [        R
                  [        R                  4;   d   e[        R                  " U5      R                  R                  S5      (       a  SSKJn  U" 5       nU [        R                  [        R
                  4;   a  U" X5      $ [        R                  R                  R                  R                   (       a  U" [        R                  U5      $ U" [        R                  U5      $ U [        R                  [        R
                  4;   a  U" U 5      $ [        R                  R                  R                  R                   (       a  U" [        R                  5      $ U" [        R                  5      $ )Nr   )get_max_simd_tflopsget_max_tensorcore_tflops
clock_rate)max_clock_rate)triton.testingr   r!  r$   r   r   r   inspect	signature
parametersr+  torch._utils_internalr#  backendsr   matmul
allow_tf32)rl   r   r!  r#  sm_clocks        r+   get_device_tflopsr-  ]  s   MU]]ENNEMMBBBB,-88<<\JJ8!#U]]ENN33,U==>>%%00,U]]HEE&u}}h??U]]ENN33,U33>>%%00,U]];;&u}}55rG   c                     SSK Jn   U " 5       $ )Nr   get_dram_gbps)r$  r0  r/  s    r+   get_gpu_dram_gbpsr1  y  s    ,?rG   c                 x    SSK Jn   U R                  R                  R	                  S5      R                  SS5      $ )Nr   drivermax_shared_mem)triton.runtimer4  activeutilsrB  r+  r3  s    r+   get_gpu_shared_memoryr9    s.    %==44Q7;;<LaPPrG   c                $    U R                  S5      $ )Nwelford)r  reduction_types    r+   is_welford_reductionr>    s    $$Y//rG   c                *    [        U 5      (       a  S$ S$ )Nr  r!   )r>  r<  s    r+   reduction_num_outputsr@    s    $^441;!;rG   c                 2    [         R                  " 5       S:H  $ )NLinux)platformsystemr_   rG   r+   is_linuxrE    s    ??''rG   c                 (    [         R                  S:H  $ )Nr>   )r  rC  r_   rG   r+   
is_windowsrG    s    <<7""rG   c                &    [        S U  5       5      $ )Nc              3     #    U  H7  n[        U[        R                  5      =(       a    UR                  (       + v   M9     g 7frX   )rI   rJ   r   	is_numberr	  s     r+   ry   #has_free_symbols.<locals>.<genexpr>  s)     Jcz!UZZ(<_<cs   ?Ar  )itrs    r+   r  r    s    JcJJJrG   c            	        SSK Jn  U  H  n[        X!R                  UR                  UR
                  UR                  UR                  45      (       aR  [        UR                  5       =(       d    S5      (       d'  [        UR                  5       =(       d    S5      (       a    gM  [        X!R                  5      (       d  M  [        S[        U5       35      e   g)Nr!   r  r_   Tzunexpected type for is_dynamic F)r  r  rI   r  r  r  ComputedBufferBufferr  maybe_get_sizemaybe_get_strider  	TypeErrorr   )rP   r  ts      r+   
is_dynamicrT    s    bmmR[[":K:KRYYW
 
   0 0 2 8b99=M""$*> > > Ayy))=d1gYGHH  rG   c                      \ rS rSrSrSrSrg)Placeholderi  KERNEL_NAMEDESCRIPTIVE_NAMEr_   N)rb   rc   rd   re   rW  rX  rj   r_   rG   r+   rV  rV    s      K *rG   rV  c                x   SSK Jn  [        R                  " SSSS9 n[        R
                  " 5       n[        R
                  " 5       n[        U[        U5      S9R                  " U6   [        SUR                   3US	9  [        UR                  US	9  [        R                  " 5       n[        X5         U " UR                  5        S S S 5        [        R                  " 5       U-
  n	U" UR                  5        UR                  R                  5         UR                  5         [        S
UR                   3US	9  [        UR                  US	9  UR!                  5       UR!                  5       :H  n
["        R%                  SUUR&                  U
U	5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f)Nr!   )stable_topological_sortwzutf-8F)modeencodingrR  )r?  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherrZ  rB  NamedTemporaryFileior   r2   r.   	propagaterP  r  r   nowr1   lint	recompiler  r   r  r   )r  r?  inpr  rZ  rV  	before_ioafter_io
start_timetime_elapsedrS  s              r+   pass_execution_and_saverl    sF   8		$	$
 
KKM	;;=R#3C#89CCSI	"(($1-bhhY'\\^
#B,N -||~
2)


#!,bhhX& H$5$5$77hFF	
-
 
 -,
 
s%   BF+3FCF+
F(	$F++
F9c                N   SSK Jn  [        U 5      UR                  :H  =(       a    US L =(       d    U R                  UL =(       Gd`    [        U 5      UR
                  :H  =(       Ga@    [        [        R                  R                  S5      =(       a;    U R                  [        R                  R                  R                  R                  :H  =(       d    [        [        R                  R                  S5      =(       a;    U R                  [        R                  R                  R                  R                  :H  =(       df    [        [        R                  R                  S5      =(       a;    U R                  [        R                  R                  R                  R                  :H  $ )Nr!   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  r   _CollectiveKernelop_overloadFallbackKernelr   r$   r   torchrecrn  defaultro  rp  r>  r  r  s      r+   is_collectiverw    s1    	T
b***Ud
0Td>N>NRT>T  	T
b''' 	
 	

 		**,?@ U$$		(:(:(L(L(T(TT
 		**,DE E$$99%%<<DDE 		**,CD Y$$		(:(:(P(P(X(XX+rG   c                >    SSK Jn  [        U 5      UR                  :H  $ Nr!   r  )r  r  r   _WaitKernel)r>  r  s     r+   is_waitr{    s    :''rG   c                    SSK JnJn  [        X5      (       d   e[        X5      (       a  [	        S U R
                   5       5      $ [        U R                  5      $ )Nr   BaseSchedulerNodeGroupedSchedulerNodec              3  8   #    U  H  n[        U5      v   M     g 7frX   )contains_collectiver	  s     r+   ry   &contains_collective.<locals>.<genexpr>  s     @<a&q))<r{   )r  r~  r  rI   r  snodesrw  r>  snoder~  r  s      r+   r  r    sE    Qe////%..@5<<@@@UZZ((rG   c                    SSK JnJn  [        X5      (       d   e[        X5      (       a  [	        S U R
                   5       5      $ [        U R                  5      $ )Nr   r}  c              3  8   #    U  H  n[        U5      v   M     g 7frX   )contains_waitr	  s     r+   ry    contains_wait.<locals>.<genexpr>  s     :\=##\r{   )r  r~  r  rI   r  r  r{  r>  r  s      r+   r  r    sE    Qe////%..:U\\:::uzz""rG   c                    SSK Jn  [        U[        R                  R
                  5      (       a  U1n[        XR                  5      =(       a    U R                  U;   $ ry  )r  r  rI   r$   r  r  rs  rr  rv  s      r+   is_fallback_opr    sD    "ejj++,,Td--.I43C3Cr3IIrG   c                @    X!U    R                   R                  5          $ rX   )defining_oprh  )buf_namename_to_bufname_to_fused_nodes      r+   buf_name_to_fused_snoder    s    (3??HHJKKrG   c           	         U(       a  U" U 5      (       a  g UR                  U 5        U R                   H-  n[        UR                  X#5      nXa;   a  M   [	        UUUUUS9  M/     g )Ncriteria_cb)r  unmet_dependenciesr  r   find_recursive_deps_of_node)r  collected_node_setr  r  r  depdefining_op_for_deps          r+   r  r    sh     {5))5!''5HHk
 4##	
 (rG   c           
        U(       a  U" U 5      (       a  g UR                  U 5        U R                  5        H  nUR                   H  nUR                  c   eUR                  R	                  5       S:X  a  M2  UR                  R	                  5       U;  a  MR  X6R                  R	                  5          nXq;   a  Mu  [        UUUUUS9  M     M     g )NOUTPUTr  )r  get_outputsr)  r>  rh  find_recursive_users_of_node)r  r  r  r  r  or  user_ops           r+   r  r  3  s     {5))5! GGD99(((yy!!#x/yy!!#+==(););)=>G,(""'  !rG   c                   [         R                  R                  R                  (       a  SOSn[         R                  R                  R
                  (       a.  [         R                  R                  R                  5       (       d  gX-
  U-
  $ )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r  r   )r$   
_functorchr<   functionalize_rng_opsr  inline_inbuilt_nn_modulesr8  is_parameter_freezing)dynamo_gm_num_inputsaot_fw_gm_num_inputsnum_rng_seed_offset_inputss      r+   num_fw_fixed_argumentsr  L  sb     $$::  	66##99;;69SSSrG   c                   S nSn/ nU R                   R                   H8  nUR                  S:X  d  M  U" U5      (       a  UR                  U5        US-  nM:     U[	        [        [        U5      5      5      :X  d   e[        U5      $ )z6
Infers which inputs are static for a backwards graph
c                x    SU R                   ;  =(       a%    SU R                   ;  =(       a    SU R                   ;  $ )Ntangentsbwd_seedbwd_base_offsetr  r  s    r+   is_saved_tensor'count_tangents.<locals>.is_saved_tensorb  s5    aff$ 0!&&(0!/	
rG   r   r2  r!   )r  r  r  r1  r  r   r&   )fx_gr  	arg_countstatic_arg_idxsr  s        r+   count_tangentsr  ]  s    

 IOZZ44= q!!&&y1NI	  d5_)=#>????rG   c                  6    \ rS rSr% S\S'   S r\S 5       rSrg)	BoxedBooliu  r   r\   c                    U R                   $ rX   )r\   ru  s    r+   r  BoxedBool.__bool__y  s    zzrG   c                @    [        U [        5      (       a	  SU l        U $ gNF)rI   r  r\   r9  s    r+   disableBoxedBool.disable|  s    c9%%CIJrG   r_   N)	rb   rc   rd   re   rv  r  r  r  rj   r_   rG   r+   r  r  u  s     K  rG   r  c              #     ^ ^#    SSK Jn  UR                  mU U4S jn[        R                  R
                  R                  USU5         S v   S S S 5        g ! , (       d  f       g = f7f)Nr!   )PythonWrapperCodegenc                @   > TR                  U5        T" XX#/UQ70 UD6$ rX   r  )r  r   kernel_coder  rP   r=  kernel_listorig_define_kernels         r+   new_define_kernel2collect_defined_kernels.<locals>.new_define_kernel  s)    ;'!'XXQWXXrG   define_kernel)codegen.wrapperr  r  r  r   rD  r  )r  r  r  r  s   `  @r+   collect_defined_kernelsr    sQ     5-;;Y
 
			#	#o/@
 	
 
 
s   AA(	A	A(
A%!A(c                    U S-   $ )N__original__r_   r  s    r+    get_cloned_parameter_buffer_namer    s    .  rG   c                R    [        U [        5      (       d
  U b   U 5       eU [        ;   $ rX   )rI   r   r"   rD  s    r+   is_gpur    s)    fc""fn<f<4YrG   c                F    [        U [        5      (       d   e[        U 5      $ rX   )rI   r   r  rD  s    r+   device_need_guardr    s    fc""""&>rG   c                    [         R                  " 5       (       a  U [        R                  :X  a  gU [        R                  [        R
                  [        R                  1;   $ r  )r<   r]  r$   r   r   r   ro  s    r+   ,needs_fallback_due_to_atomic_add_limitationsr    s?     eu~~5ejj%..AAArG   c                   U R                   [        R                  R                  R                  [        R                  R                  R
                  4;   a  Uc  gU R                   [        R                  R                  R                  :X  a  SOSnUS U1;  =(       Gd&    U=(       a    [        U5      =(       a    [        U5      =(       d    U R                   [        R                  R                  R                  :H  =(       ap    US:H  =(       ad    U=(       a[    US:H  =(       aO    [        R                  R                  =(       a.    [        R                  R                  =(       d    [        5       S:g  =(       dJ    X:H  =(       a#    U[        R                  [        R                  1;   =(       d    [        R                   " 5       $ )NFr  r   r   r!   )overloadpacketr$   r   atenscatter_reduce_scatter_reducescatter_r  r  r<   r  fallback_scatter_reduce_sumdynamic_threadsr  r   r   r)  )rr  r=  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tys          r+   use_scatter_fallbackr    s]    	""IINN**EIINN,I,IJ	K" ++uyy~~/F/FFE 
 	tY// 	8 	8 H'H<YG		8 &&%))..*H*HH L%'LL  5(L 

66	L
 ++J/C/E/J	8 'SJ5::u{{:S,S	8 557!rG   c                   SSK JnJn  SSKJn  [        S[        U 5       S35        [        U 5       GH.  u  pE[        SUS S35        XRL a  [        S	5        M'  XQL a  [        S
5        M8  [        XS5      (       a  UR                  5       n[        U(       a  SOS S35        U(       a;  UR                  c   e[        SUR                  R                  R                   35        [        S5        UR                  R                   H  n[        U5        M     [        S5        UR                  R                   H  n[        U5        M     GM  [!        S[#        U5       35      e   g)z
An API that can be used in pdb to dump a node_schedule.
Right mainly dump the read/write dependencies but can add more as needed.
r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr  3rY  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr  r  r  r  rP  r&   r   rI   is_reductionr>  r  reduction_hintread_writesreadswritesr   r   )r  r  r  r  r  r>  is_redr  s           r+   dump_node_scheduler    s&   
 O7	M 236
:;}-	#al"$%%%&,,&&(FfU$/?@yy,,,01N1N0OPQ*''--c
 .+''..c
 / !9$t*FGG' .rG   c                z    SSK Jn  U" U R                  5       [        U R                  5      -  [
        -  S:H  5      $ )Nr   )statically_known_true)r(  r  storage_offsetrq  rl   GPU_ALIGN_BYTES)r   r  s     r+   tensor_is_alignedr    s:     L 				 >&,,#?	??RVWW rG   c                    [        U R                  R                  5      (       d  g[        R                  =(       d    [        U 5      $ r  )r  rm   r   r<   assume_aligned_inputsr  )example_inputs    r+   should_assume_input_alignedr    s5     -&&++,,''K+<]+KKrG   c                    [         R                  R                  R                  5       n U (       d  [        R
                  " 5       $ U R                  R                  nU(       d  [        R
                  " 5       $ UR                  5       $ rX   )	r$   _guardsTracingContexttry_getr  nullcontextr^  r  suppress_guards)tracing_contextr  s     r+   #maybe_get_suppress_shape_guards_ctxr  
  sb    
 mm22::<O%%''  ))33I%%''$$&&rG   c                "   [         R                  R                  R                  [        SS5         [
        R                  R                  5         SS KnSS K	nUR                  " 5       nUR                  " U5      nSSKJn  UR                  U5        UR                  nUR!                  UR"                  5        U " U0 UD6n	UR%                  5       n
UR!                  U5        UR'                  U5        S S S 5        X4$ ! , (       d  f       W	W
4$ = f)Nr   Tr   )output_code_log)r  r   rD  r  r<   r$   r  r  rb  loggingr   StreamHandlertorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGr  removeHandler)r   rP   r=  rb  r   log_capture_stringchr  
prev_levelrK  r  s              r+   run_and_get_cpp_coder    s     
			#	#FGT	:[[]""#56=""2&$**
  /T$V$'')  ,%%b) 
;  9! 
;	:  19s   CC==
Dc                    S n[        U 5      nUb  UR                  $ U  H:  n[        U[        R                  5      (       d  M$  UR
                  R                  s  $    g rX   )r.   r  rI   r$   r  r>  )r  r  r^  inputs       r+   shape_env_from_inputsr  2  sW    I (I """ eU\\**::''' 
 rG   c                >   ^ ^ [        T5      S:X  a  T $ SUU 4S jjnU$ )Nr   c                ,   > [        U T5        T" U 5      $ rX   )copy_misaligned_inputs)
new_inputsinputs_to_checkrG  s    r+   run)align_inputs_from_check_idxs.<locals>.runN  s    z?;Z  rG   )r  List[InputType])r&   )rG  r  r  s   `` r+   align_inputs_from_check_idxsr  G  s(     ?q ! ! JrG   c                X   SU R                  5       ;   a  SnO;[        S [        U R                  5       U R                  5       5       5       5      S-   n[        R
                  " X4S5      R                  5       n[        R
                  " X R                  5       U R                  5       5      $ )Nr   c              3  6   #    U  H  u  pUS -
  U-  v   M     g7fr  r_   )rw   shaper  s      r+   ry   )clone_preserve_strides.<locals>.<genexpr>[  s     T:Sf$:Ss   r!   rV   )rd  r   r   r  r$   
as_stridedclone)r(   needed_sizebuffers      r+   clone_preserve_stridesr   U  s    AFFH} T#affh
:STTWXX 	 a6<<>FFFFHahhj99rG   c                    U HS  nX   n[        U[        R                  5      (       d   eUR                  5       [        -  (       d  MF  [        U5      X'   MU     g rX   )rI   r$   r7  data_ptr	ALIGNMENTr   )r  check_inputs_idxsr   _inps       r+   r  r  a  sI     }$----==?Y&&248JM	 rG   c                    / nU HV  nX   n[        U[        R                  5      (       d  M(  UR                  5       [        -  S:X  d  ME  UR                  U5        MX     [        U5      [        U5      :w  a  U$ U$ )zO
We require all inputs to be aligned, so introduce a copy for any
that aren't.
r   )rI   r$   r7  r"  r#  r1  r&   )r  static_input_idxsaligned_static_input_idxsr  r  s        r+   remove_unaligned_input_idxsr)  k  sp     !# eU\\**0@90LQR/R%,,S1 ! $%->)??((rG   c                   SSK Jn  [        R                  " [        R                  5      R
                  nUR                  R                  R                  nUR                  R                  R                  R                  nUR                  R                  R                  X:*  5      (       a  gU" U 5      =(       a    U" U 5      U:*  $ )Nr!   r  T)r  r  r$   iinfor   r   r  r  r;  r  has_hintis_expr_static_and_true)r   r  int_maxr;  r,  s        r+   expr_fits_within_32bitr/  }  s    kk%++&**G  **Iww))22H 	ww//==A;29Q<722rG   c                  ^^^ [         R                  R                  R                  5       nUb  UR                  b  [        UR                  5      S:X  d   e[        U 5      mUR                   H  nUc  UR                  R                  S 5        M#  Sm[         R                  R                  R                  5       =n(       a  UR                  mUU4S jmUR                  R                  [        U4S jU 5       5      5        M     g g g )Nr   Fc                r   > Tc  [        U 5      $ T(       a  TR                  U 5      $ TR                  U 5      $ rX   )rY   deserialize_symexprevaluate_symexpr)r   fakify_first_callr  s    r+   map_expr4set_tracing_context_output_strides.<locals>.map_expr  s7     ("1v((<<Q??$55a88rG   c              3  4   >#    U  H  nT" U5      v   M     g 7frX   r_   )rw   r   r5  s     r+   ry   5set_tracing_context_output_strides.<locals>.<genexpr>  s     3OAHQKKs   )
r$   r  r  r  output_stridesr&   r  r1  r4  tuple)rH  compiled_graphrt  rl  r  r4  r5  r  s        @@@r+   "set_tracing_context_output_stridesr<    s    mm**224Gw55A7))*a///).9	#22E}&&--d3$)!--66>>@@3@(+(=(=%9 &&--e3O3O.OP 3  BrG   c                 4   [         R                  b  [         R                  $ [         R                  " 5       (       d  g[        R                  R                  5       (       a  g SSKJn   U [        R                  R                  S5      :  $ ! [         a     gf = f)NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
r<   fx_graph_remote_cacher]  r$   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher?  ModuleNotFoundErrorjustknobs_getval_intr>  s    r+    should_use_remote_fx_graph_cacherF    s    ##/+++,,..H  5#8#8#M#M8$    s   "B
 

BBc                2    [         R                  " SSU 5      $ )Nz[^a-zA-Z0-9_]r   )resubr  s    r+   normalize_namerJ    s    66"C..rG   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzz^.*[.]c                j    [         R                  S[        U 5      5      n[        R	                  X5      $ )z"Convert torch.dtype to triton typetl.)_triton_type_rerI  r   _triton_type_mappingr+  )rl   triton_type_names     r+   triton_typerP    s+    &**5#e*=##$4GGrG   c                    [         R                  X 5      nUR                  SS5      n[        [        U5      n[        U[        R                  5      (       d   eU$ )NrL  r  )_torch_triton_mappingr+  r  r#   r$   rI   rl   )rl   adjusted_type	type_namerx  s       r+   triton_type_to_torchrU    sM    )--e;M%%eR0Iy)Ii----rG   c                   U R                   (       + =(       a    U R                  5       UR                  5       :H  =(       a    U R                  5       UR                  5       :H  =(       a    U R                  UR                  :H  =(       a    U R                  UR                  :H  =(       ae    U R                  5       R                  5       UR                  5       R                  5       :H  =(       a!    U R                  5       UR                  5       :H  $ rX   )	is_mkldnnrd  r  rl   rm   untyped_storager"  r  r  r\   s     r+   is_same_tensorrZ    s    NN 	<IIK5::<'	<KKMU\\^+	< JJ%++%	< KK5<<'		<
   "++-1F1F1H1Q1Q1SS	< !U%9%9%;;rG   c                   U R                   =(       a    U R                  5       UR                  5       :H  =(       a    U R                  UR                  :H  =(       as    U R                  UR                  :H  =(       aS    [        R
                  R                  R                  U 5      [        R
                  R                  R                  U5      :H  $ rX   )rW  rd  rl   rm   r$   r   mkldnnr"  rY  s     r+   is_same_mkldnn_tensorr]    s     	PIIK5::<'	PJJ%++%	P KK5<<'	P II%%d+uyy/?/?/H/H/OOrG   c                     g)N)r  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorr_   r_   rG   r+   boolean_opsrk    s    rG   c                  *    \ rS rSr% S\S'   S\S'   Srg)OpDtypeRulei	  r   type_promotion_kindOptional[torch.dtype]override_return_dtyper_   Nru  r_   rG   r+   rm  rm  	  s    8800rG   rm  zDict[str, OpDtypeRule]op_dtype_propagation_rulesc                (    [        X5      [        U '   g rX   )rm  rq  )r   rn  rp  s      r+   #register_op_dtype_propagation_rulesrs  
	  s    
 (3(t$rG   c                    [         R                  R                  (       a4  U [        R                  [        R
                  4;   a  [        R                  $ U $ )z"Maybe upcast [b]float16 to float32)r<   r@  codegen_upcast_to_fp32r$   r   r   r   ro  s    r+   upcast_compute_typerv  	  s3    }}++%--00}}LrG   )frozen_defaultfrozenc              .   ^ SU4S jjnU c  U$ U" U 5      $ )Nc                   > [         R                  S:  a  [        R                  " U STS9$ [        R                  " U TS9$ )N)r  r  T)kw_onlyry  rx  )r  version_infodataclasses	dataclass)r[   ry  s    r+   wrapir_dataclass.<locals>.wrap	  s;    w&((d6JJ ((V<<rG   )r[   r?   ra   r?   r_   )r[   ry  r  s    ` r+   ir_dataclassr  	  s    = {9rG   c                     [         R                  R                  R                  5       n U b'  U R                  (       a  U R                  R
                  $ g rX   )r$   r  r  r  fw_metadatabw_donated_idxs)r  s    r+   get_donated_idxsr  ,	  s=    mm22::<O"'B'B**:::rG   )rS   r`   )   d   )r   zCallable[[], Any]ra   r  )ra   r   )rm   z"Union[Optional[torch.device], str]ra   ztorch.device)r   zIterable[sympy.Expr]ra   r`   )r   Sequence[sympy.Expr]r   r  ra   r`   )r   zIterable[_T]ra   zValuesView[_T])r   Union[int, sympy.Expr]r   r  ra   r  )r  z"Iterable[Union[int, torch.SymInt]]ra   zList[sympy.Expr])r  z Iterable[Union[int, sympy.Expr]]ra   zList[Union[int, torch.SymInt]])r  torch._ops.OpOverloadra   r   rX   )r%  z1Optional[Callable[[torch._ops.OpOverload], bool]]ra   r   )r   )rm   r   ra   r~  )r!   r   )rG  zCallable[..., Any]rI  rY   rm   r   ra   r  )r_   r  r  g      ?r   )rm   r   )r[  r   r\  r   )r[  r   r_  z	List[str])ra   rY   )r(   zTuple[_T, ...]ra   zList[_T])r   z!Callable[Concatenate[Any, P], RV]ra   zCachedMethod[P, RV])r  zIterable[torch.fx.Node]ra   zSet[torch.fx.Node])r  r`   ra   r   )r  r9   r  rY   ra   sympy.Symbol)r   r   ra   r  )r  r`   r  zDict[sympy.Expr, Any]ra   r`   )r   r   ra   z,TypeGuard[Union[torch.SymInt, torch.Tensor]])rP   r   ra   r   )r?  torch.fx.GraphModulera   zOptional[torch.fx.Node])r?  r  )r[  r   )NNT)ra   	List[int])r[  z.Sequence[Union[int, torch.SymInt, sympy.Expr]]ra   r  r  )r  zUnion[int, torch.device]ra   r   )r  zList[torch.dtype]ra   r   )r$  r   ra   r   )FT)ra   zTuple[Any, List[str]])r  r`   ra   torch.dtype)r=  r   ra   r   )r=  r   ra   rY   )rL  zIterable[Any]ra   r   )r  rY   r  rY   )r  r  )r   r   )rm   r  )rr  r  )r   torch.Tensor)r  r  )r  Sequence[InputType])rG   Callable[[List[InputType]], Any]r  Sequence[int]ra   r  )r(   r  )r  r  r$  r  ra   r~  )r  r  r'  r  ra   r  )r   r`   )r   r   ra   r   )rl   r  ra   r   )rl   r   ra   r  )r  r  r\   r  )rn  r   rp  ro  )rl   r  ra   r  )ry  r   )ra   zOptional[List[int]])
__future__r   r  r  r~  enumr   r%  rb  r  r   r  r   rF  rC  rH  rL  r  rB  r  rE  r  r   r   typingr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   rJ   r$   torch._inductor.runtime.hintsr   torch._prims_commonr   torch.utils._pytreer   r"   	lru_cacher,   torch._dynamo.device_interfacer-   torch._dynamo.utilsr.   torch.autogradr/   torch.autograd.profiler_utilr0   (torch.fx.passes.graph_transform_observerr1   torch.fx.passes.shape_propr2   torch.utils._sympy.functionsr3   r4   r5   r6   r7   torch.utils._sympy.symbolr8   r9   torch.utils._sympy.value_rangesr:   r;   r  r<   runtime.runtime_utilsr=   r   rO  	getLoggerrb   r   r?   r   	VarRangesr7  rY   r  	InputTypeGPU_KERNEL_BIN_EXTSr  r#  rD   rF   rO   FunctionrQ   r   r   r   r   r   r   r  r	  r  r  r#  r@  r|   rM  rU  r]  r`  rb  re  rn  ro  rp  rs  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r-  r3  r4  rv  r:  r<  r  rW  r^  rm  rq  rs  rx  r  r  r  r  r  r  r  r%  r*  r5  r@  rE  rW  rc  rf  ri  rk  rr  rn  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r  r  r-  r1  r9  r>  r@  rE  rG  r  rT  EnumrV  rl  rw  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r)  r/  r<  rF  rJ  rN  r  rR  compilerM  rP  rU  rZ  r]  rk  rm  rq  rs  rv  r  r  )r  rS   s   00r+   <module>r     s   "       	     	  	  
          $ U T    : C - UO	
 T  D 0 % 2 K 0  8 D  = llg%!T]UZZ'(	U5<<ell:;<	'7 	{Q'A-+2B XDX XB5
LENN Od T ;@
+)!)*@)) AF+	+++	)#.G OSK	2,' NT69GJ
  CI<?)'#$ cNTT"8WQU^ 8&)B52r 9=*&>"$%	DU	>.-)))X !# I "	 (  ( VA!B!!H Q7 7* ~ ~B
 
 $ $ @?' ? T 8 16U ,< T@ @ TR R6+\H> FJ3lCC C" /$N#& . .F&	)>&&   TD D T6 66 T Q0<(#K(*$)) *
D<()#JL
 MQ
, MQ2T" 0      !

B$&$NH>L'".*+" &	:994A9	9$ $3Q0&/ '#)*  +?*D*D*FG*F$!*FG  **Y'H	 T & 1 1 1
 68 2 78 1 D)   *S Hs   4Y