
    IЦiA                      S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKJr  S SKJrJrJrJrJrJrJrJrJrJrJrJrJr  S SKrS SKrS SKrS SKJ r J!r!  S SK"J#r#J$r$  S SK%J&r&  S SK'J(r(  S S	K)J*r*J+r+  S S
K,J-r-  SSK.J/r/J0r0J1r1J2r2J3r3  SSK4J5r5J6r6J7r7  SSK8J9r9  SSK1J:r:J;r;J<r<J=r=  SSK2J>r>J?r?J@r@JArA  SSKBJCrC  SSKDJErEJFrF  SSKGJHrHJIrI  SSKJJKrK  SSKLJMrMJNrNJOrOJPrPJQrQJRrRJSrSJTrTJUrUJVrVJWrW  SSKXJYrY  \R                  " \[5      r\\R                  R                  \[S5      r_\R                  R                  \[S5      r`\R                   " S S5      5       rb\R                   " S S\b5      5       rc " S S5      rd " S S 5      reS;S! jrf " S" S#5      rg        S<S$ jrh\R                  R                  R                  \R                  R                  R                  \R                  R                  R                  \R                  R                  R                  S%.ro " S& S'\d5      rp " S( S)\d5      rq " S* S+\d5      rrS=S, jrs        S>S. jrt " S/ S0\d5      ru " S1 S2\u5      rv " S3 S4\d5      rw S?       S@S5 jjrx\R                   " S6 S75      5       ry\R                  " 5       r{ " S8 S-5      r| " S9 S:5      r}g)A    )annotationsN)defaultdict)AnyCallableCounterDefaultDictDictGenericListOptionalSequenceSetTupleTypeVarUnion)countersdynamo_timed)get_metric_tableis_metric_table_enabled)free_unbacked_symbols)
OrderedSet)free_symbol_is_typeSymT)
has_triton   )commsconfigdependenciesirmetrics)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime)Dep	MemoryDepStarDepWeakDep)ComputedBufferget_device_typeMultiOutputMultiOutputLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)
green_textred_text)SimplifyIndexing)cache_on_selfcmpdevice_need_guardget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsIndentedBufferis_collectiveis_gpuis_waitsympy_product)Vfusionloop_orderingc                      \ rS rSr% S\S'   S\S'   S\S'   \R                  " \S9rS	\S
'   \R                  " \	S9r
S\S'   SS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSrg)SchedulerBufferJ   	Scheduler	schedulerz	ir.BuffernodeBaseSchedulerNodedefining_op)default_factoryList[NodeUser]usersr.   
mpi_bufferc                @    [        U R                  R                  5      $ N)hashrF   nameselfs    X/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torch/_inductor/scheduler.py__hash__SchedulerBuffer.__hash__T   s    DIINN##    c                   [        5       nU R                  5       nUR                  U S[        U R                  5      R
                   35        UR                  U SU R                  R                   35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        [        U R                  5      S::  a0  UR                  U SU R                   35        UR                  5       $ UR                  U S35        UR                  S5         U R                   H  nUR                  U S35        M     S S S 5        UR                  S	5        UR                  5       $ ! , (       d  f       N/= f)
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])r9   get_name	writelinetyperF   __name__layoutget_aliasespformatget_mutationslenrK   indentgetrawvalue)rR   resultrP   users       rS   	debug_strSchedulerBuffer.debug_strW   s   !}}D6DO$<$<#=>?D6DII,<,<+=>?v[9I9I9K1L0MNOv]74;M;M;O3P2QRStzz?avYtzzl;< !!## vZ01q! JJD$$vQZ0 ' " S!!!##	 "!s   *(F;;
G	c                6    U R                   R                  5       $ rN   rF   r[   rQ   s    rS   r[   SchedulerBuffer.get_namek       yy!!##rV   c                0   U R                   c   eU R                   R                  5       (       d  g U R                   R                  5       (       dV  U R                   R                  5       (       d7  [	        U R                   R                  5       [        R                  5      (       a4  [        R                  R                  R                  U R                   5        g [        [        R                  S5      (       a  U R                  5       [        R                  R                  ;   a  [        R                  R                  U R                  5          nXR                   R"                  ;   a$  U R                   R"                  U   R                   nO#U R                   R$                  U   R                   n[        R                  R                  R'                  UU R                   5        g [        R                  R                  R                  U R                   5        g )Nargs)rF   should_allocateget_inputs_that_alias_outputget_mutation_names
isinstanceget_output_specr   CommBufferLayoutr>   graphwrapper_codecodegen_allocationhasattrkernelr[   inplace_update_buffersrE   name_to_donated_buffername_to_bufcodegen_inplace_reuse)rR   input_buffer_nameinput_buffers      rS   allocateSchedulerBuffer.allocaten   sc   yy$$$yy((** II2244yy++--$))335r7J7JKKGG  33DII> AHHf%%188#B#BB !" ? ? P NN$I$II#~~DD% $   $~~99:KLQQGG  66		
 GG  33DII>rV   c                    U R                   c   e[        U R                   R                  [        R                  5      (       a  gU R
                   H$  n[        UR                   [        5      (       d  M$    g   gNFT)rF   rs   r_   r   
NoneLayoutrK   
OutputNode)rR   uses     rS   can_freeSchedulerBuffer.can_free   sW    yy$$$dii&&66::C#((J//  rV   c                4   0 nU Hr  n[        UR                  5      U;   a?  UR                  U[        UR                  5         5      U[        UR                  5      '   M[  X2[        UR                  5      '   Mt     [        UR	                  5       5      U l        g rN   )idrF   mergelistvaluesrK   )rR   rK   rf   r   s       rS   	set_usersSchedulerBuffer.set_users   sm    &(C#((|v%'*yy3881E'Fr#((|$'*r#((|$	 
 &--/*
rV   c                T    U R                   c   eU R                   R                  5       $ rN   )rF   rq   rQ   s    rS   r`   SchedulerBuffer.get_aliases   s%    yy$$$yy5577rV   c                T    U R                   c   eU R                   R                  5       $ rN   )rF   rr   rQ   s    rS   rb   SchedulerBuffer.get_mutations   %    yy$$$yy++--rV   )rK   Nreturnintr   strr   Noner   bool)rK   rJ   r   r   r   zSequence[str])r^   
__module____qualname____firstlineno____annotations__dataclassesfieldr   rK   r.   rL   rT   rh   r[   r   r   r   r`   rb   __static_attributes__ rV   rS   rB   rB   J   sl    
O""'--dCE>C.9.?.?3/J+ $$($?B+8.rV   rB   c                  $    \ rS rSr% SrS\S'   Srg)SchedulerDonatedBuffer   NOptional[BaseSchedulerNode]rH   r   )r^   r   r   r   rH   r   r   r   rV   rS   r   r      s    /3K,3rV   r   c                  N   \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S\S
'   S\S'   S9S jrS:S jrS;S jrS;S jrS;S jr	S<S jr
S;S jrS=S jr      S>S jrS?S jrS@S jrSAS jrSBS jr      SCS jrS=S jrSDS jrSDS jrS=S jrS=S jr    SES  jrS;S! jrS;S" jr\SDS# j5       r\SDS$ j5       rSFS% jrSGS& jrSHS' jr SIS( jr!SAS) jr"SAS* jr#SAS+ jr$SAS, jr%SAS- jr&SAS. jr'SAS/ jr(SJS0 jr)SAS1 jr*S=S2 jr+ SK     SLS3 jjr,\SMS4 j5       r-\SNS5 j5       r.SOS6 jr/S7r0g8)PrG      z7Tuple[torch.device, Tuple[Tuple[sympy.Expr, ...], ...]]groupdependencies.ReadWritesread_writeszOrderedSet[Dep]unmet_dependenciesr   	min_order	max_orderr/   mpi_nodec                     Xl         S U l        g )Nc                     / $ rN   r   )ro   kwargss     rS   <lambda>,BaseSchedulerNode.__init__.<locals>.<lambda>   s    BrV   )rE   debug_device_str)rR   rE   s     rS   __init__BaseSchedulerNode.__init__   s    $- ' 	rV   c           	     @   Xl         [        5       U l        [        5       U l        SU l        UR                  5        Vs/ s H  n[        U R                  UU S9PM     snU l        U R                   Vs0 s H  o3R                  5       U_M     snU l
        g s  snf s  snf )NF)rE   rF   rH   )rF   r   	ancestors
last_usagewrittenget_outputsrB   rE   outputsr[   outputs_by_name)rR   rF   outputbufs       rS   _init_from_node!BaseSchedulerNode._init_from_node   s    ,0	*4, L 	  **,/
 - .. 
 -/
 ,0<<<
+7CLLNC<<
/
<
s   B3Bc                V    [        U 5      R                   SU R                  5       < S3$ )Nz(name=)r]   r^   r[   rQ   s    rS   __repr__BaseSchedulerNode.__repr__   s'    t*%%&fT]]_,?qAArV   c                P   U R                  5       n[        5       nUR                  U S[        U 5      R                   S[        [        U SS5      5      R                   SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
35        UR                  5          U R                  5        H"  nUR                  UR                  5       5        M$     SSS5        UR                  S5         UR                  U R                  5       5        UR'                  5       R)                  5       $ ! , (       d  f       N]= f! [          a    ["        R%                  SSS9   NOf = f)#Longer form printout for trace logsrX   (rF   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        rZ   Ignoring error in debug_str()Texc_info)r[   r9   splicer]   r^   getattrra   r   writesr   readsrd   r   rh   r\   debug_str_extra	Exceptionlogwarningre   rstrip)rR   rP   r   outs       rS   rh   BaseSchedulerNode.debug_str   sv   }}

bd		QtGD&$$?@IIJ Kj))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 		
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   %7E36F 3
FF%$F%c                    g)N r   rQ   s    rS   r   !BaseSchedulerNode.debug_str_extra       rV   c                $    U R                  U 5      $ rN   )r   rQ   s    rS   _debug_str_for_device'BaseSchedulerNode._debug_str_for_device   s    $$T**rV   c                   [        U R                  SS 5      nSn[        U[        R                  R
                  R                  5      (       a$  SUR                  UR                  5       /SSS9-   nOe[        U[        R                  R
                  R                  5      (       a2  SUR                  UR                  5       UR                  5       /SSS9-   nU  U 3$ )Ndatar   z, F)shorten	multiline)r   rF   rs   torch	_inductorr   	Pointwise
str_helperget_size	Reductionget_reduction_sizeget_reduction_type)rR   
maybe_datadata_strs      rS   debug_str_short!BaseSchedulerNode.debug_str_short   s    TYY5
j%//"4"4">">??j33$$&'% 4  H 
EOO$6$6$@$@AAj33..0*2O2O2QR 4  H
 z""rV   c                p    [         R                  SU U R                  U R                  R                  5        g )Nz(%s: unmet_dependencies = %s, writes = %s)r   infor   r   r   rQ   s    rS   log_detailsBaseSchedulerNode.log_details  s,    6####		
rV   c                    g rN   r   )rR   self_dep	other_deps      rS   reorder_loops_by_dep_pair+BaseSchedulerNode.reorder_loops_by_dep_pair  s     	rV   c                X    U R                  U R                  R                  U5      5        g rN   )set_read_writesr   renamerR   renamess     rS   update_mutated_names&BaseSchedulerNode.update_mutated_names  s!    T--44W=>rV   c                X    U R                  U R                  R                  U5      5        g rN   )r  r   	with_readrR   deps     rS   add_fake_depBaseSchedulerNode.add_fake_dep  s!    T--77<=rV   c                B    [        S U R                  5        5       5      $ )Nc              3  n   #    U  H+  oR                  5       =(       d    UR                  5       v   M-     g 7frN   )r`   rb   ).0r   s     rS   	<genexpr>=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s*      
@ROO4!2!2!44@Rs   35)anyr   rQ   s    rS   has_aliasing_or_mutation*BaseSchedulerNode.has_aliasing_or_mutation  s%     
@D@P@P@R
 
 	
rV   c                f    Xl         U R                   R                  U l        U R                  5         g rN   )r   r   r   
prune_deps)rR   rws     rS   r  !BaseSchedulerNode.set_read_writes   s&    "&"2"2"8"8rV   c           	         U R                  5       n[        U Vs/ s H  oBR                  XD5      PM     sn5      nX1-
  U l        g s  snf rN   )used_or_aliased_buffer_namesr   getr   )rR   future_used_buffersmutation_real_nameused_buffersks        rS   set_last_usage BaseSchedulerNode.set_last_usage%  sD     88:!"VA#9#9!#?"VW&< #Ws   Ac                J    U R                    H  nUR                  5         M     g rN   )r   r   )rR   r   s     rS   mark_runBaseSchedulerNode.mark_run,  s    <<CLLN  rV   c                    [        S [        R                  " U R                  R                  U R                  R
                  5       5       5      $ )Nc              3  :   #    U  H  nUR                   v   M     g 7frN   rP   r  r  s     rS   r  6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>1  s      
W HHWs   )r   	itertoolschainr   r   r   rQ   s    rS   used_buffer_names#BaseSchedulerNode.used_buffer_names0  s?     
 t'7'7'='=t?O?O?V?VW
 
 	
rV   c                >  ^ [        5       m[        R                  " U R                  R                  U R                  R
                  5       Vs/ s H  nUR                  PM     nn[        U5      S:  a  UR                  5       nTR                  U5        [        R                  R                  R                  U5      (       aD  UR                  U4S j[        R                  R                  U   R                  5        5       5        [        U5      S:  a  M  T$ s  snf )Nr   c              3  8   >#    U  H  nUT;  d  M  Uv   M     g 7frN   r   )r  alias
used_namess     rS   r  ABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>A  s(      "5 J.	 E"5s   
	)r   r-  r.  r   r   r   rP   rc   popaddr>   rv   name_to_bufferr  extendrq   )rR   r  depsr4  s      @rS   r  .BaseSchedulerNode.used_or_aliased_buffer_names6  s    &0l
 !t'7'7'='=t?O?O?V?VW
W HHW 	 
 $i!m((*CNN3ww%%))#.. !"!7!7"224"5 	 $i!m 
s   Dc                N   ^  [        U 4S jT R                   5       5      T l        g )Nc              3  t   >#    U  H-  nUR                   TR                  R                  ;  d  M)  Uv   M/     g 7frN   )rP   rE   available_buffer_namesr  r  rR   s     rS   r  /BaseSchedulerNode.prune_deps.<locals>.<genexpr>K  s0      -
.xxt~~DDD C.s   (8	8r   r   rQ   s   `rS   r  BaseSchedulerNode.prune_depsJ  s#    ", -
..-
 #
rV   c                   ^ ^ SU 4S jjm[        U4S jT R                  R                   5       5      nT R                  T R                  R	                  U5      5        g )Nc                   > [        U [        5      (       d  gTR                  R                  U R                     R
                  nUR                  5       [        R                  R                  ;   $ NF)
rs   r(   rE   r}   rP   rH   r[   r>   rv   removed_operations)r  oprR   s     rS   should_prune7BaseSchedulerNode.prune_weak_deps.<locals>.should_pruneS  sL    c7++++CHH5AAB;;=AGG$>$>>>rV   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7frN   r   r  r  rH  s     rS   r  4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>Y  s      
1C\#5FCC1   !	!r  r%   r   r   )r   r   r   r  remove_reads)rR   	to_removerH  s   ` @rS   prune_weak_deps!BaseSchedulerNode.prune_weak_depsQ  sN    	?  
++11
 
	 	T--::9EFrV   c                D    [        XU R                  R                  5        g rN   )_prune_redundant_depsrE   r}   )rR   name_to_fused_nodes     rS   prune_redundant_deps&BaseSchedulerNode.prune_redundant_deps^  s     	d8R8RSrV   c                T    U R                   c   eU R                   R                  5       $ rN   )rF   get_operation_namerQ   s    rS   r[   BaseSchedulerNode.get_namec  r   rV   c                "    U R                  5       $ rN   )r[   rQ   s    rS   get_first_name BaseSchedulerNode.get_first_nameg  s    }}rV   c                z    [        U R                  5        Vs/ s H  oR                  5       PM     sn5      $ s  snf rN   )r   	get_nodesr[   rR   rF   s     rS   get_operation_names%BaseSchedulerNode.get_operation_namesj  s,    t~~7GH7Gt==?7GHIIHs   8c                r    [        U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf rN   )r   r   r[   )rR   r   s     rS   get_buffer_names"BaseSchedulerNode.get_buffer_namesn  s'    T\\B\c<<>\BCCBs   4c                    U /$ rN   r   rQ   s    rS   r_  BaseSchedulerNode.get_nodesr  s	    vrV   c                    U R                   $ rN   )r   rQ   s    rS   r   BaseSchedulerNode.get_outputsu  s    ||rV   c                     U R                   U   $ rN   )r   )rR   buf_names     rS   
get_outputBaseSchedulerNode.get_outputx  s    ##H--rV   c                T    U R                   c   eU R                   R                  5       $ rN   )rF   
get_devicerQ   s    rS   ro  BaseSchedulerNode.get_device{  s%    yy$$$yy##%%rV   c                V    U R                  5       nUS L=(       a    UR                  S:H  $ Ncpu)ro  r]   rR   devices     rS   is_cpuBaseSchedulerNode.is_cpu  s'    "T!:fkkU&::rV   c                b    U R                  5       nUS L=(       a    [        UR                  5      $ rN   )ro  r;   r]   rt  s     rS   r;   BaseSchedulerNode.is_gpu  s'    "T!9fV[[&99rV   c                    grE  r   rQ   s    rS   is_reductionBaseSchedulerNode.is_reduction      rV   c                    grE  r   rQ   s    rS   is_split_scanBaseSchedulerNode.is_split_scan  r}  rV   c                    grE  r   rQ   s    rS   is_templateBaseSchedulerNode.is_template  r}  rV   c                    grE  r   rQ   s    rS   	is_externBaseSchedulerNode.is_extern  r}  rV   c                    grE  r   rQ   s    rS   
is_foreachBaseSchedulerNode.is_foreach  r}  rV   c                    grE  r   rR   read_deps     rS   can_inplaceBaseSchedulerNode.can_inplace  r}  rV   c                    grE  r   rQ   s    rS   has_side_effects"BaseSchedulerNode.has_side_effects  r}  rV   c                   SSK Jn  [        U [        5      (       a  [        R
                  (       a  [        R                  R                  U R                  5       [        R                  5      (       a  [        [        R                  [        R                  R                  R                   R"                  5      (       a  [%        [        R                  SS5      b  ['        [        R                  S5      (       d  gU R(                  R*                  U R-                  5          R/                  5        Vs1 s H  nUR-                  5       iM     nn[1        U R2                  R4                  S S9nU R6                  [        R                  R8                  -  U R(                  R:                  -  nU R=                  5        GHD  nUR>                  nUc   eURA                  5       (       aV  URC                  5       (       dA  URE                  5       (       d,  UR-                  5       [        R                  RF                  ;   a  M  U R2                  R4                   GH  nURH                  U R(                  RJ                  ;   a$  U R(                  RJ                  URH                     n	O/U R(                  RL                  RO                  URH                  5      n	U	(       d  M  [        R                  RP                  RS                  X5      (       d  M  [        U	RT                  [V        5      (       a  M  U	RX                  c   eU	RX                   V
s/ s H%  n
U
R>                  R-                  5       U;  d  M#  U
PM'     nn
[[        U5      S:X  d  GM2  US   R\                  (       d  GMI  US   R>                  U L d  GM^  U	R>                  c  GMn  [        U	R>                  R_                  5       [`        Rb                  [`        Rd                  [`        Rf                  45      (       a  GM  U	RT                  (       am  [        U	RT                  R>                  [`        Rh                  [`        Rj                  45      (       a*  [[        U	R>                  RC                  5       5      S:  a  GME  U" U	R>                  5      U" UR>                  5      :X  d  GMn  [        R                  Rl                  Ro                  U	R-                  5       UR-                  5       5        [        [        R                  [        R                  R                  R                   R"                  5      (       an  [        R                  Rp                  Rs                  U	R-                  5       5        [        R                  Rp                  Rs                  UR-                  5       5        U	R-                  5       [        R                  Rt                  UR-                  5       '     GMB     GMG     gs  snf s  sn
f )	zf
Decide if there should be inplace updates for the node
and record the decision in the active kernel.
r   buffer_reuse_key	mutationsNro   c                    U R                   $ rN   r*  xs    rS   r   9BaseSchedulerNode.decide_inplace_update.<locals>.<lambda>  s    QVVrV   keyr   );codegen.wrapperr  rs   SchedulerNoder   inplace_buffersr>   rv   has_featurero  r!   INPLACE_BUFFERSrz   r   r   codegensimd
SIMDKernelr   ry   rE   rU  r[   r_  sortedr   r   r   rF  completed_operationsr   rF   rp   rq   rr   removed_buffersrP   r|   r}   r  rw   	can_reuserH   NopKernelSchedulerNoderK   rc   r  rt   r   r   r,   MutationLayoutSHOULDREMOVEFallbackKernelr+   ro   make_inplacer  r7  r{   )rR   r  rF   fused_nodesordered_readsinconsequential_nodesr   buf_noderead	input_bufr  remaining_usess               rS   decide_inplace_update'BaseSchedulerNode.decide_inplace_update  s7   
 	6 t]++&&##DOO$5~7U7UVVqxx)@)@)E)E)P)PQQ188[$7C &)) 99$--/JTTV
V MMOV 	 

 t//55;KL NNgg(()nn112 	 ##%CxxH''',,..88::..00<<>QWW%<%<<((..99 E EE $ E Edii PI $ : : > >tyy II I,,66yGG&y'<'<>TUU$??666 "+&!0A66??,4II !0 # & N+q0*1-999*1-22d:%NN6 *%NN::< " " 4 4 " = =! ! &11 * ) 5 5 : :!#!2!2BNN C! ! !$INN$O$O$Q RUV V,Y^^<+CHH56 2293E3E3GX%HHeoo&=&=&B&B&M&M  HH..2293E3E3GHHH..223<<>B &..0 77LLN q / &
J&s   0X"X)Xc                b   [         R                  (       d  g U(       a  U R                  (       a  g U R                  c   eU R                  R	                  5       n/ nU GH&  nUR
                  S:X  a  M  UR                  S5        UR                  S5        SUR
                   SUR                   3nSUR                  ;   a  USUR                  S    3-   nUR                  U5        SUR                  ;   d  M  UR                  S    nUR                  S	5      S
   nUR                  SUR                  SS5      R                  SS5      R                  SS5      -   5        UR                  S5        UR                  S5        GM)     [        U5      S:X  a  g UR                  U5        SU l        g )Nr   r   z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|{z{{}z}}r   \z#pragma CMT END ORIGINr   T)r   comment_originr   rF   get_originsrG  appendtargetmetasplitreplacerc   
writelines)	rR   buffer	only_onceorigins	out_linesoop_info_strr  stack_trace_last_lines	            rS   codegen_originating_info*BaseSchedulerNode.codegen_originating_info  s    $$yy$$$))'')	AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(9(9#(>r(B%  "+33C>WS$'WT4()   !9:  $- 0 y>Q 	)$rV   c                  ^ ^	^
^^^ [        T [        5      (       a  g[        T [        5      (       a   [        T R                  [        5      (       a  gSS jm[        T [
        5      (       a@  T" [        T R                  5       S   5      [        T R                  5       S   5      -  5      mO[        S5      m[        R                  " [        5      nT R                  R                  T R                  R                  -   H   nXR                     R!                  U5        M"     [#        S T R                  R                   5       5      n[#        S T R                  R                   5       5      nSU 4S jjm[        T [$        5      (       a  [#        UU 4S jU 5       5      nXE-
  nX5-
  nSnX4-   H  n['        U4S	 jX    5       5      m	U[(        R*                  R,                  ;   a  [(        R*                  R,                  U   nO>U[(        R*                  R.                  ;   a  [(        R*                  R.                  U   nOM  SU	U
U U4S
 jjm
UT
" U5      -  nM     U$ )a  
Counting the number of bytes accessed for a kernel is
surprisingly tricky. In particular, there is a differentiation
between 'theoretical' memory accesses and practical memory
accesses. For example, a layernorm kernel may actually access an
input 3 times, but in theory, it only needs to access its input
once (and may be optimized to do so through say, persistent
reductions)

Another example is that even though a buffer is passed in, we may
not access the entire buffer. This may occur if we are accessing
a slice of the buffer. Another tricky case is for indirect
indexing, where the amount of bytes accessed depends on the
values of the input.

What this function aims to compute is the memory accesses for
worst-case inputs, best-case optimization. What this means is
that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

1. Numel in ranges multiplied by number of deps the buffer has
2. The buffer size
r   c                R    [         R                  R                  R                  U SS9$ )Nr   fallback)r>   rv   sizevars	size_hint)ss    rS   try_size_hintEBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.try_size_hintM  s"    77##--a!-<<rV   r       eAc              3  8   #    U  H  oR                   v   M     g 7frN   r*  r+  s     rS   r  ABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<genexpr>[  s     F/E88/E   c              3  8   #    U  H  oR                   v   M     g 7frN   r*  r+  s     rS   r  r  \  s     H0GHH0Gr  c                   > TR                   R                  U    R                  n[        S U 5       5      n[	        U[        U5      -
  5      S:  $ )Nc              3  8   #    U  H  oR                   v   M     g 7frN   )rF   )r  rg   s     rS   r  ZBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materialized.<locals>.<genexpr>`  s     !>))r  r   )rE   r}   rK   r   rc   )r   snodesrK   buf_usesrR   s       rS   is_materializedGBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materialized^  sG    NN..s399E!!>!>>Hx*V"44599rV   c              3  \   >#    U  H!  nT" UTR                   5      (       a  M  Uv   M#     g 7frN   r  )r  r  r  rR   s     rS   r  r  d  s#      )%_S$++-Nvs   ,	,c              3  (   >#    U  H  nTv   M	     g 7frN   r   )r  r  
node_numels     rS   r  r  l  s     $R;QCZ;Qs   c                  > U (       d  g[        U R                  [        5      (       a  TR                  R                  U R                  5          R                  nSnU H  n[        UR                  [        5      (       d   e[        UR                  R                  [        5      (       a8  UR                  R                  5        H  nUT" UR                  5      -  nM     M    g   U$ [        U R                  [        R                  5      (       a#  [        U4S jU R                  5        5       5      $ T	" [        U R!                  5       5      5      n[#        U R%                  5       5      ['        TU5      -  $ )Nr   c              3  n   >#    U  H*  nT" [         R                  R                  U5      5      v   M,     g 7frN   )r>   rv   
get_buffer)r  mut_nameget_buf_bytess     rS   r  XBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.get_buf_bytes.<locals>.<genexpr>  s/      (@H &agg&8&8&BCC(@   25)rs   r_   r,   rE   r}   r[   rK   rF   rG   r+   r   r   r   sumrr   r=   r   r7   	get_dtypemin)
r   rK   totrg   	sched_buf	buf_elemsbuf_accessed_elemsr  rR   r  s
         rS   r  EBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.get_buf_bytesu  s#    cjj*;<< NN66s||~FLLEC %)$))5FGGGG%diinnkBB-1YY-B-B-D	 #}Y^^'D D .E $% !& J

BMM:: (+(>(>(@  
 !.mCLLN.K LI)#--/:S*I>  rV   )r  z
sympy.Exprr   r   )r   r   r  Sequence[BaseSchedulerNode]r   r   )r   z(Optional[Union[ir.Buffer, ir.TensorBox]]r   r   )rs   r  ExternKernelSchedulerNoderF   r+   r  r=   
get_rangesr   collectionsr   r   r   r   r   rP   r  r   FusedSchedulerNoder  r>   rv   r8  graph_inputs)rR   buf_accessesr  r   r   r  
node_bytesrk  r   r  r  r  r  r  s   `        @@@@@rS   get_read_write_buffers_sizes.BaseSchedulerNode.get_read_write_buffers_sizes-  s   0 d233d566:II{<
 <
 	= dM**&doo/23 1! 456J
 SJ"..t4##))D,<,<,C,CCC"))#. D Ft/?/?/E/EFFH0@0@0G0GHH	:
 d.//( )%) O -F+E
H!$$R<;Q$R!R177111gg,,X6QWW111gg**84 < -,,JQ 'T rV   c                   U R                  5       S   R                  5       S   nUR                  R                  5       n[	        [        U5      5      (       d  g[        U R                  5      (       aA  [        U R                  [        R                  5      (       d   e [        U R                  5      $ [        U R                  5      (       a  gUR                  R!                  5       n [#        5       n[%        U5      S-  n[        U [(        5      (       Ga  [        U R                  [        R*                  5      (       d   S[-        U R                  5      < 35       e[.        R1                  [3        U R                  SS5      S5      nUGb[  SSKJn  SSKJn	  [=        S	 U R                  R>                   5       5      (       a  gU" 5        o" S
S9 n[@        RB                  " U R                  RD                  5         [@        RF                  " U
5         SSKJ$n  U R                  R>                   Vs/ s H
  nU" US
S9PM     nnU R                  RJ                  nURL                  " U/UQ70 U R                  RN                  D6  SnURQ                  5       nU RS                  5       nUU-  U-  S-  nUU-  n[U        UU5      sSSS5        sSSS5        sSSS5        sSSS5        $  g[        U [V        5      (       d  [        U R                  [X        5      (       a  U RS                  5       U-  $ g! [         a  n[        R                  U5         SnAgSnAf[         a  n[        R                  U5         SnAgSnAff = f! [&         a     gf = fs  snf ! , (       d  f       O= f SSS5        O! , (       d  f       O= fSSS5        O! , (       d  f       O= fSSS5        g! , (       d  f       g= f)z2
Returns estimated op runtime in nanoseconds (ns)
r   Nl    J)type(self.node)=python_kernel_namer   )FakeTensorMode)FlopCounterModec              3  l   #    U  H*  n[        [        UR                  5       5      5      S :  v   M,     g7fr   N)rc   r   	get_numelr  ns     rS   r  :BaseSchedulerNode.get_estimated_runtime.<locals>.<genexpr>  s-      - -akkm<=A-s   24F)displayr   )ir_node_to_tensor)guard_shapeg      ?r  )-r_  r   rF   rt   r;   r*   r:   rs   r   IRNoder$   
ValueErrorr   r   	TypeErrorr<   maybe_get_dtyper8   r6   r   r  ExternKernelr]   kernel_name_to_opr  r   torch._subclasses.fake_tensorr  torch.utils.flop_counterr  r  inputsr>   set_current_nodefx_nodeset_fake_moder  	__class__process_kernelr   get_total_flopsr  maxr  r)   )rR   r   r_   edtypegpu_memory_bandwidth	gpu_flopsrG  r  r  	fake_modeflop_counter_moder  inputfake_inputsclsfactorcounted_flopscounted_bytescompute_timetransfer_times                        rS   get_estimated_runtime'BaseSchedulerNode.get_estimated_runtime  sf   
 nnq!--/2))+of-.. ##dii3333
7		BB TYY
 ((*	#4#6 )%069I d566dii99P>Nd499o=O;PP9"&&		#7<dB
 ~HD !YY--   #%O!5&(:(:II%%)?? 6 &*YY%5%5#%5E *%UC%5   # ))--C&&rLKL499;K;KL !F$5$E$E$GM$($E$E$GM$*]$:Y$F##ML$14H$HM |];) ) )5 5%% X  011ZII~6
 6
 4469MMMW       		>#  ) ) )5 5 5%@ A &%@ s   L3 *N O*+O>N6N	4NA?N		N6	O	O*3
N=MN$M>>N
NNN	
N)%N6-	O6
O O	O*
O	O**
O8c                    g rN   r   rQ   s    rS   get_template_node#BaseSchedulerNode.get_template_node      rV   )
r   r   r   rF   r   r   r   rE   r   r   N)rE   rD   r   r   )rF   ir.Operationr   r   r   )r   z	List[str]r   r   r&   r   r&   r   r   r  Dict[str, str]r   r   )r  r%   r   r   r   )r  r   r   r   r  OrderedSet[str]r   r6  r   r   r   r8  rU  Dict[str, BaseSchedulerNode]r   r   r   r  )r   zSequence[SchedulerBuffer])rk  r   r   rB   r   Optional[torch.device]r  zdependencies.Depr   r   )T)r  r9   r  r   r   r   r   )r   floatr   zOptional[ir.TemplateBuffer])1r^   r   r   r   r   r   r   r   rh   r   r   r   r   r  r  r  r  r  r#  r&  r/  r  r  rQ  rV  r[   r\  r3   ra  rd  r_  r   rl  ro  rv  r;   r{  r  r  r  r  r  r  r  r  r  r-  r0  r   r   rV   rS   rG   rG      s   BB(('' NN'''
&B*2+#
!.7	
?>


=#2=HV=	=
(
GT">T	T
. J J D D.&;:cL 9=*$*15*	*X g gR Z ZxrV   rG   c                  P    \ rS rSr% / SQrS\S'   S\S'   SS jrSS jrSS	 jrS
r	g)	WhyNoFusei  )node1node2reasonro   r   rF  zTuple[Any, ...]ro   c                    Xl         X l        g rN   )rD  rE  rR   rD  rE  s      rS   r   WhyNoFuse.__init__  s    

rV   c                F    Xl         X l        [        R                  U 5        g rN   )rF  ro   
fusion_logdebug)rR   rF  ro   s      rS   __call__WhyNoFuse.__call__  s    	rV   c                    SU R                   R                  5        SU R                  R                  5        S3U R                  U R                  -  -   $ )Nzcannot fuse z with rX   )rD  r[   rE  rF  ro   rQ   s    rS   __str__WhyNoFuse.__str__  sK    djj1134F4::;N;N;P:QQSTKK$))#
 	
rV   )ro   rD  rE  rF  N)rD  rG   rE  rG   r   r   )rF  r   ro   r   r   r   r   )
r^   r   r   r   	__slots__r   r   rM  rP  r   r   rV   rS   rC  rC    s#     5IK


rV   rC  c                    [        U [        5      (       a  [        U [        S9n [        R
                  " U SS9nSU;   a  S[        R                  " US5       3$ U$ )Nr     )rd   r       )rs   r   r  r   pprintra   textwraprd   )objrf   s     rS   ra   ra     sP    #z""Sc"^^C*Fv~HOOFG4566MrV   c                  @    \ rS rSrSS jrS	S jrS
S jrSS jr\rSr	g)r   i  c                &    [        U/5      U l        g rN   rA  r  s     rS   r   OutputNode.__init__  s    ",cU"3rV   c                    grE  r   rQ   s    rS   r{  OutputNode.is_reduction  r}  rV   c                    g)Nr   r   rQ   s    rS   rq   'OutputNode.get_inputs_that_alias_output  r   rV   c                    g)NOUTPUTr   rQ   s    rS   r[   OutputNode.get_name"  s    rV   )r   N)r  r'   r   r   r   r   r   )
r^   r   r   r   r   r{  rq   r[   r   r   r   rV   rS   r   r     s    4 HrV   r   c                  ^ ^^^^ [         R                  " 5       mT R                   H_  n[        U[        5      (       a  M  TUR
                     R                  nTTUR                  5          R                  5       ==   S-  ss'   Ma     SUUUU 4S jjm[        U4S jT R                   5       5      nU(       a?  T R                  U-
  T l        T R                  T R                  R                  U5      5        gg)aU  
Prunes weakdeps intended for mutation ordering
on an upstream fused node if after fusion there is another dependency
on the fused upstream node, making the weakdep redundant

In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
be incrementally removed, enabling other fusions, ensuring they are fused in order.
r   c                   > [        U [        5      (       aS  TU R                     R                  R	                  5       nTTU   R	                  5          S:  nTU   T:H  nU=(       d    U$ g)Nr   F)rs   r(   rP   rH   r[   )r  op_nameis_redundantis_self_depr}   name_to_dep_countrU  rF   s       rS   rH  +_prune_redundant_deps.<locals>.should_prune<  sk    c7##!#((+77@@BG,-?-H-Q-Q-STWXXL -W5=K.;.rV   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7frN   r   rK  s     rS   r  (_prune_redundant_deps.<locals>.<genexpr>H  s      .,s2C.rM  NrN  )r  r   r   rs   r(   rP   rH   r[   r   r  r   rO  )rF   rU  r}   r  rG  deps_to_prunerh  rH  s   ```   @@rS   rT  rT  (  s     '2&9&9&;&&#w''SXX&22B0?HHJKqPK '

 
  .. M "&"9"9M"IT--::=IJ rV   )zextern_kernels.convolutionzextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmc                  J   ^  \ rS rSrSU 4S jjrSS jrS	S jrS	S jrSrU =r	$ )
r  iZ  c                   > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        g rN   superr   r   r  get_read_writesrR   rE   rF   r  s      rS   r   "ExternKernelSchedulerNode.__init__[  5    #T"T1134rV   c                V    U R                  5        S[        U R                  SS 5       3$ )Nz.node.kernel = r  )r[   r   rF   rQ   s    rS   r   )ExternKernelSchedulerNode.debug_str_extra`  s*    --/"/'$))EY[_2`1abbrV   c                    gNTr   rQ   s    rS   r  #ExternKernelSchedulerNode.is_externc  r2  rV   c                    U R                   c   e[        U R                   S5      =(       a    U R                   R                  5       $ )Nr  )rF   ry   r  rQ   s    rS   r  *ExternKernelSchedulerNode.has_side_effectsf  s6    yy$$$tyy"45V$)):T:T:VVrV   r   rE   rD   rF   r3  r   r   r   r   )
r^   r   r   r   r   r   r  r  r   __classcell__r  s   @rS   r  r  Z  s    5
cW WrV   r  c                  ,   ^  \ rS rSrSU 4S jjrSrU =r$ )r  ik  c                   > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        g rN   ro  rr  s      rS   r   NopKernelSchedulerNode.__init__l  rt  rV   r   r|  )r^   r   r   r   r   r   r}  r~  s   @rS   r  r  k  s    5 5rV   r  c                  D  ^  \ rS rSr% S\S'   S\S'         SU 4S jjr  S     SS jjr  S     SS jjrSS	 jrSS
 jr	      SS jr
SS jrS S jrS!S jrS!S jrS!S jrS"S jrS#S jr    S$S jrS%S jr\S&S j5       rS'S jr\S(S j5       rSrU =r$ ))r  ir  z Tuple[Sequence[sympy.Expr], ...]_sizesr-   _bodyc                f   > [         TU ]  U5        U R                  U5        U R                  5         g rN   )rp  r   r   _compute_attrsrr  s      rS   r   SchedulerNode.__init__v  s,    
 	#T"rV   c                    [        U R                  [        R                  [        R                  45      (       d   eU R                  R                  UUS9u  U l        U l        U R                  R                  5       nU R                  R                  U5      R                  nX4" U R                  5      4U l        [        R                  (       + =(       d    [        UR                   5      (       + n[        U R                  [        R                  5      (       a)  U R#                  U R                  R%                  US95        g U R#                  [&        R$                  " U R                  /U R                  Q7SU065        g )Nextra_indexing_constraintsrecompute_sizes_body_func	normalizer  )rs   rF   r   r)   TemplateBuffersimplify_and_reorderr  r  get_device_or_errorrE   get_backendgroup_fnr   r   loop_ordering_after_fusionr;   r]   r  extract_read_writesr   )rR   r  r  ru  r  should_normalizes         rS   r  SchedulerNode._compute_attrs  s9   
 $))b&7&79J9J%KLLLL"&))"@"@'A&? #A #
TZ
 ..0>>--f5>>ht{{34
  &@@@ 
KKI
 E
 dii!2!233  		--8H-I   00JJ!%8HrV   c                $    U R                  UUS9  g )Nr  )r  )rR   r  r  s      rS   recompute_size_and_body%SchedulerNode.recompute_size_and_body  s    
 	'A&? 	 	
rV   c                ,   U R                   R                   Vs1 s H"  n[        U[        [        45      (       d  M   UiM$     nnU R                  [        R                  " U R                  /U R                  Q7SU06R                  U5      5        g s  snf )Nr  )r   r   rs   r(   r'   r  r   r  r  r  r  )rR   r  r  	fake_depss       rS   refresh_dependencies"SchedulerNode.refresh_dependencies  s      ++11
1CZgwEW5XC1 	 
 	,,

![[4=i	"	

s
   BBc                   U R                   R                  U5      U l         U R                   R                  U l        U R	                  SS9  SSKJn  UR                  R                  5         U R                  R                  U 5        g )NFr  r   SIMDScheduling)r  reorder_iter_loopssizesr  r  codegen.simdr  candidate_tilingscache_clearpointwise_read_writesclear_cache)rR   	new_orderr  s      rS   apply_new_loop_order"SchedulerNode.apply_new_loop_order  sj    ZZ22

 jj&&!!E!20 	((446""..t4rV   c                   S nU R                   S   n[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       aP  [        =R
                  S-  sl        [        R                  SU R                  5       U5        U R                  U5        g [        R                  SU R                  5       5        g )Nr   r   z"Reorder loops for %s with order %szEDon't reordering %s because we can not decide the suitable loop order)
r  rc   num_varsdecide_loop_order_to_matchr    num_loop_reorderingloop_ordering_logrL  r[   r  )rR   r   r   r  
self_sizess        rS   r  'SchedulerNode.reorder_loops_by_dep_pair  s     	[[^
z?h//E93E3EE ;;IFI''1,'##4dmmoy %%i0##WrV   c                   U R                  5       nU SU R                  S    3U SU R                  S    3U SU R                   3/nU R                  R	                  5        Hn  n[        U[        5      (       a  M  UR                  n[        R                  R                  U5      nUR                  U S[        UR                  5       35        Mp     [        U R                  [        5      (       aS  UR                  SU S35        UR                  [         R"                  " U R                  R%                  5       S	5      5        U R&                  c   eUR)                  U R+                  5       5        S
R-                  U5      $ )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:rU  r   )r[   r   r  r   reads_and_writesrs   r(   rP   r>   rv   r  r  ra   r_   r  r-   rW  rd   rh   rF   r9  r   join)rR   rP   linesr  rk  r   s         rS   r   SchedulerNode.debug_str_extra  s:   }}f$TZZ]O4f'

17fIdkk]+

 ##446Cc7++88gg((2zGCJJ4G3HIJ	 7
 djj(++LL6${34LL)=)=)?HIyy$$$T//12yyrV   c                    U R                   $ rN   )r  rQ   s    rS   r  SchedulerNode.get_ranges      {{rV   c                    [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  R                  5       5      $ Nr  )rs   rF   r   r)   r  r]   r   r   rQ   s    rS   r{  SchedulerNode.is_reduction  se    II))2+<+<=
 
 	!d499o 	! 
 DII00233rV   c                b   [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  [        R                  5      =(       a.    [        U R                  R                  [        R                  5      $ r  )rs   rF   r   r)   r  r]   r   	SplitScanrQ   s    rS   r  SchedulerNode.is_split_scan  s    II))2+<+<=
 
 	!d499o 	! 
 $))R%6%67 
JIINNBLL=
 	
rV   c                J    [        U R                  [        R                  5      $ rN   rs   rF   r   r  rQ   s    rS   r  SchedulerNode.is_template  s    $))R%6%677rV   c                p    [        U R                  [        R                  5      (       a  U R                  $ S $ rN   r  rQ   s    rS   r0  SchedulerNode.get_template_node  s'    &tyy"2C2CDDtyyN$NrV   c                f    U R                  5         U R                  5         U R                  U5        g rN   )r  r&  r  )rR   
index_varss     rS   runSchedulerNode.run	  s#    ""$Z rV   c                (   U R                   n[        [        [        U5      5      [        [        [        U5      5      :X  d   e[	        [        [        R                  R                  U5      [        R                  R                  U5      5      5      nU$ rN   )	r  r  maprc   dictzipr-  r.  from_iterable)rR   r  r  
var_rangess       rS   ranges_from_index_vars$SchedulerNode.ranges_from_index_vars  sp     3sE?#s3sJ+?'@@@@--j9--e4

 rV   c                   U R                  U5      n [        R                  " [        [        R                  " 5       U5      5         [        R
                  R                  U 5         U R                  " U6   S S S 5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f! [         a"    [        R                  SU R                  5        e f = f)NzError in codegen for %s)r  r>   set_ops_handlerr2   get_ops_handlerrz   r  r  r   r   fatalrF   )rR   r  r  s      rS   r  SchedulerNode.codegen  s    00<
	"" !2!2!4jAxx((.

J' / ..   	II/;	sA   3B)  B&B6B>B) 
B	B
B&"B) &B) ),Cc                    U R                   u  p[        R                  " U R                  U[        R
                  R                  /[        U5      -  /S9$ )z8
Get the memory dependencies in the non-reduction axis.
)hidden_args)r  r   r  r  sympySZerorc   )rR   r  reduction_sizess      rS   r  #SchedulerNode.pointwise_read_writes&  sE    
 "&//JJUWW\\NS=Q,Q+R
 	
rV   c                (   U R                  5       (       a  g[        S U R                  5        5       5      (       a  g[        U R                  R
                  5      S:X  a  [        U[        R                  5      (       a  [        [        U R                  R
                  5      5      n[        U[        R                  5      (       d   S[        U5      < 35       eUR                  UR                  :H  =(       a    UR                  UR                  :H  $ g)NFc              3  @   #    U  H  oR                  5       v   M     g 7frN   )r`   )r  r   s     rS   r  ,SchedulerNode.can_inplace.<locals>.<genexpr>3  s     ?,>S  ,>   r   ztype(write_dep)=)r  r  r   rc   r   r   rs   r   r&   nextiterr]   indexsize)rR   r  	write_deps      rS   r  SchedulerNode.can_inplace0  s    ?D,<,<,>???t&&'1,l,,2
 2
 T$"2"2"9"9:;Ii)?)?@@WEUT)_DVBWW@>>Y__4X)..9XXrV   c                8   [        5       n[        U R                  [        5      (       a  U R                  R	                  5        H  nUR
                  S:X  d  M  UR                  S:X  d  M'  SUR                  ;   a  UR                  S   S:X  d0  [        UR                  5      S:X  d  Me  UR                  S   S:X  d  Mz  UR                  SUR                  ;   a  UR                  S   O)[        UR                  5      S:  a  UR                  S	   OS
5        M     U$ )Ncall_methodstoremode
atomic_add   rT  rP      r   r   )r   rs   r  r-   r_  rG  r  r   rc   ro   r7  )rR   buffers_store_as_atomic_addrF   s      rS   _get_atomic_add_buffers%SchedulerNode._get_atomic_add_buffers=  s    7A|#djj(++

,,.GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr / +*rV   )r  r  r   )rE   rD   rF   z+Union[ir.ComputedBuffer, ir.TemplateBuffer]r   r   NN)r  z*Optional[Tuple[Dict[Any, Any], List[Any]]]r  zOptional[Callable[..., Any]]r   r   )r  r   r   r   )r  zSequence[int]r   r   r4  r   )r   Sequence[Sequence[sympy.Expr]]r   rA  )r  Sequence[sympy.Expr]r   r   )r  r  r   zDict[sympy.Expr, sympy.Expr])r  r  r   r   )r   r   r?  r9  )r^   r   r   r   r   r   r  r  r  r  r  r   r  r{  r  r  r0  r  r  r  r3   r  r  r  r   r}  r~  s   @rS   r  r  r  s   ,,O : 
	 RVBF$N $@ 
	D RVBF
$N
 $@
 
	

5"!.7	( *4
8O!
8	%	 
 
 + +rV   r  c           	     z  ^  T R                   nT R                  [        R                  R	                  U Vs/ s H  o"R
                  PM     sn5      5        [        U 4S j[        R                  " U Vs/ s H  o"R                  PM     sn6  5       5      T R
                  R                  -
  T l        g s  snf s  snf )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7frN   rP   rd  )r  r  group_snodes     rS   r  2refresh_group_node_dependencies.<locals>.<genexpr>Y  s/      
Pxx{;;== CP   "2	2)
r  r  r   
ReadWrites
merge_listr   r   unionr   r   )r  r  r  s   `  rS   refresh_group_node_dependenciesr  R  s    F**6+J6aMM6+JK
 	 
!'')O1*>*>)OP
 	

 
!
!
(
(	) " ,K *Ps   B34B8rD   c                   [        U [        [        45      (       d   eX l        Xl        S U l        [        R                  " U Vs/ s H  o3R                  c  M  UR                  PM     sn6 U l        [        U 5        [        S U R                   5       5      U l        [        S U R                   5       5      U l        U R                  5        Vs0 s H  oDR                  5       U_M     snU l        g s  snf s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7frN   r   r  r  s     rS   r  "init_group_node.<locals>.<genexpr>q       H5G5Gr  c              3  8   #    U  H  oR                   v   M     g 7frN   )r   r  s     rS   r  r  r  r  r  )rs   r  GroupedSchedulerNoder  rE   rF   r   r  r   r  r  r   r  r   r   r[   r   )r  rE   r  r  r   s        rS   init_group_noder  b  s    
 k$68L#MNNNN%K&,,%	Av!+!++v	AK $K0H[5G5GHHKH[5G5GHHK'2'>'>'@#'@'@#K 
B#s   C4C4C9c                    ^  \ rS rSr% SrS\S'   \      SS j5       r      SS jrS U 4S jjr	\
S!S j5       rS!S	 jr\
S"S
 j5       rS#S jrS!S jrS!S jr      S$U 4S jjr\
S"S j5       r\
S"S j5       rS%S jrS!S jr\
S&S j5       r\
S&S j5       r\
S&S j5       r\
S'S j5       rS(S jr\
S&S j5       rS)S jrS*S jrS+S jrS!S jrSr U =r!$ ),r  ix  z
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be fused together. The way it does this is by maintaining
its unmet dependencies as the union of its constituent nodes.
List[BaseSchedulerNode]r  c                J   UR                   UR                   L d   e[        U[        [        45      (       d   e[        U[        [        45      (       d   e[	        [
        R                  " UR                  5       UR                  5       5      5      nU " UR                   U5      $ rN   )rE   rs   r  r  r   r-  r.  r_  )r'  rD  rE  nodess       rS   fuseFusedSchedulerNode.fuse  s     %//111%-1C!DEEEE%-1C!DEEEEY__U__%68IJK5??E**rV   c                   U R                  5       (       a  g S nU R                   Hh  n[        U[        5      (       d   eUb<  [	        U5      [	        UR
                  S   5      :w  a  [        R                  S5          g UR
                  S   nMj     S nUc   e[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       d%  [        R                  SU R                  5       5        g [        =R                  S-  sl        [        R                  SU R                  5       U5        U R                   H+  n[        U[        5      (       d   eUR                  U5        M-     [        U 5        g )Nr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %s)r  r  rs   r  tupler  r  rL  rc   r  r  r[   r    r  r  r  )rR   r   r   r  snoder  s         rS   r  ,FusedSchedulerNode.reorder_loops_by_dep_pair  sH    
[[Ee]3333%%
*;uU\\RS_?U*U!''G aJ ! 	%%%z?h//E93E3EE ;;IFI##a ##q(#;T]]_i	
 [[Ee]3333&&y1 ! 	(-rV   c                ~   > [         TU ]  U5        [        XU5        / U l        [	        US S9R
                  U l        g )Nc                4    [        U R                  5       5      $ rN   )r   r{  r  s    rS   r   -FusedSchedulerNode.__init__.<locals>.<lambda>  s    s1>>3C/DrV   r  )rp  r   r  rK   r  r   rR   rE   r  r  s      rS   r   FusedSchedulerNode.__init__  s6    #0%'
%DEKK
rV   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf N_r  r  r[   rR   r  s     rS   r[   FusedSchedulerNode.get_name  +    xxt{{;{!{;<<;   :c                <    U R                   S   R                  5       $ Nr   r  r[   rQ   s    rS   r\  !FusedSchedulerNode.get_first_name      {{1~&&((rV   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf rN   r   r  r  rd  r  s     rS   rd  #FusedSchedulerNode.get_buffer_names  0    !L1"4"4"6!LMM!L   <c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ rN   r  r9  r   rR   rf   rF   s      rS   r   FusedSchedulerNode.get_outputs  /    (*KKDMM$**,-  rV   c           
        [        U R                  5       VVs/ s H+  u  pU R                  5        SU SUR                  5        3PM-     nnnU R                  S   R                  nUb  UR                  U R                  5       5        [        R                  " SR                  U5      R                  5       S5      $ s  snnf )Nz.snodes[z] =
r   r   rU  )	enumerater  r[   rh   rF   r9  r   rW  rd   r  r   )rR   irF   r  s       rS   r   "FusedSchedulerNode.debug_str_extra  s     %T[[1
1 }}xs%0@/AB1 	 
 {{1~""LL3356tyy/668&AA
s   2B=c                l    U R                    Vs/ s H  oR                  5       PM     nnU  SU 3$ s  snf )Nz
, snodes: )r  r   )rR   rF   
snodes_strs      rS   r   "FusedSchedulerNode.debug_str_short  s8    9=E**,
Ez*.. Fs   1c                   > [         TU ]  X5        [        5       n[        U R                  5       H/  nUR                  X5        UR                  UR                  5        M1     g rN   )rp  r#  r   reversedr  updater   )rR   r  r   rF   r  s       rS   r#  !FusedSchedulerNode.set_last_usage  sQ    
 	2G 0:|T[[)D 3H&&t7 *rV   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf rN   )r   r  r  r/  r  s     rS   r/  $FusedSchedulerNode.used_buffer_names  s0    !MA"5"5"7!MNN!Mr'  c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf rN   )r   r  r  r  r  s     rS   r  /FusedSchedulerNode.used_or_aliased_buffer_names  s5    8<D1,,.D
 	
Dr'  c                    U R                   $ rN   r  rQ   s    rS   r_  FusedSchedulerNode.get_nodes  r  rV   c                T    [        U 5      R                   SU R                  5        S3$ )Nz(nodes=r   r   rQ   s    rS   r   FusedSchedulerNode.__repr__  s'    t*%%&gdmmo->a@@rV   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frN   )r{  r  s     rS   r  2FusedSchedulerNode.is_reduction.<locals>.<genexpr>  s     9[>>##[r  r  r  rQ   s    rS   r{  FusedSchedulerNode.is_reduction  s    9T[[999rV   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frN   )r  r  s     rS   r  3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>  s     :k??$$kr  rC  rQ   s    rS   r   FusedSchedulerNode.is_split_scan  s    :dkk:::rV   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frN   )r  r  s     rS   r  1FusedSchedulerNode.is_template.<locals>.<genexpr>  s     8Kq==??Kr  rC  rQ   s    rS   r  FusedSchedulerNode.is_template  s    8DKK888rV   c                x    U R                    H*  nUR                  5       (       d  M  UR                  5       s  $    g rN   )r  r  r0  r`  s     rS   r0  $FusedSchedulerNode.get_template_node  s3    KKD!!--//   rV   c                     U R                   S   $ r  )r   rQ   s    rS   ro  FusedSchedulerNode.get_device  s    zz!}rV   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frN   )r  r  s     rS   r  >FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s     EA--//r  rC  rQ   s    rS   r  +FusedSchedulerNode.has_aliasing_or_mutation	  s    EEEErV   c                    [         erN   NotImplementedErrorr  s     rS   r  'FusedSchedulerNode.update_mutated_names      !!rV   c                    [         erN   rV  )rR   rP   s     rS   r  FusedSchedulerNode.add_fake_dep  rY  rV   c                    [         erN   rV  r  s     rS   r  FusedSchedulerNode.can_inplace  rY  rV   c                X   U R                  5       nSR                  S U R                   5       5      n[        5       nUR	                  U S[        U 5      R                   SU SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
35        UR                  5          U R                  5        H"  nUR	                  UR                  5       5        M$     SSS5        UR                  S5         UR	                  U R!                  5       5        UR)                  5       R+                  5       $ ! , (       d  f       N]= f! ["         a    [$        R'                  SSS9   NOf = f)r   rY   c              3  L   #    U  H  n[        U5      R                  v   M     g 7frN   )r]   r^   r	  s     rS   r  /FusedSchedulerNode.debug_str.<locals>.<genexpr>  s     F+QQ 0 0+s   "$rX   r   r   r   r   r   r   z.outputs = [
            NrZ   r   Tr   )r[   r  r  r9   r   r]   r^   ra   r   r   r   r   rd   r   rh   r\   r   r   r   r   re   r   )rR   rP   node_typestrr   r   s        rS   rh   FusedSchedulerNode.debug_str  sx   }}xxF$++FF

bd		Q|n -j))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 	
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   )7E7:F 7
FF)(F))r   rK   rD  rG   rE  rG   r   r  r4  rE   rD   r  r  r   r   r   r9  r   zList[SchedulerBuffer]r7  r<  r   rA  )r   torch.devicer5  )rP   r%   r   r   r?  )"r^   r   r   r   __doc__r   classmethodr  r  r   r3   r[   r\  rd  r   r   r   r#  r/  r  r_  r   r{  r  r  r0  ro  r  r  r  r  rh   r   r}  r~  s   @rS   r  r  x  ss    $#+%+.?+	+ +#.!#..7#.	#.JL = =) N N	B/8#28HV8	8 O O 
 

A : : ; ; 9 9   F F
"""* *rV   r  c                  z  ^  \ rS rSr% Sr    SS jr    SS jr\SS j5       r\      SS j5       r	   S             SU 4S jjjr
\    SS j5       r\    SS	 j5       r\rS
\S'   \    SS j5       r\    SS j5       rSS jrSS jrS S jrS!S jrS"S jrS#S jr    S$S jrSrU =r$ )%ForeachKernelSchedulerNodei3  z
This is a schedular node that consists of a set of scheduler nodes that
has no data dependencies among them and can be executed in parallel.
c                    UR                  5        H@  nUR                  5       U R                  ;   d  M#  U R                  UR                  5          s  $    g rN   )r   r[   read_to_node)rR   producerr   s      rS   get_consumer_subnode_for3ForeachKernelSchedulerNode.get_consumer_subnode_for9  sG     '')C||~!2!22((88 * rV   c                   [        5       nUR                  R                   H  nUR                  U R                  R
                  ;  a  M)  U R                  R
                  UR                     R                  R                  5       nX@R                  ;   d  Mu  UR                  U R                  U   5        M     [        U5      S:X  a  [        [        U5      5      $ g Nr   )setr   r   rP   rE   r}   rH   r[   name_to_noder7  rc   r  r  )rR   consumer	producersrd	node_names        rS   get_producer_subnode_for3ForeachKernelSchedulerNode.get_producer_subnode_forB  s     E	&&,,Bwwdnn88822277;GGPPRI---d//	:; - y>QY((rV   c                  ^ [        TU5      nTR                  5       (       a  UR                  5       (       a  [        R                  " [        T5      m[        R                  " [        U5      n[        TR                  5      [        UR                  5      :H  nU(       d  U" S5        U=(       a3    [        U4S j[        TR                  UR                  5       5       5      $ UR                  5       (       ar  TR                  5       (       a	  U" S5        g[        R                  " [        U5      nUR                  T5      nUb  UR                  R                  TU5      $ U" S5        gTR                  5       (       aq  UR                  5       (       a	  U" S5        g[        R                  " [        T5      mTR                  U5      nUb  TR                  R                  Xb5      $ U" S5        g[        S5      e)	Nzforeach do not have same lengthc              3  ^   >#    U  H"  u  pTR                   R                  X5      v   M$     g 7frN   )rE   can_fuse)r  lrrm  s      rS   r  6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>^  s.      )ADA ""++A11As   *-zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)rC  r  typingcastrj  rc   r  allr  r{  rn  rE   r|  rx  AssertionError)r'  rm  rt  whyforeach_matchconsumer_subnodeproducer_subnodes    `     rS   r|  #ForeachKernelSchedulerNode.can_fuseU  s   (+  X%8%8%:%:{{#=xHH{{#=xHH0C4HHM 56  S )A) &    ""$$&&n {{#=xHH'@@J+))228=MNNGH  ""$$&&n {{#=xHH'@@J+))223CNNGHf
 	
rV   c           	     `   UR                  5       (       d  UR                  5       (       d   eUR                  5       (       a4  [        R                  " [        U5      nUR                  nUR
                  nO3[        R                  " [        U5      nUR                  nUR
                  nS nS nUR                  5       (       a  UR                  5       (       a  [        R                  " [        U5      n[        R                  " [        U5      n[        UR                  UR                  5       VVs/ s H  u  px[        R                  Xx5      PM     n	nnGO?UR                  5       (       a  [        R                  " [        U5      nUR                  U5      n
/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     OUR                  5       (       a  [        R                  " [        U5      nUR                  U5      n/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     O[        S5      eU " UR                  U	UUUUS9$ s  snnf )NzTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)r  r  r  rj  r  r  r  r  r  r  rx  r  rn  r  rE   )r'  rm  rt  r  r  r  r  r}  r~  r  r  rF   new_noder  s                 rS   r  ForeachKernelSchedulerNode.fuse  s\    ""$$(;(;(=(===  {{#=xHH(0(J(J%&66O{{#=xHH(0(J(J%&66O  X%8%8%:%:{{#=xHH{{#=xHH  AADA #''-A  K   ""{{#=xHH'@@JK"KK +166tFH"*K&&x0&&t, (   ""{{#=xHH'@@JK"KK +166xFH"*K&&x0&&t, ( !f  &?##+
 	
Ks   0!J*c                B  >^  0 T l         0 T l        Ub  Ucv  [        TT ]  X5        U H_  nUR                  R
                   H  nUT R                   UR                  '   M     UR                  5        H  n	UT R                  U	'   M     Ma     GOUT l        UT l	        S T l
        / T l        T R                  [        R                  R                  UR                  UR                  /5      5        [!        U 4S j[         R"                  " UR$                  UR$                  5       5       5      T R                  R&                  -
  T l        [)        UR*                  UR*                  /5      T l        [-        UR.                  UR.                  /5      T l        UR1                  5       (       a  [3        U[4        5      (       d   eXEpO[3        U[4        5      (       d   eXTpU
R6                  T l        T R6                  R9                  UR6                  5        U
R                  T l        UR                  5        H  n	UT R                  U	'   M     UT l        US   R=                  5       nU(       d   eU[>        R@                  " S5      444T l!        [!        5       T l"        UT l#        g )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7frN   r  r?  s     rS   r  6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>  s5        xxt'<'<'>>	 C r  r   combo_kernel)$rl  rs  rp  r   r   r   rP   ra  rE   r  rF   rK   r  r   r  r  r   r  r   r   r  r   r  r   r  rs   rj  r   r6  r  ro  r  Exprr   r  r  )rR   rE   r  r  r  r  r  rF   r  rP   foreach_node
other_noderu  r  s   `            rS   r   #ForeachKernelSchedulerNode.__init__  sN    +"5GY/ ,,22D37D%%dii0 3 !446D.2D%%d+ 7	  'DN DKDI)+DJ  ''22 ,,k.E.EF  )//#668V8V   ""))* # !+"7"79N9N!OPDN +"7"79N9N!OPDN%%''!+/IJJJJ+6j!+/IJJJJ+6j)33DNNN!!*"6"67 , 9 9D"668*4!!$' 9 *C&%%'v

> :<>?
2<,.rV   c           	     x   U Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       aW  [        R                  S[	        U5      U Vs/ s H+  oDR
                  c  M  UR
                  R                  5       PM-     sn5        U Vs/ s H"  n[        U[        [        45      (       a  M   UPM$     nnU Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       a  [        R                  S[	        U5      5        U Vs/ s H  n[        U[        5      (       a  M  UPM     nnU Vs/ s H  o"R                  5       (       d  M  UPM     nnU(       a   [        R                  S[	        U5      15        U Vs/ s H  o"U;  d  M
  UPM     nnU$ s  snf s  snf s  snf s  snf s  snf s  snf s  snf )Nz/ComboKernels: %d external nodes are filtered %sz+ComboKernels: %d foreach nodes are filteredz,ComboKernels: %d template nodes are filtered)
rs   r  r   rL  rc   rF   r  r  rj  r  )r'  r
  r  externrF   filtered_nodesforeach_nodestemplate_nodess           rS   combinable_nodes+ForeachKernelSchedulerNode.combinable_nodes	  s{    #OUj4M&N!UOIIAF5;UVTyy(&&(VU 
a"8:S!TU  	 
 &
%!A7Q)RA~ 	 
 IICSEWX%
%!Z;U-VA~ 	 
 &4G^}}!^GII>^AT@U &4O^7N!^O5 P
 V




 H
 PsR   FFF#FF#/F#;F(F(
F-'F-3F2F2	F7F7c           
         U R                  5       n/ nSnU H=  nUR                  [        S[        U5      U5       Vs/ s H	  nXEXS-    PM     sn5        M?     U$ s  snf )zC
Returns a list of lists of nodes that are to be grouped together.
   r   )_topological_sort_nodesr9  rangerc   )rE   sorted_nodesgrouped_nodesmax_num_nodesr
  r/  s         rS   &_default_group_nodes_for_combo_kernelsAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels)  ss     !88:!E   #1c%j-@@ a/0@ " s   A
4Callable[[Scheduler], List[List[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelsc                    U [         l        g rN   rj  r  )custom_group_algorithms    rS   %set_group_algorithm_for_combo_kernels@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernelsA  s    
 # 	#DrV   c                ,    [         R                  U 5      $ rN   r  rE   s    rS   group_nodes_for_combo_kernels8ForeachKernelSchedulerNode.group_nodes_for_combo_kernelsI  s     *KKIVVrV   c                    [         erN   rV  rQ   s    rS   r&  #ForeachKernelSchedulerNode.mark_runO  rY  rV   c                    [         erN   rV  rQ   s    rS   r  "ForeachKernelSchedulerNode.codegenR  rY  rV   c                    grx  r   rQ   s    rS   r  %ForeachKernelSchedulerNode.is_foreachU  r2  rV   c                ,    [        U R                  5      $ )z]Returns a list of nodes which comprise the combo kernel.
These nodes may be vertically fused.)r   r  rQ   s    rS   get_subkernel_nodes.ForeachKernelSchedulerNode.get_subkernel_nodesX  s     DKK  rV   c                t    [        [        R                  R                  S U R                   5       5      5      $ )ziReturns all nodes contained in this kernel, unpacking fused nodes
into their constituent scheduler nodes.c              3  @   #    U  H  oR                  5       v   M     g 7frN   )r_  r  s     rS   r  7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>`  s     1UA++--r  )r   r-  r.  r  r  rQ   s    rS   r_  $ForeachKernelSchedulerNode.get_nodes]  s(     IOO111U1UUVVrV   c                <    U R                   S   R                  5       $ r  )r  r\  rQ   s    rS   r\  )ForeachKernelSchedulerNode.get_first_nameb  s    {{1~,,..rV   c                    [        XU R                  R                  5        U R                   H  nUR	                  U5        M     g rN   )rT  rE   r}   r  rV  )rR   rU  rF   s      rS   rV  /ForeachKernelSchedulerNode.prune_redundant_depse  s5     	d8R8RSKKD%%&89  rV   )r   r  r   r   r   rs  rF   r  rl  rE   r  r   r  rK   )rm  rG   r   r   )rt  rG   r   r   rm  rG   rt  rG   r   r   )rm  rG   rt  rG   r   rj  )NNF)rE   rD   r  r  r  r   r  r   r  r   r  r   r   r   r
  r  r   r  )rE   rD   r   List[List[BaseSchedulerNode]])r  r  r   r   r   r   r   r  r<  r   r:  )r^   r   r   r   rg  rn  rx  rh  r|  r  r   r  staticmethodr  r  r   r  r  r&  r  r  r  r_  r\  rV  r   r}  r~  s   @rS   rj  rj  3  s   
)	$)	$& ,
 ,
\ >
(>
4E>
	#>
 >
J 4837 %B/B/ (B/ $(	B/
 1B/ 1B/ B/ 
B/ B/H +	  > 	& * 	/ & ( / 
 T
	
 
 WW	&W W
""!
W
/:">:	: :rV   rj  c                     ^  \ rS rSr% SrS\S'   \SS j5       rSU 4S jjrSS jr	SS jr
\SS	 j5       rSS
 jr\SS j5       rSS jrSS jr\SS j5       rSrU =r$ )r  in  a'  
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be *grouped* together (it does not allow another node to be scheduled
in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
Fusion will still happen among the nodes within each GroupedSchedulerNode.
At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
r  r  c                   ^ US   R                   m[        U4S jU 5       5      (       d   eU " TU5      nU H   nUTR                  UR                  5       '   M"     UTR                  UR                  5       '   U$ )Nr   c              3  >   >#    U  H  oR                   TL v   M     g 7frN   r  )r  rF   rE   s     rS   r  .GroupedSchedulerNode.create.<locals>.<genexpr>}  s     B64>>Y.6s   )rE   r  rU  r[   )r'  r  grouped_snoder  rE   s       @rS   createGroupedSchedulerNode.createz  su    1I''	B6BBBBBIv.E=JI(()9: AN	$$]%;%;%=>rV   c                <   > [         TU ]  U5        [        XU5        g rN   )rp  r   r  r  s      rS   r   GroupedSchedulerNode.__init__  s    #0rV   c                   U R                    H)  nXR                  R                  UR                  5       '   M+     U R                  R                  U R                  5       	 U R                  R	                  U R                   5      $ )zw
Do fusion among nodes within this GroupedSchedulerNode,
and then unpack this GroupedSchedulerNode into regular nodes.
)r  rE   rU  r[   
fuse_nodes)rR   r  s     rS   unpackGroupedSchedulerNode.unpack  s\    
 [[EBGNN--enn.>? !NN--dmmo>~~((55rV   c                    U R                  U R                  R                  U5      5        U R                  R	                  U5        g rN   )r  r   r  r   r7  )rR   fake_deps     rS   r  !GroupedSchedulerNode.add_fake_dep  s5    T--77AB##H-rV   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf r  r  r  s     rS   r[   GroupedSchedulerNode.get_name  r  r  c                <    U R                   S   R                  5       $ r  r   rQ   s    rS   r\  #GroupedSchedulerNode.get_first_name  r"  rV   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf rN   r$  r  s     rS   rd  %GroupedSchedulerNode.get_buffer_names  r&  r'  c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ rN   r)  r*  s      rS   r    GroupedSchedulerNode.get_outputs  r,  rV   c                    U R                   $ rN   r  rQ   s    rS   r_  GroupedSchedulerNode.get_nodes  r  rV   c                    grE  r   )r'  rm  rt  s      rS   r|  GroupedSchedulerNode.can_fuse  s     rV   r   )r  r  r   r  rd  r  )r  r%   r   r   r   r9  re  r<  r  )r^   r   r   r   rg  r   rh  r  r   r  r  r3   r[   r\  rd  r   r_  r|  r   r}  r~  s   @rS   r  r  n  s~     $# 16. = =) N N  rV   r  c           
     0  ^ ^ [         R                  SUU 4S jj5       n[        [        [	        [        T S   5      5      5      5      n[        U5      S:  a  U Vs/ s H  nT U   PM
     snm [        R                  (       a  UR                  US9  U$ s  snf )zu
A heuristic to decide loop iteration orders.  This has not been well
tuned and may be something we should autotune.
c                z  > TU    S:X  d	  TU   S:X  a  [        TU    S:H  TU   S:H  5      $ T Vs/ s H  n[        X    5      PM     nnT Vs/ s H  n[        X!   5      PM     nn[        S [        X45       5       5      n[        S [        X45       5       5      nXV:  a  gXe:  a  g[        X5      $ s  snf s  snf )Nr   c              3  F   #    U  H  u  pUS :H  =(       d    X:  v   M     g7fr  r   r  sl_asl_bs      rS   r  5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  $      
7VDAI$$7V   !c              3  F   #    U  H  u  pUS :H  =(       d    X!:  v   M     g7fr  r   r  s      rS   r  r    r  r  r  )r4   absr  r  )	abslstride_len_astride_len_ba_firstb_firstr  stride_lengthss	          rS   	index_cmp"pick_loop_order.<locals>.index_cmp  s    8q=E!HMuQx1}eAh!m44 .<<^rBE
^<-;<^rBE
^<  
7:<7V
 
  
7:<7V
 
  1y# =<s   B3B8r   r  )r  r   r  r   r   r   )		functools
cmp_to_keyr   r5  r  rc   r   pick_loop_orderssort)r  r  priority_idxr  orderpis   ``    rS   pick_loop_orderr    s      4 %N1$5 6789E
<17CD|.,|D

y
!L Es   Bc                  d    \ rS rSr% S\S'   SrS\S'   SrS\S'   SS jrSS	 jrSS
 jr	SS jr
Srg)NodeUseri  $Union[BaseSchedulerNode, OutputNode]rF   Fr   r  is_weakc                v    [        U R                  R                  5       U R                  U R                  45      $ rN   )rO   rF   r[   r  r  rQ   s    rS   rT   NodeUser.__hash__  s+    TYY'')4+;+;T\\JKKrV   c                    [        U[        5      =(       aa    U R                  5       UR                  5       :H  =(       a9    U R                  UR                  :H  =(       a    U R                  UR                  :H  $ rN   )rs   r  r[   r  r  rR   others     rS   __eq__NodeUser.__eq__  s[    uh' .5>>#33.  E$5$55. -		
rV   c                6    U R                   R                  5       $ rN   rk   rQ   s    rS   r[   NodeUser.get_name  rm   rV   c                    U R                   UR                   L d   e[        U R                   U R                  =(       a    UR                  U R                  =(       a    UR                  5      $ rN   )rF   r  r  r  r	  s     rS   r   NodeUser.merge  sP    yyEJJ&&&II2!2!2LL*U]]
 	
rV   r   Nr   )r
  objectr   r   r   )r
  r  r   r  )r^   r   r   r   r   r  r  rT   r  r[   r   r   r   rV   rS   r  r    s3    
..K GTL
$
rV   r  c                  .  ^  \ rS rSr% S\S'   S;S jrS;U 4S jjrS<S jr\S=S j5       r	\	R                  S>S j5       r	S?S	 jrS@S
 jrSAS jrS?S jrS?S jrS?S jr    SBS jrSCS jrSDS jrS?S jrS?S jrSBS jrS?S jr    SES jrS?S jrSFS jr      SGS jr    SBS jrSHSIS jjrSJS jr    SKS jr      SGS jr       SGS jr!      SGS  jr"        SLS! jr#      SMS" jr$SNS# jr%SGS$ jr&      SGS% jr'        SOS& jr(SPS' jr)SQS( jr*      SMS) jr+    SRS* jr,    SSS+ jr-S?S, jr.S?S- jr/S?S. jr0STS/ jr1SUS0 jr2SVS1 jr3SWS2 jr4      SXS3 jr5S?S4 jr6S?S5 jr7    SYS6 jr8SZS7 jr9S[S8 jr:S?S9 jr;S:r<U =r=$ )\rD   i  zDict[Dep, int]_Scheduler__dep_size_hint_cachec                p    [        S5         U R                  U5        S S S 5        g ! , (       d  f       g = f)NzScheduler.__init__)r   _initrR   r
  s     rS   r   Scheduler.__init__  s#    ./JJu 0//s   '
5c           
       >^  [         TT ]  5         0 T l        T [        R                  l        0 T l        [        [        5      T l	        [        5       T l        [        / [        R                  R                  R                  5       Q[        R                  R                  R                  5       Q[        R                  R                  R                  5       Q5      T l        U Vs/ s H  nT R#                  U5      PM     snT l        T R'                  5         T R                   R)                  [        R                  R                  R                  5       5        T R$                   H  nUR+                  5         M     T R-                  5       T l        T R$                   Vs0 s H  o"R1                  5       U_M     snT l        T R$                   VVs0 s H*  o3R5                  5         H  oDR1                  5       U_M     M,     snnT l        T R2                  R9                  5       T l        0 T l        0 T l        T RA                  5         T RC                  T R$                  5      T l        T RE                  5         T R$                   Vs0 s H  o"R1                  5       U_M     snT l        T RG                  5         [H        RJ                  " T R$                  T R6                  T R:                  5      T l        [L        =RN                  [Q        T R$                  5      -  sl'        [        RR                  RU                  T R$                  5        [Q        T R$                  5      T l+        T RY                  5         T RC                  T R$                  5      T l        [        5       T l-        [\        R^                  b%  [\        R^                  " T R$                  5      T l        T Ra                  T R$                  5      T l        [\        Rb                  (       a  SSK2J1n  U" T R$                  T R6                  T R:                  [g        [        R                  R                  R                  5       5      [g        [        R                  Ri                  5       5      5      T l        T Rk                  5         T Rm                  5         [\        Rn                  (       a%  [H        Rp                  " T R$                  5      T l        [\        Rr                  (       a  T Ru                  S S9  T Rw                  5         T Ry                  5         [        RR                  R{                  T R$                  5        [        RR                  R}                  T R$                  5        T R                  5         [        5       T l@        0 T lA        [        S5      R                  U 4S j5        g s  snf s  snf s  snnf s  snf )Nr   )reorder_for_peak_memory)num_ck_nodesgraph_statsc                 ^   > T R                   T R                  [        T R                  5      S.$ )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesrc   r
  rQ   s   rS   r   !Scheduler._init.<locals>.<lambda>m  s%     33+/+>+>*-djj/rV   )Drp  r   r  r>   rv   rE   backendsr  _post_grad_graph_counterr   r   r  r  keys	constantstorchbind_constantsr>  create_scheduler_noder
  update_zero_dim_cpu_tensorr6  r  get_donated_buffersr|   r[   rs  r   r}   copyrU  r   mutation_renamescompute_dependenciestopological_sort_scheduledead_node_eliminationcompute_ancestorsr   decide_global_ordering_of_commsr    ir_nodes_pre_fusionrc   rL  ir_pre_fusionr!  create_foreach_nodeslogged_slow_fusionr   _pre_fusion_custom_passr  r  memoryrr  get_output_namesmerge_loopsfinalize_multi_template_buffers reorder_for_compute_comm_overlap$reorder_compute_and_comm_for_overlapcombo_kernelscreate_combo_kernel_nodesprocess_grouped_nodescompute_last_usageir_post_fusiongraph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_row)rR   r
  r
  rF   r   r  r  s   `     rS   r  Scheduler._init  s4   %'" <>"&'?"@5?\!&0%%**,""'') ,,113'
# >CCUd003UC
'')##**177+<+<+A+A+CDJJDOO 
 $$& 	# &*ZZ;
%/JJL!OZ;
 -1JJ8
,6DBRBRBT3LLNCBTNJ8
 AE@Q@Q@V@V@X 35 13!!#33DJJ?
""$<@JJ"GJq::<?J"G ::JJ##

 	##s4::6#	djj)!$**o!!#33DJJ?
?I|))577

CDJ__TZZ0
))70

  ''AGG((--/0AGG,,./DJ 	,,.22CCDJJODJ***=""$!	tzz*	djj) 6@\! :<'//	
i D;
8
2 #Hs   #WW1WWc                   0 n[         R                  R                   Hg  n[        [         R                  R                  U   [        R
                  5      (       d  M?  [        U [         R                  R                  U   S S9X'   Mi     U$ )N)rH   )r>   rv   graph_inputs_originalrs   r   DonatedBufferr   )rR   name_to_donated_bufrP   s      rS   r*  Scheduler.get_donated_bufferst  sl     GG11D!''77=r?O?OPP,BGG11$7 $-#) 2 #"rV   c                6    [         R                  R                  $ rN   r>   rv   current_devicerQ   s    rS   rO  Scheduler.current_device  s    ww%%%rV   c                .    U[         R                  l        g rN   rN  rt  s     rS   rO  rP    s    !'rV   c                |    [         R                  R                  SS5      S:X  a  SSKJn  U" U R
                  SS9  gg)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr  rL  rU  r
  )rR   rU  s     rS   rC  Scheduler.debug_draw_graph  s1    ::>>:DASH+6 IrV   c                    [         R                  [        R                  5      (       a:  [         R	                  SU5        U R
                   H  nUR                  5         M     g g )Nz%s:)r   isEnabledForloggingINFOr   r
  r   )rR   labelrF   s      rS   debug_print_nodesScheduler.debug_print_nodes  sD    GLL))HHUE"

  " # *rV   c                P   UR                  5       c   S5       eUR                  5       (       a  [        X5      $ [        U[        R
                  [        R                  45      (       a  [        X5      $ [        U[        R                  5      (       a  [        X5      $ [        U5      e)Nz2All nodes passed to scheduling must have an origin)r  is_no_opr  rs   r   r)   r  r  r  r  rW  r`  s     rS   r(  Scheduler.create_scheduler_node  s    *	@?	@*==??)$55r00"2C2CDEE ,,boo..,T88%d++rV   c                   [        5       n/ nU R                  R                  5       n[        R                  R
                  R                  5        H  nU Vs/ s H0  nXS;   d  M
  [        U R                  U   [        5      (       a  M.  UPM2     nnU(       d  MI  UR                  U5        U Vs/ s H  oPR                  U   PM     nn[        R                  S:  n[        U USUS9nUR                  U5        U H  nXR                  U'   M     M     U R                   V	s/ s H  oR!                  5       U;  d  M  U	PM     sn	[#        U5      -   U l        g s  snf s  snf s  sn	f )Nr   Fr  r  )r   rU  r%  r>   rv   listsr   rs   rs  r  r6  r   combo_kernels_autotunerj  r  r
  r[   r   )
rR   removed_node_namesfe_nodeskept_node_namesnamesrP   r  r  fe_noderF   s
             rS   r4  Scheduler.create_foreach_nodes  sK   .8l11668WW]]))+E "!D*  #4#4#4T#:<RS !   %%e,:?@%$''-%F@$;;a?O0*/ /	G OOG$07''- 1 ,8 "ZZ
'T==?BT+TDZ
N
5 A
s$   	E# EE-E E ;E c                  ^ ^^^  [        S5      n " U4S jS[        U   5      m[        R                  " T5      mT R                   H  nUR                  5        H  nUR                  5       nUR                  5        He  nUT;   aD  UT;   a>  TU   nTU   nXg-   nTR                  5        H  n	TU	   UL d
  TU	   UL d  M  UTU	'   M     MM  UT;   a
  TU   TU'   M]  TU   TU'   Mg     M     M     SU U 4S jjm   S         SUU 4S jjjn
0 n[        R                  R                  R                  5        H=  u  p[        U[        R                  5      (       d  M&  UR                    H  nSX'   M	     M?     T R                   GH  n["        R%                  SUR&                  5        UR&                  c   e[)        UR&                  R+                  5       S S	9nU H?  n[        U[        R,                  5      (       d   eUU;  d  M,  UR                  5       UU'   MA     [)        UR&                  R/                  5       S
 S	9nU Hk  nUU;   d   U SU 35       eUU   =nc  M  T R0                  U   R                  5        H+  nUR3                  [5        UR                  5       5      5        M-     Mm     [7        UR8                  R:                  5      S:X  aQ  [=        [?        UR8                  R:                  5      5      =n(       a"  [        U[@        5      (       a  URB                  nOSnUR                  5        GH  n[7        URE                  5       5      S::  d   eURE                  5        H  nT " U5      nU
" UU5        UR3                  [5        UUS95        TU   R                   H  nUR                  5       UR                  5       :X  a  M'  [        UR&                  [F        5      (       d   eUR&                  RI                  5        H:  nT " U5      nUR3                  [K        UUR                  5       S95        U
" UUSS9  M<     M     M     GM     UR8                  RL                   H<  n[        U[J        5      (       a  M  U
" URN                  X"RQ                  U5      5        M>     URS                  T RT                  5        UR                  5        H  nURE                  5        Hz  nUR                  5       T RT                  T " U5      '   UR                  5       T RT                  U'   T RV                  RY                  UU5      T RV                  UR                  5       '   M|     M     GM     [        R                  R[                  5        H4  n["        R%                  SU5        U
" U[]        [5        U5      5      5        M6     [        R                  R^                   H  nUR/                  5        H  nUU;   d   U SUR                  5        35       eUU   =n(       d  M1  T R0                  U   RI                  5        H5  n["        R%                  SUU5        U
" U[]        [5        U5      5      5        M7     M     M     T RT                   H  nU[        R                  R                  ;   aF  U
" U[]        [5        U5      5      5        [        R                  R`                  Rc                  U5        Mg  U[        R                  Rd                  ;   d  M  U
" U[]        [5        U5      5      5        M     [g        [        R                  R                  R                  5       5       VVs0 s H  u  noU_M
     nnn[        R                  R`                   Vs/ s H  nUU   PM
     sn[        R                  l4        T R                   HF  nUR                  5        H/  nURk                  TUR                  5          R                  5        M1     MH     T Rl                   H.  nT Rl                  U   Rk                  TU   R                  5        M0     gs  snnf s  snf )zQ
Create dependency edges between nodes, handling aliasing and
mutation properly.
Tc                  P   > \ rS rSrSr  S     S	S jjrS
S jrSU 4S jjrSrg)1Scheduler.compute_dependencies.<locals>.DedupListi  a  
This data structure behaves like a list except it makes sure the
elements remain unique.
Normally one could use a OrderedSet/dict for this purpose however
the list in question gets elements appended as it is being
iterated over which means that we need to keep the list
semantics.
Nc                T    U=(       d    / U l         U=(       d
    [        5       U l        g rN   )itemsr   
membership)rR   rs  rt  s      rS   r   :Scheduler.compute_dependencies.<locals>.DedupList.__init__  s    
 #[b
","<
rV   c                    XR                   ;   a  g U R                  R                  U5        U R                   R                  U5        g rN   )rt  rs  r  r7  )rR   	node_users     rS   r  8Scheduler.compute_dependencies.<locals>.DedupList.append  s3    /

!!),##I.rV   c                   > [         R                  " U R                  UR                  5      nU R                  UR                   Vs/ s H  o3U R                  ;  d  M  UPM     sn-   nT" XB5      $ s  snf rN   )r   r  rt  rs  )rR   r
  new_membershipr  	new_items	DedupLists        rS   __add__9Scheduler.compute_dependencies.<locals>.DedupList.__add__  sc    !+!1!1$//5CSCS!T JJ${{**!t.FA{* 	 !;;*s   A0A0)rs  rt  r  )rs  zOptional[List[T]]rt  zOptional[OrderedSet[T]]r   r   )rw  ro  r   r   )r
  DedupList[T]r   r  )	r^   r   r   r   rg  r   r  r}  r   )r|  s   rS   r|  rq    s@     ,06:=(= 4= 	=/< <rV   r|  c                R   > U TR                   ;   a  T" TR                   U    5      $ U $ rN   )r,  )r
  r  rR   s    rS   r  .Scheduler.compute_dependencies.<locals>.rename  s,    D)))d33A677HrV   c                N   > TT" U 5         R                  [        XU5      5        g rN   )r  r  )used_by_name	user_noder  r  name_to_usersr  s       rS   add_user0Scheduler.compute_dependencies.<locals>.add_user  s'     &./669rV   Nzscheduling %sc                    U R                   $ rN   r*  r  s    rS   r   0Scheduler.compute_dependencies.<locals>.<lambda>'      AFFrV   r  c                    U R                   $ rN   r*  r  s    rS   r   r  2  r  rV   z not in r   )r  )mutating_bufT)r  zscheduling output %sz+scheduling output %s for unbacked symint %s)r
  r   r   r   )FF)
r  r   r  r  r  r   r  r   r   r   )7r   r
   r  r   r
  r   r[   r`   r%  r>   rv   r  rs  rs   r  r  free_symbolsr   rL  rF   r  get_unbacked_symbol_defsSymbolget_unbacked_symbol_usesrs  r  r'   rc   r   r   r  r  r&   r  rb   rG   rd  r(   r   rP   r  r  r,  r   r  r8  r   graph_outputsmutated_inputsr7  r&  r.  mutated_input_idxsr   r|   )!rR   ro  rF   buf1	buf1_name	buf2_namelist1list2combinedr  r  unbacked_symbol_to_origin_noderP   valfsunbacked_symbol_defsr  unbacked_symbol_usesr~  r   r  	node_modealt_namerg   
other_namer  rk  r   r  	inp_namesr|  r  r  s!   `                             @@@rS   r-  Scheduler.compute_dependencies  sS    CL	<
 	<> @K?V?V@
 JJD((* MMO	!%!1!1!3I M1i=6P -i 8 -i 8#(=#0#5#5#7C -c 2e ;#0#5#>5=c 2 $8 #m33@3Ki03@3Ki0 "4 + (	 	 !&!			;	 	 		
 	 	 MO&
 --335ID#uzz****B9=26 + 6
 JJDIIotyy1 99(((#)		224:J$  *!!U\\2222 ::8<215 * $*		224:J$  *77BS!? @AB77::AG#003??A))'#,,.*AB  B * D$$++,1 d&6&6&=&=!>??S?sI..HH	 	 '')3,,./1444 # 1 1 3H%h/HXt,%%ghY&GH -h 7 = ===?dmmo=$)$))5FGGGG*.))*D*D*FJ)/
);J -- '
 P %ZtD +G !> !4 *, ((..!$00TYY.>.>t.DE / %%d&;&;< '') # 1 1 3H>AllnD))&*:;69llnD))(3 //33HhG ++ !4 *I Z 002HII,h7Xz'(*;<= 3
 77((C11377IS!?!D!D!F GHI76q9919$($5$5a$8$I$I$K		I8UV !:gh6G+HI	 %L 4 ) ))Dqww+++z'$-89&&**40***z'$-89 * ,5QWW5I5I5N5N5P+Q
+QKE4%K+Q 	 
 )*(>(>&
(>IdO(>&
"
 JJD'')mCLLN;AAB *  //D''-77d8K8Q8QR 0
&
s   (aac                  ^	 / n[        U R                  5       GH  nSS jm	SnUR                  5        H  n[        U	4S jUR                   5       5      nU(       a]  [
        R                  SUR                  5       5        [        R                  R                  R                  UR                  5       5        M  SnM     UR                  5       (       + =(       a    U(       + nU(       d  UR                  U5        M  [
        R                  SUR                  5       5        [        R                  R                  R                  UR                  5       5        UR                  R                    H  nUR"                  U R$                  ;   d  M  U R$                  UR"                     R                  nU Vs/ s H2  oR&                  R                  5       UR                  5       :w  d  M0  UPM4     snU R$                  UR"                     l        M     GM     [)        [        U5      5      U l        U R                   H  nUR+                  5         M     gs  snf )	z 
Remove any nodes without users
c                ~    U R                   =(       d+    U R                  5       [        R                  R                  ;   $ rN   )r  r[   r>   rv   rF  )rg   s    rS   can_eliminate_user;Scheduler.dead_node_elimination.<locals>.can_eliminate_user  s&    ||Tt}}!'':T:T'TTrV   Fc              3  4   >#    U  H  nT" U5      v   M     g 7frN   r   )r  ur  s     rS   r  2Scheduler.dead_node_elimination.<locals>.<genexpr>  s     #M9a$6q$9$99s   zremoved dead buffer: %sTzremoved dead operation: %sN)rg   r  r   r   )r5  r
  r   r  rK   r   rL  r[   r>   rv   r  r7  r  r  rF  r   r   rP   r}   rF   r   rQ  )
rR   updated_nodesrF   active_buffersr   can_eliminater  rK   r  r  s
            @rS   r/  Scheduler.dead_node_elimination  s    TZZ(DU #N'') ##M399#M M II7HGG++//?%)N * !% 5 5 77N<NM $$T* 		6H**..t}}? ,,22DyyD$4$44 $ 0 0 ; A A',=',!0AT]]_0TAu=((39 3- )8 (=12
 JJD  " =s   4/I'Ic                   ^^^^ [        5       m[        5       m/ mSUUUU4S jjmU H  nUR                  5        H  nUTU'   M
     M!     U H  nT" U5        M     T$ )z/
Ensure nodes is in topologically sorted order
c                   > U T;  af  TR                  U 5        [        U R                  S S9 H*  nUR                  T;  a  M  T" TUR                     5        M,     TR	                  U 5        g g )Nc                    U R                   $ rN   r*  )ds    rS   r   DScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>  s    affrV   r  )r7  r  r   rP   r  )r
  r  rs  rf   seenvisits     rS   r  2Scheduler.topological_sort_schedule.<locals>.visit  sa    }!!"6"6<LMCxx|3 ,sxx01	 N
 a  rV   )r
  rG   r   r   )r   r  rd  )rR   r
  rF   rP   rs  rf   r  r  s       @@@@rS   r.  #Scheduler.topological_sort_schedule  sc     /9l59V*,	! 	! D--/%)T" 0  D$K rV   c                  ^  [        5       n[        U[        [        [        [
        45      (       a/  UR                   H  nUR                  UR                  5        M      O[        S[        U5       S35      eU 4S jU 5       n[        U Vs1 s H   nT R                  UR                  5          iM"     sn5      $ s  snf )Nz+get_unmet_dep_nodes is not implemented for .c              3  V   >#    U  H  nTR                   U   R                  v   M      g 7frN   )r}   rH   r?  s     rS   r  1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>  s"     Qjs))#.::js   &))rr  rs   r  r  r  r  r   r7  rP   RuntimeErrorr]   r   rU  r[   )rR   r  
unmet_depsr  unmet_dep_opsr
  s   `     rS   _get_unmet_dep_nodesScheduler._get_unmet_dep_nodes  s    U
)&"	
 
 //sxx( 0 =d5k]!L  RjQMRMqT,,QZZ\:MRSSRs   'B;c                   / n[         R                  U R                  S5      n0 nU R                   HQ  nU R                  U5      n[	        U5      X$'   U H*  nUR                  U/ 5      nUR                  U5        XsU'   M,     MS     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  UR                  U
5        U
 H9  nUR                  U/ 5       H  nX+==   S-  ss'   M     UR                  U5        M;     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  M  U(       a   S5       eU$ s  sn	nf s  sn	nf )zE
Sort nodes by their topological order, return a list of node lists.
r   r   zTopological sort failed!)	r  fromkeysr
  r  rc   r  r  rs  r6  )rR   r  r
  childrenrF   r:  r  cr
  vzero_deg_nodesrg   s               rS   r  !Scheduler._topological_sort_nodes  s,    djj!,#%JJD,,T2Dd)EKLLb) !   ).@a!@LL(#$LLB/DK1$K 0		! $ -2KKMDMDAQ!VaMND n 444y A Es   E)EE,Ec                ~   0 nU R                    H  n[        5       nUR                   HL  nU R                  UR                     R
                  R                  5       nUR                  U5        X1U   -  nMN     X1UR                  5       '   X2l        M     [        U R                   5       H  u  pbXbl
        Xbl        M     g)z
Populate each node.ancestors
N)r
  r   r   r}   rP   rH   r[   r7  r   r.  r   r   )rR   name_to_ancestorsrF   r   r  dep_node_namer  s          rS   r0  Scheduler.compute_ancestors		  s    
 9;JJD)3I.. $ 0 0 : F F O O Qm,}==	 / 2;dmmo.&N  %TZZ0KE"N"N 1rV   c                   U R                    H  n[        R                  (       d  M  [        U[        [
        45      (       a)  UR                  5       (       d  [        R                  S:w  a  M`  UR                  5        Hx  n[        U[        5      (       a  UR                  5       (       a  M/  UR                  R                  5       Ul
        UR                  R                  Ul        UR                  SS9  Mz     M     g )NhalideTr  )r
  r   r  rs   r  r  r;   cpu_backendr_  r  r  r9  r  r  r  )rR   rF   r  s      rS   r9  Scheduler.merge_loops	  s    JJD44 d]4F$GHHKKMMf&8&8H&D)!%775;L;L;N;N#kk557${{00
 **T*: * rV   c                z   [        S5         [        S5       H  n[        U5      n[        R	                  SUS-   U5        U R                  U5      n[        U5      n[        R	                  SUS-   UU5        XC:X  d  US:X  d  Ml  [        R	                  SUS-   5          O   UsSSS5        $ ! , (       d  f       g= f)z2
Combine eligible nodes into FusedSchedulerNodes.
zScheduler.fused_nodes
   z/===== attempting fusion (%d/10): %d nodes =====r   z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)r   r  rc   rK  rL  fuse_nodes_once)rR   r
  r/  old_lennew_lens        rS   r  Scheduler.fuse_nodes=	  s     122Ye*  EE
 ,,U3e*  TE	 %A$$Eq1u ' ( + 322s   A4B,B,,
B:c                    / nU R                    H:  nUR                  [        U[        5      (       a  UR	                  5       OU/5        M<     Xl         g)z1
Unpack GroupedSchedulerNode into regular nodes.
N)r
  r9  rs   r  r  )rR   	new_nodesrF   s      rS   r?  Scheduler.process_grouped_nodesX	  sF     .0	JJD!+D2F!G!GdV  
rV   c                    [        U5      S:  d   eUS   R                  5       nX l        U R                  U5      n[	        S5         UR                  U5      sSSS5        $ ! , (       d  f       g= f)k
Benchmark fused list of nodes and return the execution time
in milliseconds on randomly generated inputs.
r   benchmark_fused_nodesN)rc   ro  rO  r  r   r  )rR   r
  ru  backends       rS   r  Scheduler.benchmark_fused_nodesc	  s_     5zA~~q$$&$""6*12007 322s   A""
A0c                         SS jn[        U R                  5       GHc  u  p#[        U[        5      (       d  M  [        UR                  [
        R                  5      (       d  MH  UR                  n[        R                  R                  (       d  UR                  5       u  pVO"[        S UR                   5       S 5      nUc   e[        U[        R                  R
                  R                  5      (       a  UR                  R!                  U5        M  UR#                  5       nUR$                  n[        U[
        R&                  5      (       d   eUR$                  n	[        U	[
        R(                  5      (       d   eUR*                  U	l        U" XI5        U R-                  U	5      n
XR                  U'   XR.                  UR1                  5       '   XR2                  UR1                  5       '   [5        U
R7                  5       UR7                  5       5       H2  u  pXR8                  UR1                  5       '   UR:                  Ul        M4     UR<                  U
l        UR>                  U
l        UR@                  U
l         GMf     g )Nc                   UR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   eUR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   e[        R
                  R                  U	 X1l        [        R
                  R                  U	 XQl	        [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   g rN   )r[   rs   r   rY  r>   rv   r8  rP   
name_to_opoperation_namebuffersr  remove
operations)	orig_noder  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          rS   replace_operation_bufferKScheduler.finalize_multi_template_buffers.<locals>.replace_operation_bufferr	  s_    !) 1 1 3%..0MmS11jARTW6X6XXX'::<$779LlC00Z@PRU5V5VVV&&'89)M""#34&2#77??((3DGGOO""8,$,AGGOOD!4<AGG""=177%%++I6DGG%%h/'/AGGt$/7AGG|,rV   c              3     #    U  H<  n[        U[        R                  R                  R                  5      (       d  M8  Uv   M>     g 7frN   )rs   r   r   select_algorithmExternKernelCaller)r  timings     rS   r  <Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>	  s7      *C) & % @ @ S S  #F*Cs
   7A	A)r  zir.MultiTemplateBufferr  zir.OperationBufferr   r   )!r.  r
  rs   r  rF   r   MultiTemplateBufferr   test_configs%force_extern_kernel_in_multi_templateget_min_choicer  choice_timingsr   r   TritonTemplateCallerBasefinalize_as_triton_calleroutput_noder   
StorageBoxOperationBufferr_   r(  rs  r[   rU  r  r   r}   rK   r   r   r   )rR   r  r/  rF   
multi_nodemin_node_unfusedr  out_tensorboxout_storage
out_buffernew_scheduler_nodenew_outold_outs                rS   r:  )Scheduler.finalize_multi_template_buffersq	  s   	8-	89K	8	86 !,GA$..:		2114 4 "YY
**PP*4*C*C*E'$a'+*4*C*C 
($ ,777$OO&&??  II778HI 0 < < >+00!+r}}====(--
!*b.@.@AAAA$.$5$5
!(@%)%?%?
%K" 2

15G!!$--/2;M''8(+&224d6F6F6H)$G <C$$W%5%5%78$+MMGM	) 04~~",/3~~",04"-e -rV   c                &    [        S U 5       5      $ )Nc              3    #    U  H  n[        UR                  S 5      =(       a_    UR                  SL=(       aJ    [        UR                  R                  S5      =(       a#    UR                  R                  R                  S:H  v   M     g7f)r   Nscatter_moder  )ry   rF   r   r  r	  s     rS   r  ,Scheduler._any_atomic_add.<locals>.<genexpr>	  sp      

 	 AFFF# 9d"9^49 ((L89 s   B	B)r  rR   	node_lists     rS   _any_atomic_addScheduler._any_atomic_add	  s     

 
 
 	
rV   c                  ^^^^^^^^ TR                  5       =(       a(    [        TR                  5       [        R                  5      n[
        R                  (       d  U(       d  gTR                  5       (       a-  [        TR                  5       [        R                  5      (       a*  TR                  5       (       d  TR                  5       (       a  gTR                  5       nUS   R                  5       nU(       d   eUR                  S:X  a  gTR                  5       n[        [        R                  " XF5      5      nU R                  U5      (       a  gSSKJn  [%        TT5      n	SUU4S jjn
[        T[&        5      (       Gak  [        TR(                  [        R                  5      (       GaA  TR(                  nUR*                  nUR-                  5       u  nmU R/                  U5      u  mm[1        S5      nSnSn[3        UR5                  5       S S	9 H  u  nn[        U[6        R8                  R                  R:                  5      (       d  M;  UTT-   :  a    OaUS
-  nU[
        R<                  :  a    OFTR(                  R?                  U5         U R/                  U5      u  mnTU:  a  TnUnSSS5        M     U
" UTT5        UTT-   :  a  Ub  TR(                  RA                  U5        gg U R/                  U5      u  mm[B        RD                  " T5      (       a	  U	" S5        gU R/                  U5      u  mm[B        RD                  " T5      (       a	  U	" S5        gU R/                  U5      u  mm[B        RD                  " T5      (       a	  U	" S5        g U
" TTT5        [I        S5      (       a[  TTT-   :  aR  TT4U RJ                  ;  a@  U RJ                  RM                  TT45        [O        S5      RQ                  UUUUUU4S j5        TTT-   :  $ ! , (       d  f       GM  = f! U a  nS[G        U5      ;   a   SnAge SnAff = f)o
If config.benchmark_fusion is False, always return True.
Otherwise, return True if fusion can brings speedup.
Tr   rs  CompilationErrorc           
     z  > [         R                  [        R                  5      (       a  XU-   :  aE  [         R	                  STR                  5       TR                  5       [        X-   U -  S 5      5        g [         R	                  STR                  5       TR                  5       [        XU-   -  S 5      5        g g )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)rK  r[  r\  DEBUGrL  rd  r0   r1   )ms_fusedms1ms2rD  rE  s      rS   
log_fusion/Scheduler.speedup_by_fusion.<locals>.log_fusion	  s    &&w}}55Ci'$$S..0..0"syH&<S%AC	 $$W..0..0 Hc	$:3#?A	 6rV   infNc                    U S   $ rq  r   r  s    rS   r   -Scheduler.speedup_by_fusion.<locals>.<lambda>
  s    adrV   r  r   Fz%register spilling of the first kernelz&register spilling of the second kernelz%register spilling of the fused kernelLoop-carried variableslow_fusionc            	     $   > TT TTTTTT T-   -  S.$ )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior   )r  r  r  path1path2
path_fuseds   rS   r   r  Q
  s&    $)'*$)'*)3,4'/39'=rV   )r  r@  r  r@  r  r@  r   r   ))r  rs   r0  r   r  r   benchmark_fusionTritonTemplateBufferr  r_  ro  r]   r   r-  r.  r  triton.compiler.errorsr  rC  r  rF   r  r  r  r@  r  rs  r   r   r   max_epilogue_benchmarked_choicesswap_as_triton_callerr  mathisinfr   r   r5  r7  r   rF  )rR   rD  rE  is_multi_templatenode_list_1ru  node_list_2node_list_fusedr  r  r  r  r  r  min_ms_fusedms_fused_choicetriton_choiceschoiceunfused_timer  r  r  r  r&  r'  r(  s    ``                 @@@@@@rS   speedup_by_fusionScheduler.speedup_by_fusion	  s    "--/ 
J##%r'='=5
 &&/@ u668":Q:QRR!!!! oo'Q**,v ;;%oo'y{HI
 00;u%	 	" e]++
JJ..1
 1
 J'66N..0FAs33K@JC <L"ON(.$$&N)$ "&%//*<*<*U*UVV39,!#!F$K$KK ZZ55f="&"<"<_"MKHa,.'/*0 >=), |S#. sSy)o.I

44_E!77D
U::c???@ !77D
U::c??@A '+'A'A/'R$*::h''?@  ( 	8S#&#M22C#I%d&=&==##''7]+33 
 #)##o >=< $ *c!f4s6   P27P+ *7P+ "7P+ 
P(	+Q1QQQc                P   [        U5      n[        R                  [        R                  5      (       aD  [        R                  S5        U H)  n[        R                  SUR                  5       -   5        M+     U R                  U5       GHf  u  pEU R                  UR                  5          nU R                  UR                  5          nU R                  XE5      (       d  MX  U R                  XE5      (       a  Mp  U R                  XE5      (       d  M  [        R                  SUR                  5       UR                  5       5        UR                  5       nU R                  U5      R!                  XE5      nUR#                  U5        UR#                  U5        UR%                  U5        U R                  R'                  UR)                  5        Vs0 s H  oR                  5       U_M     sn5        GMi     [+        US S9nU R-                  U5      nU R/                  U5        U$ s  snf )z
Combine eligible nodes into FusedSchedulerNodes.

This relies on two key functions to control the logic:
    - self.can_fuse(): checks if a fusion is legal
    - self.score_fusion(): assigns priority to a given fusion
zfuse_nodes_once, candidates:z  zfusing %s with %sc                    U R                   $ rN   r   r  s    rS   r   +Scheduler.fuse_nodes_once.<locals>.<lambda>
  s    !++rV   r  )r   rK  r[  r\  r  rL  r   get_possible_fusionsrU  r\  r|  will_fusion_create_cycler9  r[   ro  r  r  r  r7  r6  r_  r  r.  rV  )	rR   r
  r  rF   rD  rE  ru  node3r
  s	            rS   r  Scheduler.fuse_nodes_once]
  s    !'""7==11;<#  (<(<(>!>? $ 55e<LE++E,@,@,BCE++E,@,@,BCE}}U**43P3P4 4 --e;;  ')95>>;K
 ))+((055eC""5)""5)&''..27//2CD2CQZZ\5(2CD% =* {(=>..u5!!%( Es   H#
c                   [        U R                  5      nSn[        U R                  5      n[        R	                  SU5        [        [        R                  U 5      5       GH(  u  pV[        R                  U5      n[        U5      S:  a  M,  Ub  X1:  a    OU R                  U5      (       d  [        R	                  SU5        Md  US-  n[        R                  S:  n[        US   R                  USUS9n[        R                  S	[        U5      U5        U H  n	UR                  U	5        M     UR                  U5        U R                   R#                  UR%                  5        V
s0 s H  oR'                  5       U_M     sn
5        GM+     [)        US
 S9U l        U R+                  U R                  5      U l        [        R                  SUU[        U R                  5      5        U R-                  U R                  5        gs  sn
f )z
Groups parallel nodes
r   z2ComboKernels: Generating with num_ck_nodes = %d...r  Nz)ComboKernels: Not speeding up %d-th groupr   Tre  z0ComboKernels: Combining %d nodes for %d-th groupc                    U R                   $ rN   r   r  s    rS   r   5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>
  s    q{{rV   r  zEGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodels)rr  r
  rc   r   rL  r.  rj  r  r  speedup_by_combo_kernelr   rg  rE   r   r  r7  rU  r6  r_  r[   r  r.  rV  )rR   r  r  countnum_nodes_orignumr
  r  r  rF   r
  s              rS   r>  #Scheduler.create_combo_kernel_nodes
  s    $**oTZZ		FU'&DDTJ
NC 3CCINI9~!'E,@//	::		EsKQJE$;;a?O4!&&*. /	K HHBI
 """4( "OOK(##**4?4I4I4KL4Kq{*4KL7
< K-BC
33DJJ?
S

O		
 	!!$**- Ms   (H
c                L    U H  nUR                  U R                  5        M      g rN   )rV  rU  )rR   r
  rF   s      rS   rV  Scheduler.prune_redundant_deps
  s     D%%d&=&=> rV   c                  ^ ^	^
 / m	[        5       m
SU	U
U 4S jjn[        R                  " [        5      nU HE  nT R	                  U5      (       a  M  UR                  5        H  nX5   R                  U5        M     MG     UR                  5        H  nU" U5        M     [        R                  (       ak  [        R                  " [        5      nU H,  n[        USS5      nU(       d  M  Xx   R                  U5        M.     UR                  5        H  nU" U5        M     T R                  T	5      m	T	R                  T R                  SS9  [        R                  S[!        T	5      5        T	$ )zN
Helper to find all legal fusion opportunities, sorted by self.score_fusion()
c                  > [        U 5       H  u  pXS-   S   H  nX#4nUT;   a  M  TR                  U5        TR                  X#5      (       a  TR                  U5        MH  UR	                  5       (       d  UR                  5       (       d  Mt  TR                  X25      (       d  M  TR                  X245        M     M     g rq  )r.  r7  r|  r  r  r  )r
  node1_indexrD  rE  r  possible_fusionsr  rR   s        rS   check_all_pairs7Scheduler.get_possible_fusions.<locals>.check_all_pairs
  s    &/&6""?#45E .Cd{ HHSM}}U22(//4++--1A1A1C1CJ J )//? 6 '7rV   r   NT)r  reversezfound %d possible fusionsr
  r  r   r   )r   r  r   r   unfusable_noder/  r  r   r   aggressive_fusionr   *get_possible_fusions_with_highest_priorityr  score_fusion_keyrK  rL  rc   )rR   r
  rP  buffer_names_groupingrF   r   node_groupinggroup_groupingr   rO  r  s   `        @@rS   r>  Scheduler.get_possible_fusions
  sD    HR	@ 	@  !, 7 7 =D""4((--/%*11$7 0 
 399;MM* < ##(44T:Ngt45")006  "0!6!6!8. "9  JJ
 	$"7"7F4c:J6KLrV   c                  ^ ^^^^ [        5       mSUUUU U4S jjmUR                  5       R                  R                  5       UR                  5       R                  R                  5       -  mUR                  R                  R                  5       UR                  R                  R                  5       -  T-
  m[        UU 4S jT 5       5      nU(       a  [        X5      " S5        U$ )zf
Finds whether there's a path from node1 to node2 (or vice-versa)
caused indirectly by other fusions.
c                ,  > [        U [        5      (       a~  U T;  ax  TR                  U 5        U R                  5       R	                  T5      (       a  g[        TU R                  -  5      =(       d#    [        UU4S jU R                  T-
   5       5      $ g)NFc              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7frN   rU  r  r
  
found_pathrR   s     rS   r  IScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>  s,      H!DA #4#:#:1#=>>!D   "%)rs   r  r7  ra  issubsetr   r   r  )rF   combined_ancestorscombined_namesra  rR   visiteds    rS   ra  6Scheduler.will_fusion_create_cycle.<locals>.found_path
  s    $ 233G8KD!++-667IJJ !   ?@ C H!%2D!DH E  rV   c              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7frN   r_  r`  s     rS   r  5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>  s&     WDVqJt66q9::DVrc  zwill create cyclerF   rG   r   r   )rr  ra  _dictr%  r   r  rC  )rR   rD  rE  cyclere  rf  ra  rg  s   `   @@@@rS   r?  "Scheduler.will_fusion_create_cycle
  s     ,/5	 	2 %%'--224'')//4467 	
 OO!!&&(5??+@+@+E+E+GG WDVWWe#$78rV   c                  ^  SSK Jn      S	U 4S jjnU" U5      nU" U5      nU Vs1 s H
  os" U5      iM     nnU Vs1 s H
  os" U5      iM     n	nUR                  U	5      n
SnU
 H  n U[        US   5      -  nM     T R                  X5      n[        R                  R                  R                  USU-  5      (       a  ggs  snf s  snf ! [         a       gf = f)
a  
Return true if fusing the two nodes can potentially increasing peak memory.

The implementation is more like a heuristic since we don't really know if we are at peak
or not when trying to fuse these two ndoes. The order of nodes may change later which makes the
peak memory estimation hard.

Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
1. find all buffers read by each node with a single user. These buffers are supposed to
   be reused if we don't fuses these 2 nodes
2. find the intersection of these buffers for the two node and sum the total buffer size.
   If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
   Note that the extra memory allocation is not necessarily causing peak memory increase.
   This is just a heuristic.

We return true only if the saving for fusion can not trade off the extra memory allocation.
r   r  c                P  > / nU R                   R                   H  nTR                  R                  UR                  5      nU(       d  M1  [        UR                  5      S:X  d  ML  UR                  R                  5       (       d  Mm  UR                  UR                  5        M     U$ rq  )
r   r   r}   r  rP   rc   rK   rF   has_tensor_outputr  )rF   r   rv  r   rR   s       rS   _find_single_user_inputsKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputs1  sw     F&&,,&&**277333syy>Q.3883M3M3O3OMM#((+ - MrV   r   r  F    T)rF   rG   r   zList[ir.Buffer])
r  r  intersectionr   r  score_fusion_memoryr>   rv   r  statically_known_gt)rR   rD  rE  r  rr  lhs_dep_nodesrhs_dep_nodesr   lhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr  	bw_savings   `             rS   can_fusion_increase_peak_memory)Scheduler.can_fusion_increase_peak_memory  s    * 	6	#		 1707;HI=C*3/=I;HI=C*3/=I*77G$C3s1v;. % ,,U:	 77//iPP% JI  s   CC
+C
CCc                    [        [        UR                  UR                  -
  5      [        UR                  UR                  -
  5      5      nUS:  $ )a  
This function prevents fusion for nodes that can increase memory
footprint. This problem is more common in horizontal fusion, where nodes
that are far apart in the original order get fused, lengthening the live
intervals of tensors. This is very evident in models with activation
checkpointing, where the recomputed nodes from different checkpointed
regions get fused and significantly increase the memory footprint.

The current attempt is a quick, possibly hacky, heuristic to prevent the
fusion of nodes that are far away in the original order.

A better but difficult to implement heurisitic would be to use live
intervals of the buffers, find region of peak pressure in the original
program and prevent fusion that crosses that peak region. We might need
special care or good approximation in this implementation, as fusion of
node changes live intervals, and re-computing live intervals and peak
memory after each fusion can introduce large compilation overhead.
@   )r  r  r   r   )rR   rD  rE  proximity_scores       rS   are_long_distant_nodes Scheduler.are_long_distant_nodesS  sE    * %//12%//12
 ##rV   c                   0 nUR                   R                  5        Vs0 s H  oUR                  U_M     nnUR                   R                  5        Vs0 s H  oUR                  U_M     nnU GHj  n[        R                  R                  U5      n	Xh   n
Xx   nU
R                  5       UR                  5       :w  a)  SU
R                  5        SUR                  5        3UU'   Mv  [        U
R                  5      [        UR                  5      :w  a  SXH'   M  [        U
[        5      (       a  [        U[        5      (       d  S[        U
5       S[        U5       3UU'   M  U
R                  5       nUR                  5       nX:w  a  SU SU 3XH'   GM#  U
R                  5       UR                  5       :X  a  SU
 SU 3XH'   GMR  SU
 SU SU	R                   3UU'   GMm     [        U5      $ s  snf s  snf )	ze
Try to decide reasons why fusion fail due to no shared memory even though
there are common buffers.
zdifferent numel: z v.s. 	broadcastznot MemoryDep: zdifferent offset: zMismatch loop orders: zUnknown reason: z
. Layout: )r   r  rP   r>   rv   r  r  r=   r  rs   r&   r]   
get_offsetnormalize_with_stride_orderr_   r   )rR   rD  rE  common_buf_namesreasonsr  node1_name2depnode2_name2deprk  r   lhs_deprhs_deplhs_offrhs_offs                 rS   decide_fusion_fail_reason#Scheduler.decide_fusion_fail_reasonn  s    383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX(H''$$X.C$.G$.G  "g&7&7&99 ((9(9(;'<F7CTCTCVBWX   W\\*mGLL.II$/!gy11GY9W9W &d7m_F4=/J  ((*G((*G! '9	y$Q! 3356689 '=WIVG9$U!
 #7)6'*SZZLQ O )V 7|] YXs   G'G,c                   [         R                  (       a  [        S X4 5       5      (       a  gUR                  R	                  5       nUR                  R	                  5       nX4-  nU(       d  gUR                  R                  5        Vs0 s H  ofR                  U_M     nnUR                  R                  5        Vs0 s H  ofR                  U_M     nn/ n	U Hw  n
Xz   nX   nUR                  5       UR                  5       :X  d  M/  U	R                  [        R                  R                  R                  UR                  5       SS9UU45        My     [        U	5      S:X  a  g[        U	S S9u  pnUR                   UR                   :w  a4  UR#                  5       UR#                  5       :X  a  U R%                  U5      $ gUR'                  5       (       d  UR)                  X5        OZUR'                  5       (       d  UR)                  X5        O3[*        R-                  SUR/                  5       UR/                  5       5        U R1                  X5      $ s  snf s  snf )z
Right now just greedily reorder the loop of node1 to be compatible with node2,
but ideally we should have some heuristics to reorder the loop for node2
to be compatibile with node1 if that's more efficient.
c              3  @   #    U  H  oR                  5       v   M     g 7frN   )rv  r	  s     rS   r  >Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>  s      8
 .1HHJJr  r   r  c                    U S   $ r  r   r  s    rS   r   =Scheduler.shared_data_after_reordering_loop.<locals>.<lambda>  s    !rV   r  z?Don't reorder loops since both nodes are reductions: %s v.s. %s)r   r  r  r   buffer_namesr  rP   r  r  r>   rv   r  r  r  rc   r  r  r  dep_size_hintr{  r  r  rL  r[   rv  )rR   rD  rE  node1_buffer_namesnode2_buffer_namescommon_buffer_namesr  r  r  
candidatesbuffer_namer  r  numels                 rS   !shared_data_after_reordering_loop+Scheduler.shared_data_after_reordering_loop  s"    00C 8
!&8
 5
 5
 "..;;="..;;=0E"383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX 
.K$1G$1G3356689 !!((2273D3D3FQR2S / z?a #&jn"Ew///
   "g&7&7&99))'22 !!##++G=##%%++G=##Q   ''55_ YXs   I?Ic                f    [        U[        [        45      =(       a    UR                  5       (       + $ )z.
Is this node unfusable under any conditions.
)rs   r  r  r  r`  s     rS   rT  Scheduler.unfusable_node  s.    
 t79OPQ '$$&&	
rV   c                   XL a  g[        X5      n[        U[        5      (       d  [        U[        5      (       a	  U" S5        g[        U[        [        45      (       a  UR                  5       (       d	  U" S5        g[        U[        [        45      (       a  UR                  5       (       d	  U" S5        gUR                  5       UR                  -  (       a	  U" S5        gUR                  5       (       a	  U" S5        gUR                  5       (       aH  UR                  5       (       d*  UR                  5       (       d  [        R                  (       d	  U" S5        gUR                  5       [        R                  R                  -  (       d0  UR                  5       [        R                  R                  -  (       a	  U" S5        gUR!                  5       nUR!                  5       nXE:w  a
  U" S	XE5        gAU R#                  X5      nUS
:X  a  U R%                  X5      n[&        R)                  [*        R,                  5      (       a4  [&        R/                  SUR1                  5       UR1                  5       U5        [        R2                  R5                  XX&5      (       d  gUR                  5       UR                  -  (       a_  U R7                  X5      =(       aG    [        R2                  R7                  XX&5      =(       a     U R9                  U5      R7                  X5      $ [        R2                  R;                  XX&5      =(       a     U R9                  U5      R;                  X5      $ )zR
Determine if it is possible to combine node1 and node2 into a
single fused node.
Fz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2z!templates can only fuse epiloguesztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)r   z%s and %s has %s shared data)rC  rs   r  r  r  r  ra  r   r  r{  r   epilogue_fusionrd  r>   rv   no_fuse_buffer_namesro  rv  r  r  r[  r\  r  rL  r[   choicesr|  can_fuse_verticalr  can_fuse_horizontal)rR   rD  rE  r  ru  device2shared_data_scores          rS   r|  Scheduler.can_fuse  s    >%e122j'7
 7
 ABu8:PQRR%%''()u8:PQRR%%''()$$&8,-34**,,!!##))12""$qww'C'CC""$qww'C'CC56!!#""$,f> 44UB! $ F Fu T))'--88##.  !	 yy!!$uHH$$&8 &&u4 MII//UVM$$V,>>uL 9900U M""6*>>uLMrV   c                `   UR                  5       n[        X5      n[        [        5      nUR                   Ht  nU R
                  R                  UR                  UR                  5      n[        U[        5      (       a  U R                  XaU5      (       a  Ma  XW   R                  U5        Mv     UR                  R                   H  n[        U[        5      (       d  M  UR                  U R
                  R                  UR                  UR                  5      5      n	U	(       d  Mb  U	 H,  n
U R                  X5      (       d  M  U	R!                  U
5        M.     M     [#        [$        R&                  R)                  UR+                  5       5       Vs/ s H  nUR                  PM     sn5      nX-  (       a	  U" S5        gUR-                  5       nU HT  nU R.                  U   R0                  R3                  5       nXR4                  U   R6                  -  (       d  ML  U" S5          g   gs  snf )z
Check if it is legal to fuse a consumer (node2) into a producer (node1).

We can fuse them if all the reads of node2 either match
corresponding writes in node1, or are written by nodes that can
be scheduled before the fusion of node1 and node2.
zmemory deps did not matchFz(intermediate nodes between node1 & node2T)rd  rC  r   r   r   r,  r  rP   rs   r(   fusable_weak_depr  r   r   r&   fusable_read_and_writer  r   r-  r.  r  r   ra  r}   rH   r[   rU  r   )rR   rD  rE  node1_buf_namesr  remaining_deps_by_namer  rP   cd	remainingrv  remaining_depsnode1_op_namesre  s                 rS   r  Scheduler.can_fuse_verticalN  s     002%7B47H++C((,,SXXsxx@D#w''D,A,A#e,T,T"(//4	 , ##**Bb),,.22%%))"''277;I y#B222::!((, $ + $ %??88*113C 
 +
 +,224"D&&t,88AACG 7 7 @ J JJJ>?	 # /s   H+c                P  ^ UR                   UR                  5       ;  a  gUR                  R                   Vs/ s H!  nUR                   UR                  :X  d  M  UPM#     nn[        U5      S:w  a  gUS   m[        T[        5      (       d   e[        TR                  [        R                  5      (       a  gU R                  UR                     nUR                  R                   Vs/ s H  owR                   U:X  d  M  UPM     nn[        U4S jU 5       5      $ s  snf s  snf )NFr   r   c              3  $  >#    U  H  n[        U[        5      =(       ai    [        UR                  [        R
                  5      (       + =(       a9    UR                  TR                  :H  =(       a    UR                  TR                  :H  v   M     g 7frN   )rs   r&   r   r  r   TMPr  )r  r  writes     rS   r  -Scheduler.fusable_weak_dep.<locals>.<genexpr>  sn      

 '	 tY' ('

DHH==(

ekk)( 		UZZ'( 's   BB)rP   rd  r   r   r  rc   rs   r&   r   r  r   r  r   r   r  )	rR   weak_deprD  rE  r  mutating_writes	real_namer  relevant_readss	       `    rS   r  Scheduler.fusable_weak_dep  s    == 6 6 88 **11
1zzX222 1 	 

 1$"%++++u{{DHH55++H,A,AB	"..44
4T		Y8ND4 	 
  

 '
 
 	
#

s   DD*D#D#c                8   [        U[        5      (       Gab  U R                  R                  UR                  UR                  5      nX2R                  :w  dR  [        UR                  [        R                  5      (       d)  [        UR                  [        R                  5      (       a  g[        R                  (       a:  UR                  UR                  :w  a   UR                  5       nUR                  5       nUR                  UR                  :H  =(       aa    [        UR                  5      [        UR                  5      :  =(       a/    UR                  S [        UR                  5       UR                  :H  $ [        U[        5      (       a  U R                  R                  UR                  UR                  5      nU R                  R                  UR                  UR                  5      nUR                   UR                   :X  a  UR                   b  X4:X  a  ggr   )rs   r&   r,  r  rP   r   r  r   r  r   r  r  r  rc   r  r'   r  )rR   r  r  	read_name
write_names        rS   r   Scheduler.fusable_read_and_write  sh   dI&&--11$))TYYGI ZZ'&tzz488<<&u{{DHH==00T]]enn5T ~~') 

ekk) ?		Nc%**o5?II/EJJ0EJJ>
 g&&--11$))TYYGI..225::uzzJJ		UZZ'JJ*+rV   c                    SnXR                   ;  a6   UR                  5       (       d  UR                  5       nX R                   U'   U$ U R                   U   nU$ ! [         a     N-f = fr  )r  has_unbacked_symbolsnumbytes_hintKeyError)rR   r  ress      rS   r  Scheduler.dep_size_hint  sy    000//11++-C /2&&s+ 
 ,,S1C
   	s   %A 
A&%A&c                B  ^  [        UR                  R                  5      [        UR                  R                  5      -   n[        UR                  R                  5      [        UR                  R                  5      -   n[	        X45      S-  [        X45      :  a  X4:  a  UnUnUnUR                  R                  UR                  R                  -   Vs/ s H9  nXbR                  R                  ;   d  XbR                  R                  ;   d  M7  UPM;     nn[        U 4S jU 5       5      $ UR                  R                  UR                  R                  -  UR                  R                  UR                  R                  -  -  n[        U 4S jU 5       5      $ s  snf )zV
The first term in our fusion score that estimates number of saved
memory operations.
rT  c              3  F   >#    U  H  nTR                  U5      v   M     g 7frN   r  r?  s     rS   r  0Scheduler.score_fusion_memory.<locals>.<genexpr>  s     ?$3t))#..$   !c              3  F   >#    U  H  nTR                  U5      v   M     g 7frN   r  r?  s     rS   r  r    s!     I6Hs4%%c**6Hr  )rc   r   r   r   r  r  r  )	rR   rD  rE  node1_dep_lennode2_dep_lentmpr  r:  common_memory_depss	   `        rS   rv  Scheduler.score_fusion_memory  sa    E--334s5;L;L;S;S7TTE--334s5;L;L;S;S7TT },q03}3TT, !,,22U5F5F5M5MMMC++111S<M<M<T<T5T M   ?$???#//558I8I8P8PP##e&7&7&>&>>
 I6HIIIs   6FFc                   [        U5      S:X  a  U$ 0 nU H  u  p4UR                  5       UR                  5       :X  d   eUR                  5       n[        U R                  U5      R	                  X45      5      nXb;  a  X44/X&'   Mo  X&   R                  X445        M     [        UR                  5       [        R                  " S5      S9S   n[        U5      S:  d   eU$ )Nr   r  r   )
rc   ro  r   r  get_fusion_pair_priorityr  r  rs  operator
itemgetter)rR   rO  "possible_fusions_group_by_priorityrD  rE  ru  fusion_pair_priority&possible_fusions_with_highest_prioritys           rS   rV  4Scheduler.get_possible_fusions_with_highest_priority  s    
  A%##  	+ -LE##%)9)9);;;;%%'F#&  (AA%O$  $MNL2H 3HOON - 25.446H<O<OPQ<R2

2. 9:Q>>>55rV   c                D    [         R                  R                  " U /UQ76 $ )z
Shim for list.sort(key=...)
)r>   r  score_fusionr  s     rS   rW  Scheduler.score_fusion_key  s     yy%%d3U33rV   c                    [        [        R                  R                  5       5      n[	        U R
                  5       H9  nUR                  XR                  5        UR                  UR                  5        M;     g)zW
Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
N)
r   r>   rv   r8  r5  r
  r#  r   r6  r   )rR   r  rF   s      rS   r@  Scheduler.compute_last_usage$  sV    
 0:!'':R:R:T/UTZZ(D 35L5LM&&t7 )rV   c                (   [        U R                  [        R                  R                  -
  [        R                  R
                  R                  -
  5       GH  nXR                  ;   a[  U R                  U   nUR                  5       (       a5  [        R                  R
                  R                  UR                  5        Ml  Mn  U[        R                  R                  ;   d  M  [        R                  R                  U   R                  n[        U[        R                  5      (       a  UR!                  5       (       d   e[        R                  R
                  R                  UR                  5        GM!     U R                  R#                  5         g)z*Free any buffers that are no longer neededN)r  rD  r>   rv   r  rw   freedr}   r   codegen_freerF   r  r   rs   r   r  is_input_bufferclear)rR   rP   r   storages       rS   free_buffersScheduler.free_buffers/  s   %%gg%%&gg""(()
D
 '''&&t,<<>>GG((55chh? "---''..t499!'2==99g>U>U>W>WWW$$11',,?
 	!!'')rV   c                    U R                   R                  5        H  nUR                  5         M     U R                  5         g rN   )r#  r   flushr  )rR   r  s     rS   r  Scheduler.flushA  s.    }}++-GMMO .rV   c                   [        U[        5      (       d   e[        S   S==   S-  ss'   [        R                  " [        SS95         UR                  5         UR                  5         S S S 5        UR                  n[        U[        R                  5      (       d   S[        U5      < 35       eUR                  [        R                  R                  5        U R                  5         g ! , (       d  f       N= f)Ninductorextern_callsr   F)increase_kernel_countztype(node)=)rs   r  r   r>   set_kernel_handlerr#   r  r&  rF   r   r  r]   r  rv   rw   r  )rR   scheduler_noderF   s      rS   codegen_extern_callScheduler.codegen_extern_callF  s    .*CDDDD
 	^,1,!!&u"EF002##% G ""$00B[T$ZM2BB0QWW))* GFs   	!C++
C9c                \   [        UR                  5      (       a  UR                  c
   U S35       e[        R                  R                  U5        [        UR                  5      nUc  [        SUR                   35      e[        5       (       d  UR                  S:X  aa  [        R                  R                  U5      =nR                  S:  a2  [        SUR                   SUR                   SUR                   35      e[        UR                  5      (       a  [        S5      eU" U 5      $ )	Nz( should have been normalized in loweringzUnsupported device type: cuda   zFound z which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 7.0, but your device is of CUDA capability r  zCannot find a working triton installation. Either the package is not installed or it is too old. More information on installing Triton can be found at https://github.com/openai/triton)r;   r]   r  r>   rv   add_device_infor"   r  r   r   r  get_device_propertiesmajorrP   minor)rR   ru  device_schedulingdevice_propss       rS   create_backendScheduler.create_backendU  s=   v{{##v||'?	?X=>	??	'5fkkB$!:6;;-HII||v%%*ZZ%E%Ef%MM\TTWXX"\../  0j  kw  k}  k}  j~  ~  @L  @R  @R  S  T  $$" N  !&&rV   c                    Uc   eXR                   ;  a  U R                  U5      U R                   U'   U R                   U   $ rN   )r#  r   rt  s     rS   r  Scheduler.get_backendn  s@    !!!&$($7$7$?DMM&!}}V$$rV   c                  ^  SU 4S jjnUR                  5        VVs0 s H?  nUR                  c  M  UR                  R                  5         H  nU" U5      U4S _M     MA     nnn[        UR	                  5       5      nU(       aJ  [        U[        R                  " S5      S9u  pg[        R                  R                  R                  U5        g g s  snnf )Nc                   > U TR                   ;  aM  TR                   R                  [        U R                  R                  5       VV s0 s H  u  pX_M	     sn n5        TR                   W    $ s  sn nf rN   )rE  r6  r.  rv   r
  )r
  r/  rR   s     rS   	get_order*Scheduler.enter_context.<locals>.get_orderu  s^    ,,,$$++i>V,W>VdaQT>V,WX''** -Xs   	A.
r   r  )r
  ztorch.fx.Noder   r   )r_  rF   r  r   r%  r  r  r  r>   rv   rw   enter_context)rR   rF   r  r
  r  r  r  lasts   `       rS   r  Scheduler.enter_contextt  s    	+ ^^%
%vv  VV'') q\1t# * % 	 
 w||~&'x':':1'=>GAGG  ..t4 
s
   C1Cc                   ^  U R                   U   R                  n[        U4S jU 5       5      =(       a#    XR                  ;  =(       a    XR
                  ;  $ ! [         a     gf = f)NFc              3  n   >#    U  H*  oR                   =(       d    UR                  5       T;   v   M,     g 7frN   )r  r[   )r  rg   fused_node_namess     rS   r  AScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>  s)     VPUC3C CCPUr  )r}   rK   r  r  r,  r   )rR   rP   r  rK   s     ` rS   $can_buffer_be_removed_through_fusion.Scheduler.can_buffer_be_removed_through_fusion  sj    	$$T*00E VPUVV 41114333	
  		s   A 
A('A(c                n    [        S5         U R                  5       sS S S 5        $ ! , (       d  f       g = f)NzScheduler.codegen)r   _codegenrQ   s    rS   r  Scheduler.codegen  s     -.==? /..s   &
4c                   [         R                  (       a  SS Kn[        R                  " 5       n[        5       n[        U5       H  nUR                  S:X  a0  UR                  UR                  R                  R                  :X  a    OTUR                  UR                  4nXS;  d"   SUR                   SUR                   S35       eUR                  U5        M     S U l        U R                   GH8  n[         R#                  [$        R&                  5      (       a4   [         R)                  SUR+                  5       UR-                  5       5        U R1                  U5        UR3                  5       =n(       Ga  XR                  :w  d*  UR5                  5       (       d  UR7                  5       (       a  U R9                  5         XR                  :w  a  U R                  (       aL  [;        U R                  R<                  5      (       a(  [>        R@                  RB                  RE                  5         Xl        [;        UR<                  5      (       aG  URF                  c   S5       e[>        R@                  RB                  RI                  URF                  5        U RJ                  RM                  URN                  5        UR7                  5       (       a3  URQ                  5       tpiU RS                  U5      RU                  Xi5        GO1UR5                  5       (       a-  [V        RX                  " [Z        U5      nU R]                  U5        OUR_                  5       (       aw  [V        RX                  " [`        U5      nU RS                  U5      n
S	S
K1J2n  S	SK3J4n  [k        XU45      (       a  U
nO[m        S[=        U 5      < 35      eURo                  U5        Oc[k        U[p        [r        45      (       a!  U RS                  U5      Ru                  U5        O'[k        U[v        5      (       d   eURy                  5         [         Rz                  R|                  (       a  U RS                  U5      R                  5         U R                  RM                  UR                  5       5        U R                  RM                  UR                  5       5        [k        U[v        5      (       a  GM  UR3                  5       nUc  GM  U RS                  U5      R                  5       (       d  GM(  U R9                  5         GM;     U R                  (       aL  [;        U R                  R<                  5      (       a(  [>        R@                  RB                  RE                  5         U R9                  5         g ! [.         a/  n[         R)                  SUR+                  5       5         S nAGNS nAff = f)Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0zdevice should have an indexr   )CUDACombinedSchedulingr  ztype(self)=)Er   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackrr  r5  rP   filename_dynamoconvert_frame__file__linenor7  rO  r
  r   r[  r\  r  rL  r[   r-  r   r  ro  r  r  r  r5   r]   r>   rv   rw   codegen_device_guard_exitr  codegen_device_guard_enterrD  r6  r   r_  r  codegen_templater  r  r  r  r  rj   codegen.cuda_combined_schedulingr  r  r  rs   r  codegen_combo_kernelr  r  codegen_noder  r&  tritondebug_sync_kernelcodegen_syncr>  rd  r  ra  ready_to_flush)rR   r   stackr  framer  rF   r  ru  epiloguebackend_r  r  r  s                 rS   r  Scheduler._codegen  s1   44.++-E5D!% JJ"22%--*E*E*N*NN~~u||4 ,U^^,<Aell^ LJ J
  ) #JJD..
IIO224 t$**v*111~~''''))JJL000**/@++000 0 ,,FFH*0'(55%||7V9VV7,,GGU%%,,T__=!!"&.."2  (99$I!!{{#<dC((.""{{#=tD++F3T8h9O(PQQ&G(KDJ=)9::,,T2D#5}"EFF  (55d;!$(>????}}..  (557''..t/D/D/FG%%,,T-E-E-GHd$:;;*%$*:*:6*B*Q*Q*S*SJJLG J #4T5H5H5M5M#N#N GG  ::<

E ! IIP s   3V
W#$WWc                    US   R                  5       nU [        R                  l        X l        Uc   eU R                  U5      nUR                  U5      $ )r  r   )ro  r>   rv   rE   rO  r  benchmark_combo_kernel)rR   r
  ru  r  s       rS   r1   Scheduler.benchmark_combo_kernel  sU     1((* $!!!""6*--i88rV   c                2   [         R                  (       d  gUnUS   R                  5       nUb  UR                  S:X  a  gSSKJn  S/ pe[        U5       H  u  pxUR                  5       n	U R                  U	5      (       a  [        R                  S5         U R                  U	5      u  p[        R                  " U
5      (       a  [        R                  SU5          g	 XZ-  nUR                  U5        M      U R                  U5      u  pnX-
  S:  =(       d    US:  n[        R!                  ["        R$                  5      (       aS  X]:  d  U(       a$  [        R                  S['        X]-  S 5      5        O#[        R                  S[)        X]-  S 5      5        X-
  U:  =(       d    U$ ! U a0  nS
[        U5      ;   a  [        R                  S5         SnA  ge SnAff = f! U a/  nS
[        U5      ;   a  [        R                  S5         SnAge SnAff = f)r  Tr   Nrs  r  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr  zCComboKernel benchmark: return True because of loop-carried variableg333333?z/can fuse (benchmark): fusing causes %sx speedupr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r1  ro  r]   r+  r  r.  r_  r  rK  rL  r  r.  r/  r   r  r[  r\  r  r0   r1   )rR   r
  subkernel_nodesru  r  r  
path1_listr/  r  r
  mspathr  r  	ms2_clone
path2_listsmall_kernels                    rS   rE  !Scheduler.speedup_by_combo_kernel  s   
 ,, #..0 >V[[E1;rZ!/2HA)I ##I..  R55i@::b>>$$U ! " ICd#7 3:
	)-)D)D_)U&CJ ,9c	""7==11yL  E#)C2
   I	#0
 $44M $ *c!f4$$]     	&#a&0  Y 	s=   AF(6G! (G.$GGG!H'$HHHc                r    U R                   U   nUR                  c   eUR                  R                  5       $ rN   )r}   rF   
get_layout)rR   rk  r   s      rS   get_buffer_layoutScheduler.get_buffer_layoutR  s5    x(xx###xx""$$rV   c                    U R                    H  nUR                  5       (       d  M  UR                  R                   H  n[        R
                  R                  R                  UR                  5      nU(       d  M?  [        U5      S:X  d  MP  [        UR                  [        5      (       a  Mq  UR                  5       / :X  d  M  [        R
                  R                  R                  UR                  5        M     M     g rr  )r
  r;   r   r   r>   rv   r8  r  rP   r*   rs   r_   r,   r   zero_dim_cpu_tensor_listr7  )rR   rF   r  r  s       rS   r)  $Scheduler.update_zero_dim_cpu_tensorW  s    JJD{{}} ,,22DWW3377		BF+F3u< *6==:K L L"OO-388<<TYYG 3 rV   )__dep_size_hint_cacher>  r#  rD  r  rO  r5  r   r,  r}   r|   rU  rs  r
  r!  rE  r   )r
  zList[ir.Operation]r   r   )r   z!Dict[str, SchedulerDonatedBuffer]r=  )ru  r>  r   r   r   )r^  r   r   r   )rF   r3  r   rG   r  )r  rG   r   r  )r   r  r
  r  r   zTuple[float, str])r
  r  r   r   rD  rG   rE  rG   r   r   rN   )r  zOptional[int]r   r   rS  )r
  r  r   1List[Tuple[BaseSchedulerNode, BaseSchedulerNode]])rD  rG   rE  rG   r  zTuple[str, ...]r   r   rD  rG   rE  rG   r   r   rk  )r  r(   rD  rG   rE  rG   r   r   )r  r%   r  r&   r   r   )r  r%   r   r   )rO  rF  r   rF  )r
  z+Tuple[BaseSchedulerNode, BaseSchedulerNode]r   r   )r  r  r   r   )ru  rf  r   BaseScheduling)ru  r>  r   rH  )rF   rG   r   r   )rP   r   r  r8  r   r   r
  r  r   zTuple[float, float, str])r
  r  r   r   )rk  r   r   z	ir.Layout)>r^   r   r   r   r   r   r  r*  propertyrO  setterrC  r_  r(  r4  r-  r/  r.  r  r  r0  r9  r  r?  r  r:  r  r9  r  r>  rV  r>  r?  r  r  r  r  rT  r|  r  r  r  r  rv  rV  rW  r@  r  r  r  r   r  r  r  r  r  r1  rE  r>  r)  r   r}  r~  s   @rS   rD   rD     s   ))j
X	# & & ( (7#,"HOSb(#T,	 6T(4#&;B6	808	8N@`
Q$&Q$/@Q$	Q$f','	 'R..`?0 ,0 	:0 d,&,/@,	,\7&7/@7	7r$&$/@$	$69 9 !9 *	9
 
9vF6&F6/@F6	F6P
RMh7&7/@7	7r

(9
BS
	
J D J&J/@J	J<6 Q6	:6@4@4	4	8*$
'2%5$

+:
	
#`D949	!9I5V%
H HrV   c                      \ rS rSr\SS j5       r      SS jr      SS jr      SS jr    SS jr	      SS jr
SS jrSS	 jrSS
 jrSS jr    SS jr      SS jr    SS jrSrg)rH  ie  c                    g)z0Return a set of .codegen.common.BackendFeature()r   r   )r'  ru  s     rS   get_backend_features#BaseScheduling.get_backend_featuresf  s     rV   c                    [         e)z?
Check whether node1 and node2 can be vertically fused or not.
rV  rH  s      rS   r   BaseScheduling.can_fuse_verticalk  
     "!rV   c                    [         e)zA
Check whether node1 and node2 can be horizontally fused or not.
rV  rH  s      rS   r  "BaseScheduling.can_fuse_horizontals  rR  rV   c                    UR                  5       (       d  UR                  5       (       a  [        R                  X5      $ [        R                  X5      $ )z
Fuse two nodes
)r  rj  r  r  rH  s      rS   r  BaseScheduling.fuse{  sC     !1!1!3!3-225@@%**588rV   c                    [         e)zK
Process the iteration sizes in case a transformation needs to be applied.
rV  )rR   r  s     rS   r  BaseScheduling.group_fn  rR  rV   c                    [         e)z
Given a template node, generate a kernel.

This function is only available for triton now. If the third-party backend behaves as a sub-class
of TritonScheduling, it can override it or reuse it.
rV  )rR   template_nodeepilogue_nodess      rS   r#  BaseScheduling.codegen_template  s
     "!rV   c                    [         e)z4
Generate a kernel given a list of pre-fused nodes.
rV  r`  s     rS   r&  BaseScheduling.codegen_node  
     "!rV   c                    [         e)zd
Generate synchronization code for the kernel. This method depends on the hardware characteristics.
rV  rQ   s    rS   r)  BaseScheduling.codegen_sync  r_  rV   c                    g)z}
Check whether the backend is requesting the scheduler to flush the generated kernel.
If not supported, please return False.
Fr   rQ   s    rS   r*  BaseScheduling.ready_to_flush  s    
 rV   c                    [         e)zM
Flush the generated kernel and python wrapper code to the source code file.
rV  rQ   s    rS   r  BaseScheduling.flush  r_  rV   c                    [         e)r  rV  r  s     rS   r  $BaseScheduling.benchmark_fused_nodes  
     "!rV   c                    g)zt
Return an unsigned integer which represents the priority of this fusion pair.
The smaller is with higher priority.
r   r   rH  s      rS   r  'BaseScheduling.get_fusion_pair_priority  s     rV   c                    [         e)z
Benchmark the list of nodes to combine and return the execution time
and memory copy time in milliseconds on randomly generated inputs.
rV  r	  s     rS   r1  %BaseScheduling.benchmark_combo_kernel  rh  rV   r   N)ru  rf  r   zSequence[BackendFeature]rE  rc  )r  r  r   z"Tuple[Tuple[sympy.Expr, ...], ...])rZ  rG   r[  r  r   zOptional[str])rF   z(Union[FusedSchedulerNode, SchedulerNode]r   r   r   r   rD  rG  rI  )r^   r   r   r   rh  rN  r  r  r  r  r#  r&  r)  r*  r  r  r  r1  r   r   rV   rS   rH  rH  e  s     "&"/@"	""&"/@"	"	9&	9/@	9		9"3"	+""(" 4" 
	"""""0"	"&/@	"4"	!"rV   rH  )rX  r   r   r   )rF   rG   rU  r;  r}   zDict[str, SchedulerBuffer]r   r   )r  rG   r   r   )r  rG   rE   rD   r  r  r   r   )r   )r  zList[List[int]]r  r  r  zTuple[int, ...]r   z	List[int])~
__future__r   r  r   r  r-  r\  r.  r  rW  rV  rW  r  r  r   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r  r   torch._inductor.async_compiletorch._dynamo.utilsr   r   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._ordered_setr   torch.utils._sympy.symbolr   r   torch.utils._tritonr   r   r   r   r   r   r    codegen.commonr!   r"   r#   comm_analysisr$   r%   r&   r'   r(   r)   r*   r+   r,   	loop_bodyr-   r7  r.   r/   runtime.runtime_utilsr0   r1   r  r2   utilsr3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   virtualizedr>   	getLoggerr^   r   _logginggetArtifactLoggerrK  r  	dataclassrB   r   rG   rC  ra   r   rT  opsatenconvolutionmmbmmaddmmr  r  r  r  r  r  r  rj  r  r  r  rF  r$  rD   rH  r   rV   rS   <module>r     s   "        	     #       $ 6 M G / ? * 6 6 M M ; : : O O  J 7 &     !^^--hA
NN44XO  ^. ^. ^.B 4_ 4 4D	 D	N
 
,  &K
&K4&K ,&K 
	&KV #()).."<"<**))..,,!IINN00	 W 1 W"5. 5]+% ]+@ " $ 
	,x** x*vx:!3 x:v	?, ?J %'+#++ "+ 	+\ 
 
 
> %??, aH aHH;h" h"rV   