
    a,jG                   f   U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZ
d dlZd dlZd dlmZmZ d dlmZ d dlmZmZ d dlmZmZ d dlZd dlZd dlZd dlmZ d dlmc m Z! d dl"m#Z#m$Z$ d dl%m&Z& d d	l'm(Z( d d
l)m*Z+ d dl,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z> d dl?m@Z@mAZA d dlBmCZCmDZDmEZEmFZFmGZGmHZH d dlImJZJ d dlKmLZL d dlMmNZN dd
lOm*Z* ddlPmQZQ ddlRmSZSmTZTmUZUmVZV ddlWmXZX ddlYmZZZm[Z[m\Z\ ddl]m^Z^ ddl_m`Z` ddlambZb dd lcmdZdmeZemfZfmgZgmhZh dd!limjZjmkZkmlZl erd dlmZnd dloZoe*jp        Zqd"erd#<    ejs        et          Zud$erd%<   ejv        jw        Zwejv        jx        Zxe G d& d'                      Zye G d( d)                      Zze G d* d+                      Z{dd/Z|dd2Z}dd3Z~dd5Z G d6 d7          Zdd8Z e            Zdd;Zdd<Zdd?Z	 	 dddKZddLZddMZddNZddOZddPZddQZddRZddSZddWZdd\Zdd_Z	 	 	 dddiZddoZddrZddtZdduZddvZddyZdddzZdd{Z	 dddZ	 	 dd dZ	 ddd@d@dddZdddddZ ed          ZddZddZddZej        dd            ZddZddZd	dZd
dZddZddZddZddZddZ	 dddZddZddZddZddZddÄZddʄZd dlmZ dd΄ZddτZ	 dddфZddӄZddքZdd؄Z	 dddڜdd܄Z	 	 	 	 	 dddZdS (      )annotationsN)defaultdictdeque)Callable)	dataclassreplace)AnyTYPE_CHECKING)countersis_node_meta_valid)(create_structured_trace_for_min_cut_info)is_with_effects)config)CustomKnapsackSolverCustomRuntimeEstimator)FakeScriptObject)
is_builtin)
LazyStringtrace_structured)	trace_log)extract_tensor_metadata)BackwardState)is_sym_nodepy_sym_types)magic_methodsmethod_to_operator)find_symbol_binding_fx_nodesfree_symbolsis_symbol_binding_fx_nodeoptimization_hintstatically_known_falsestatically_known_true)graph_drawer)
OrderedSet)CheckpointPolicy   )GraphInfoProvider)dp_knapsackdp_knapsack_sliding_hirschberggreedy_knapsackilp_knapsack)KnapsackEvaluator)	AOTOutputSavedForBackwardsAOTOutput#SavedForBackwardsNoVcCheckAOTOutput)_is_functional_graph)is_opaque_node)get_aot_graph_name)_is_bwd_seed_offset_is_fwd_seed_offset
_is_primal_is_tangentget_cuda_generator_meta_val)fx_graph_cseget_aten_targetraise_getitemsboolAOT_PARTITIONER_DEBUGzlogging.Loggerlogc                  n    e Zd ZU dZded<   ded<   ded<   ded<   ded<   ddZddZddZddZddZ	dS )OpTypesz8Class for keeping track of different operator categorieszOrderedSet[Callable[..., Any]]fusible_opscompute_intensive_ops
random_opsview_opsrecomputable_opsnodefx.Nodereturnr;   c                .    t          |          | j        v S N)r9   r@   selfrE   s     ]/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/torch/_functorch/partitioners.py
is_fusiblezOpTypes.is_fusibleh   s    t$$(888    c                .    t          |          | j        v S rI   )r9   rA   rJ   s     rL   is_compute_intensivezOpTypes.is_compute_intensivek   s    t$$(BBBrN   c                .    t          |          | j        v S rI   )r9   rB   rJ   s     rL   	is_randomzOpTypes.is_randomn   s    t$$77rN   c                .    t          |          | j        v S rI   )r9   rC   rJ   s     rL   is_viewzOpTypes.is_viewq   s    t$$55rN   c                .    t          |          | j        v S rI   )r9   rD   rJ   s     rL   is_recomputablezOpTypes.is_recomputablet   s    t$$(===rN   NrE   rF   rG   r;   )
__name__
__module____qualname____doc____annotations__rM   rP   rR   rT   rV    rN   rL   r?   r?   ^   s         BB////9999....,,,,44449 9 9 9C C C C8 8 8 86 6 6 6> > > > > >rN   r?   c                      e Zd ZU ded<   ded<   ded<   ded<   ded<   ded	<   ded
<   ej        dd            ZddZddZddZ	ddZ
dS )NodeInfolist[fx.Node]inputsOrderedSet[fx.Node]_required_fw_nodesrequired_bw_nodestangents_closureunclaimed_nodesdict[fx.Node, int]fw_orderstatic_lifetime_input_nodesrG   c                J     t          d  j        D              fd          S )Nc              3     K   | ]}|V  d S rI   r]   .0ns     rL   	<genexpr>z-NodeInfo.required_fw_nodes.<locals>.<genexpr>   s"      001Q000000rN   c                    j         |          S rI   )rh   )rn   rK   s    rL   <lambda>z,NodeInfo.required_fw_nodes.<locals>.<lambda>   s    a@P rN   key)sortedrc   rK   s   `rL   required_fw_nodeszNodeInfo.required_fw_nodes   s:    00/0006P6P6P6P
 
 
 	
rN   rn   rF   r;   c                    || j         v S rI   )rc   rK   rn   s     rL   is_required_fwzNodeInfo.is_required_fw   s    D+++rN   c                    || j         v S rI   )rd   rx   s     rL   is_required_bwzNodeInfo.is_required_bw   s    D***rN   c                    || j         v S rI   )rf   rx   s     rL   is_unclaimedzNodeInfo.is_unclaimed   s    D(((rN   intc                T    || j         vrt          d| d          | j        |         S )NNode z not in fw nodes!)rc   AssertionErrorrh   rx   s     rL   get_fw_orderzNodeInfo.get_fw_order   s7    D+++ !=!=!=!=>>>}QrN   N)rG   r`   rn   rF   rG   r;   )rn   rF   rG   r~   )rX   rY   rZ   r\   	functoolscached_propertyrv   ry   r{   r}   r   r]   rN   rL   r_   r_   x   s          ++++****))))((((    4444
 
 
 

, , , ,+ + + +) ) ) )           rN   r_   c                  B    e Zd ZU ded<   ded<   ded<   ded<   ded<   dS )MinCutOptionsr;   ban_if_used_far_apartban_if_long_fusible_chainsban_if_materialized_backwardban_if_not_in_allowlistban_if_reductionN)rX   rY   rZ   r\   r]   rN   rL   r   r      sN         $$$$&&&&!!!!rN   r   rE   rF   rG   c                h    | j                             dd           t          j        t          j        fv S )N	recompute)metagetr%   MUST_RECOMPUTEPREFER_RECOMPUTErE   s    rL   must_recomputer      s0    9==d++')0  rN   fx_gfx.GraphModulec                H    | j         j        D ]}t          |          r dS dS )NTF)graphnodesr   r   rE   s     rL   has_recomputable_opsr      s7    
   $ 	44	5rN   c                    | j         j        D ]F}t          |          r5t          |j        d          r t
          j        j        |j        j        v r dS GdS )NtagsTF)	r   r   r   hasattrtargettorchTagnondeterministic_seededr   r   s     rL   has_recomputable_rng_opsr      s^    
   4  	V,,	 	1T[5EEE445rN   r~   c                
   t          | j        d         t          j        t          j        f          rdS t          | j        d         t          j                  s*t          dt          | j        d                              dS )Nvalr&   z.expected node.meta['val'] to be SymFloat, got    )
isinstancer   r   SymIntSymBoolSymFloatr   typer   s    rL   sym_node_sizer      sw    $)E"U\5=$ABB qdi&77 
UT$)EBR=S=SUU
 
 	
 1rN   c                      e Zd ZddZdS )InvalidNodeBaserG   strc                    dS )NzInvalid Noder]   ru   s    rL   __repr__zInvalidNodeBase.__repr__   s    ~rN   N)rG   r   )rX   rY   rZ   r   r]   rN   rL   r   r      s(             rN   r   c                6    t          | j        dd           dk    S )N	namespace_c10d_functional)getattrr   r   s    rL   is_not_collectiver      s    4;T226HHHrN   getitem_nodefx.Node | Nonec                :   | j         t          j        k    rdS | j        d         }| j        d         }t	          |t
          j                  r|j        dk    rdS d|j        vrdS |j        d         }||vrdS ||         }t	          |t
          j                  r|S dS )zGiven a getitem node, check if it extracts from a higher-order op
    that has kwargs mapping the key back to an original input.

    Returns the original input node if found, None otherwise.
    Nr   r&   call_functionkwargs)	r   operatorgetitemargsr   fxNodeopr   )r   	ho_resultrs   r   original_inputs        rL   _get_ho_op_original_inputr      s     h...t!!$I

A
Ci)) Y\_-L-Lty'''th'F
&tC[N."'** 4rN   c                    | j         t          j        j        j        j        t          j        j        j        j        fvrdS | j        d         }t          |t          j
                  sdS t          |          S )zCheck if node is a view/reshape of a higher-order op output that aliases an input.

    Returns the original input node from the higher-order op's kwargs if the pattern
    matches, None otherwise.
    Nr   )r   r   opsatenviewdefaultreshaper   r   r   r   r   )rE   sources     rL   _is_copy_node_bw_onlyr      sa     {59>.6	8N8VWWWtYq\Ffbg&& t$V,,,rN   envdict[fx.Node, Any]c                    t          |           }|'||v r#t          ||         t                    s||         S t          |           }|'||v r#t          ||         t                    s||         S dS )a  Try to find a valid input replacement for an invalid forward output.

    This handles cases where a forward output depends on backward nodes but
    semantically aliases an input. For example, a view of a getitem from a
    triton kernel that mutates a buffer in backward, or a direct getitem from
    such a higher-order op. The original input may be a primal or a valid
    intermediate node already present in the forward graph.
    N)r   r   r   r   )rE   r   r   s      rL   _find_input_for_invalid_outputr      s     +400N"c!!3~.@@ " >"".t44N"c!!3~.@@ " >""4rN   Fjoint_graphfx.Graphra   r`   outputsoutputs_descslist[AOTOutput]subgraph
str | Noneignore_must_be_in_fw_bwc                   t          j                    }i |D ]-}|                    |j                  }|j        |_        ||<   .| j        D ]}|sHt          |          r|dk    r||vrt          |<   )t          |          r|dk    r||vrt          |<   M|v rR|j	        dk    rt          |<   h|j	        dk    r`t          j        |j        i |j        }	fd|	D             }	t          |	          rt          |<   |                    |fd          |<   |j	        dk    r|                    |fd          |<   |j	        d	k    r	 	g }
t!          ||          D ]{\  }}t#          |t           j                  rE|vrt'          d
| d          t#          |         t(                    rd}|j        t,          j        j        j        j        u rt          |          rt7          |j                  dk    rmt#          |j        d         t           j                  rH|j        d         v r9t#          |j        d                  t(                    s|j        d                  }|t9          |          }||
                    |           6t=          d
| d          |
                    |                    f|
                    |           }|                    tA          |
                    }||j        d<   d |
D             |j        d<   |!                                 |"                                 |S )a  
    Given a graph, extracts out a subgraph that takes the specified nodes as
    inputs and returns the specified outputs.

    This includes specifying non-placeholder nodes as inputs.

    The general strategy is to initialize all inputs with proxies as we
    encounter them, and trace through the graph, only keeping values which take
    in valid proxies. Then, all dead code is eliminated.
    backwardforwardplaceholderr   c                z    g | ]7}t          |t          j                  t          |         t                    8S r]   )r   r   r   r   )rm   xr   s     rL   
<listcomp>z6_extract_graph_with_inputs_outputs.<locals>.<listcomp>Q  sI       a))3q6?33  rN   c                    |          S rI   r]   r   r   s    rL   rq   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>Z      CF rN   get_attrc                    |          S rI   r]   r   s    rL   rq   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>]  r   rN   outputr   z couldn't be found in envNr&   r   z was invalid, but is outputdescc                z    g | ]8}t          |t          j                  r|j                            d           nd9S )stack_traceN)r   r   r   r   r   )rm   vs     rL   r   z6_extract_graph_with_inputs_outputs.<locals>.<listcomp>  sL     ' ' ' &027%;%;E

=!!!' ' 'rN   output_stack_traces)#r   Graphr   namer   r   _must_be_in_backwardInvalidNode_must_be_in_forwardr   pytreearg_tree_leavesr   r   any	node_copyzipr   r   RuntimeErrorr   r   r   r   r   copy_r   lenr   appendr   r   tupleeliminate_dead_codelint)r   ra   r   r   r   r   	new_graphrE   new_nodeall_argsoutput_valuesr   x_descreplacementoutr   s                  @rL   "_extract_graph_with_inputs_outputsr    s   $ 

I"$C   ((33	D		! ) )& 	$T**
**&&'D	 $D))	))&&'D	3;; W%%#CIIW''-tyHDKHHH   !  H
 8}} 'D	!++D2B2B2B2BCCCIIW
""!++D2B2B2B2BCCCIIW  M-00 $ $	6a!! 	$||"#G1#G#G#GHHH#a&/22 M # H	 4 <<<,Q// =AFq(("16!9bg66 )q	S((&s16!9~GG ) #&afQi.K &"@C"H"HK*!((555$%KQ%K%K%KLLL  Q((((  ####


5//
0
0C$CHV' '' ' 'CH"#
 !!###NNrN   c                    t           j        oWt          | j        t          j        j                  rt          | j                   p| j        t          j        j	        j
        k    S rI   )r   is_non_builtin_to_includer   r   r   _ops
OpOverloadr   r   higher_order triton_kernel_wrapper_functionalr   s    rL   r  r    sP    + 	DK!6	7	7	W
4;@W@W<W 	R;%)0QQrN   c                r    | j         dk    o,t          | j                            d          t                    S )Nr   r   )r   r   r   r   r   r   s    rL   _is_backward_stater    s,    7m#W
49==3G3G(W(WWrN   c                @    | j                             dd           dk    S )Npartitioner_tagis_backwardr   r   r   s    rL   _has_tag_is_backwardr    s    9==*D11]BBrN   c                @    | j                             dd           dk    S )Nr  
is_forwardr  r   s    rL   _has_tag_is_forwardr    s    9==*D11\AArN   c                @    | j                             dd           dk    S )Nr  must_be_in_forwardr  r   s    rL   _has_tag_must_be_in_forwardr    s    9==*D115IIIrN   c                @    | j                             dd           dk    S )Nr  must_be_in_backwardr  r   s    rL   _has_tag_must_be_in_backwardr    s    9==*D115JJJrN   c                    t          |           rdS t          | j        t          j        j                  o| j        j        j        }t          |            ot          |            o|S NT)
r  r   r   r   r  r  _schema
is_mutabler  r  rE   r  s     rL   r   r     sr    "4(( t 	4;
 566 	+K* 
 !&&& 	,T222	rN   c                    t          |           rdS t          | j        t          j        j                  o| j        j        j        }t          |           o|S r  )	r  r   r   r   r  r  r  r  r  r  s     rL   r   r     sU    #D)) t4;
 566 	+K*   %%4*4rN   joint_modulenum_fwd_outputsEtuple[list[fx.Node], list[fx.Node], list[AOTOutput], list[AOTOutput]]c          	        t          j        d | j                            d          D              }t          j        t	          t          | j                            d                              j                            dd gt          |          z                      }|d |         }||d          }|d |         }||d          }||||fS )Nc              3  $   K   | ]}|j         V  d S rI   r   rm   rE   s     rL   ro   z+_extract_fwd_bwd_outputs.<locals>.<genexpr>  s$      	K	K$)	K	K	K	K	K	KrN   r   r   r   )	r   r   r   
find_nodesnextiterr   r   r   )r   r!  r   r   fwd_outputsbwd_outputsfwd_outputs_descsbwd_outputs_descss           rL   _extract_fwd_bwd_outputsr/    s     $	K	K 2 = = = J J	K	K	KG *T,$//8/<<==>>CGGTFS\\)	
 	
 M
 *?*+K/**+K%&6&67%o&6&67%68IIIrN   saved_valuesr   r   Nonec                V    | D ]%}|j         |k    r|                     |            d S &d S rI   )r   remove)r0  r   saved_values      rL   _remove_by_namer5    sI    #  t##,,,EE $ rN   fwd_module_outputs#list[fx.Node] | tuple[fx.Node, ...]c                    t          |           }t          t          |           dz
  dd          D ]}t          | |                   s|dz   } n|S )Nr&   )r   ranger   )r6  idxis      rL   find_first_sym_noder=    sk      
!
!C3)**Q.B77  -a011 	a%CE	 JrN         @-q=r   torch.fx.Graphtorch.fx.Nodemaxfloatminpositionc           	     
   |                      |          5  |                     t          j        j        j        j        |f          }t          j        j        j                            |j        d                   |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j        j	        j        |dgdf          }t          j        j        j	                            |j        d         dgd          |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j
        j        j        |t          j        f          }t          j        j
        j                            |j        d         t          j                  |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j        j        j        ||f          }t          j        j        j                            |j        d         |          |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j        j        j        |f          }	t          j        j        j                            |j        d                   |	j        d<   t          |	j        d                   |	j        d<   d d d            n# 1 swxY w Y   |                      |	          5  |                     t          j        j        j        j        |	|f          }
t          j        j        j                            |	j        d         |          |
j        d<   t          |
j        d                   |
j        d<   d d d            n# 1 swxY w Y   |                      |
          5  |                     t          j        j
        j        j        |
t          j        fd| d|j                   }t          j        j
        j                            |
j        d         t          j                  |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |S )	Nr%  r   tensor_metar9  Tfp8_scale_pos__r   r   )inserting_afterr   r   r   r   absr   r   r   amaxprimsconvert_element_typefloat64	clamp_min
reciprocalmulTensorfloat32r   )r   rE   rB  rD  rE  abs_node	amax_nodeamax_64_nodeclamp_min_nodereciprocal_nodemul_node
scale_nodes               rL   calculate_quantization_scalingr]    s    
		t	$	$ U U&&IN& ' 
 
  %y~199$)E:JKKe'>x}U?S'T'Tm$U U U U U U U U U U U U U U U 
		x	(	( W W''IN'RD$' ( 
 
	 !&	 3 ; ;M% 2$!
 !
	u )@	u@U(V(V	}%W W W W W W W W W W W W W W W 
		y	)	) 

 

**IO08U]+ + 
 
 $)9?#G#O#ON5!5=$
 $
%  ,Ce$,
 ,
-(

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
		|	,	, 

 

,,IN$,$ - 
 
 &+Y^%=%E%Ee$c&
 &
E" .E&.
 .
M*

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
		~	.	. 

 

--IN%- " . 
 
 ',in&?&G&G&'
 '
U# /F '/
 /
]+

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
			/	/ U U&&IN%!3' ' 
 
  %y~188 ' 
  
e (?x}U?S'T'Tm$U U U U U U U U U U U U U U U 
		x	(	( 	Y 	Y((IO08EM*8(88TY88 ) 
 


 "'!E!M!MM% %-"
 "

 *AQVAW)X)X
&	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y s   BB22B69B6BE44E8;E8B&II
I
&BLLL$BO  OO BQ>>RRB2UU #U r\  
quant_typetorch.dtyperQ  	clamp_maxc           	        |                      |          5  |                     t          j        j        j        j        |t          j        f          }t          j        j        j                            |j        d         t          j                  |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j
        j        j        ||f          }t          j        j
        j                            |j        d         |j        d                   |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                      |          5  |                     t          j        j
        j        j        ||f          }	t          j        j
        j                            |j        d         |          |	j        d<   t          |	j        d                   |	j        d<   d d d            n# 1 swxY w Y   |                      |	          5  |                     t          j        j
        j        j        |	|f          }
t          j        j
        j                            |	j        d         |          |
j        d<   t          |
j        d                   |
j        d<   d d d            n# 1 swxY w Y   |                      |
          5  |                     t          j        j        j        j        |
|fd| d|j                   }t          j        j        j                            |
j        d         |          |j        d<   t          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |S )Nr%  r   rG  fp8_quant_pos_rI  rJ  )rK  r   r   r   rN  rO  r   rU  r   r   r   rS  rT  rQ  r`  r   )r   rE   r\  r^  rQ  r`  rE  target_node_32scaled_target_nodeclamp_min_scaled_nodeclamp_max_scaled_nodequant_activation_nodes               rL   perform_quantizationrh  4  s    
		z	*	* 

 

,,IO08& - 
 
 &+Y_%I%Q%QIeem&
 &
E" .E&.
 .
M*

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
		~	.	. 

 

"00IN% *- 1 
 
 */);)B)B&
(>*
 *
& 2I#E*2
 2
.

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
		1	2	2 

 

 % 3 3IN$,$i0 !4 !
 !
 -2IN,D,L,L#E*I-
 -
"5) 5L!&u-5
 5
"=1

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
		4	5	5 

 

 % 3 3IN$,'3 !4 !
 !
 -2IN,D,L,L!&u-y-
 -
"5) 5L!&u-5
 5
"=1

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
		4	5	5 
 
 % 3 3IO08'48(88TY88 !4 !
 !
 IO088%*51:  	"5)
 5L!&u-5
 5
"=1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ! s^   B&CCC(BFFF1BIII/BLLL-BOOOtensortorch.Tensorc                b    |                                  }|                                 }||z  dz  S )z
    Calculate the size of a PyTorch tensor in megabytes (MB).

    Args:
        tensor (torch.Tensor): Input tensor

    Returns:
        float: Memory size in MB
    i   )numelelement_size)ri  num_elementsrm  s      rL   calculate_tensor_sizero  z  s2     <<>>L&&((L<'K88rN   list[torch.dtype]c                     t           j        j        j        d                             dd          } d |                     d          D             } | S )N!activation_quantization_aten_passallowed_dtypesztorch.bfloat16c                j    g | ]0}t          t          |                    d           d                   1S ).r9  )r   r   split)rm   dtypes     rL   r   z&get_allowed_dtypes.<locals>.<listcomp>  s@       16u{{3''+,,  rN   ;)r   	_inductorr   post_grad_fusion_optionsr   rv  )rs  s    rL   get_allowed_dtypesr{    s_    _+D+	c
,--  :H:N:Ns:S:S  N rN   c                <   t                      }t          |           r| j        d         j        |vrdS t          j        j        j        d                             dd          }t          | j        d                   }t          j        j        j        d                             dd          s||k    S t          j        j        j        d                             dd          r't          ||k              pt          ||k               S t          ||k              S )Nr   Frr  
size_in_mbd   skip_dynamo_guardsquantize_dynamic_shape)r{  r   r   rw  r   ry  r   rz  r   ro  r"   r!   )rE   rs  size_thresholdr}  s       rL   should_quantizer    s   '))Nd## ty'7'=^'S'Su_+D+	c,  'ty'788J?!:+	c
&&G ^++ ?!:/

#&
.
.	G )n,  J+J.,HIIIJ
 )~)EFFFrN   c                     t           j        j        j        d                             dd          } t          t           |                     d          d                   S )Nrr  r^  ztorch.float8_e5m2ru  r9  )r   ry  r   rz  r   r   rv  )r^  s    rL   get_quant_typer    sO    '@+	c,+,,  5***3//3444rN   rw  tuple[float, float]c                F    t          j        |           }|j        |j        fS )z
    Calculate the range of values for a given torch.dtype.
    Args:
        dtype (torch.dtype): The input dtype.
    Returns:
        tuple: A tuple containing the minimum and maximum values.
    )r   finforD  rB  )rw  infos     rL   calculate_ranger    s"     ;uD8TXrN   c           
        |                      d          d         }|j        d         }t                      }t          |          \  }}t	                      g }g }t          |          D ]\  }	}
|	|k     r|
j                            dd          rbt          j	        j
        j        d                             dd          rct          | |
|d	|	          }t          | |
|||||	          }t          |          s|                    |           n|                    |           n|                     |
          5  |                     t          j        j        j        j        |
|fd
|	 d|
j                   }t          j        j        j                            |
j        d         |          |j        d<   t/          |j        d                   |j        d<   d d d            n# 1 swxY w Y   ||	<   fdt          |          D             }t1          |          }||z   }|r|d |         |z   ||d          z   }|                    dt5          |                     t6          d         dxx         dz  cc<   d S )Nr   r'  r   saved_for_quantizationFrr  use_scalingTr?  rb  rI  rJ  r   rG  c                B    g | ]\  }}                     ||          S r]   )r   )rm   r<  rE   position_to_quants      rL   r   z*quantize_activation_fw.<locals>.<listcomp>  s;       +21da&&  rN   inductor%activation_quantization_fwd_aten_passr&   )r(  r   r  r  dict	enumerater   r   r   ry  r   rz  r]  rh  r   r   rK  r   r   rN  rO  r   r   r   r=  
update_argr   r   )r   r!  r   r+  r^  rQ  r`  tensor_scale_nodessym_scale_nodesrE  rE   r\  
quant_nodeoutput_updated_argsr;  scale_nodesr  s                   @rL   quantize_activation_fwr    s   **1-F+a.K!!J*:66Iy(*%'O#K00 -5 -5$ o%%9==1599 #	5%>3c-&& <4E8 

 24ZIx 
 #:.. 7&--j9999#**:6666 **400  !&!4!4	<D"J/DhDDDD "5 " "J 	<DD Ie,j  OE*
 6M".6 6JOM2               +5h'   6?6L6L  
 1
2
2C$6K 
%36I#$$6OO 	 a233444Z@AAAQFAAAAAs   =BG''G+	.G+	c           
     
	  	 d | j         D             }d }|D ]O}|j                            dd          r0|j                            d           |j                            d          }t          j        j        j        d                             dd          r|                     |          5  d|j	        
                    dd	          z   	t          	fd
|D                       }d d d            n# 1 swxY w Y   |                     |          5  |                     t          j        j        j        j        ||f          }t          j        j        j                            |j        d         |          |j        d<   t#          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                     |          5  |                     t          j        j        j        j        ||f          }t          j        j        j                            |j        d         |j        d                   |j        d<   t#          |j        d                   |j        d<   d d d            n# 1 swxY w Y   |                     |          5  |                     t          j        j        j        j        ||f          }t          j        j        j                            |j        d         |          |j        d<   t#          |j        d                   |j        d<   d d d            n# 1 swxY w Y   n|                     |          5  |                     t          j        j        j        j        ||fdt+          |j	                  z             }t          j        j        j                            |j        d         |          |j        d<   t#          |j        d                   |j        d<   d d d            n# 1 swxY w Y   t-          |j                                                  D ]$}||k    r||k    r|                    ||           %Qt4          d         dxx         dz  cc<   d S )Nc                (    g | ]}|j         d k    |S )r   r'  r&  s     rL   r   z*quantize_activation_bw.<locals>.<listcomp>  s$    JJJ$M1I1I1I1I1IrN   r  Fdequant_typerr  r  
fp8_scale_
fp8_quant_ c              3  2   K   | ]}|j         k    |V  d S rI   r   )rm   	bwd_input
scale_names     rL   ro   z)quantize_activation_bw.<locals>.<genexpr>  s<       & &%$>Z77 "7777& &rN   r%  r   rG  dequant_rJ  r  %activation_quantization_bwd_aten_passr&   )r   r   r   popr   ry  r   rz  rK  r   r   r)  r   r   rN  rO  r   r   r   divrT  r   listuserskeysreplace_input_withr   )
r   	bw_inputsactivation_noderE   r  r\  divided_target_node_32dequant_nodeuserr  s
            @rL   quantize_activation_bwr    s   JJ%+JJJIO H@ H@9==1599 G	@IMM23339==88L%>3c-''? **400  !-	0A0A,PR0S0S!SJ!% & & & &)2& & & " "J               **:66  &+&9&9	<D"L1 ': ' 'O
 	<DD Ie,l  $(/
 ;R',U3; ;O(7               **?;; 
 
-2-@-@	*1-z: .A . .* :?9K9R9R',U3Z_U5K: :*/6 00F0KE0RSS +/>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 **+ABB  #(#6#6	<D4lC $7 $ $L
 	<DD27>  !%e,
 8O$)%08 8L%m4               **400  #(#6#6	<D"L1'#di..8 $7 $ $L 	<DD Ie,l  !%e,
 8O$)%08 8L%m4               TZ__..// @ @<''DO,C,C++D,???Z@AAAQFAAAAAs^   0:C66C:	=C:	BF44F8	;F8	BI==J	J	BL;;L?	L?	B(PP	P	
fwd_module
bwd_modulebwd_module_inputsdict[str, fx.Node]c                    t          dd  fd           t           j        |           t          dd  fd           t          dd fd            j                            d	
          d         j        d         }|D ]}d|j        v r|t          j        dd|j                           }j                            |          5  j        	                    |j                  }d d d            n# 1 swxY w Y   |j
        d         }|j
                            |j
                   d|j
        d<   ||j
        d<   |                    |           j                            |           t          j        j        j        d                             dd          rt'          j                            d
                    }	|	d         }
t)          |	          D ]}t+          |          s|}
 n j                            d	
          d         j        d         }|D ]~}d|j        v rsj                            |
          5  j        	                    |j                  }d d d            n# 1 swxY w Y   |j
                            |j
                   |}
t-          j                   t          dd fd           d S )Nartifactc                     dddS )N,before_activation_quantization_fwd_aten_passstringr   encodingr]   r]   rN   rL   rq   z5perform_fp8_activation_quantization.<locals>.<lambda>d      B 
 
 rN   c                 4                          ddd          S NFTprint_outputinclude_strideinclude_deviceprint_readabler  s   rL   rq   z5perform_fp8_activation_quantization.<locals>.<lambda>h  #    :44tD 5 
 
 rN   metadata_fn
payload_fnc                     dddS )N+after_activation_quantization_fwd_aten_passr  r  r]   r]   rN   rL   rq   z5perform_fp8_activation_quantization.<locals>.<lambda>q      A 
 
 rN   c                 4                          ddd          S r  r  r  s   rL   rq   z5perform_fp8_activation_quantization.<locals>.<lambda>u  r  rN   c                     dddS )N,before_activation_quantization_bwd_aten_passr  r  r]   r]   rN   rL   rq   z5perform_fp8_activation_quantization.<locals>.<lambda>|  r  rN   c                 4                          ddd          S r  r  r  s   rL   rq   z5perform_fp8_activation_quantization.<locals>.<lambda>  r  rN   r   r'  r   r  z^fp8_quant_pos_\d+_r  r  r  Tr  rr  r  r   r9  r  c                     dddS )N+after_activation_quantization_bwd_aten_passr  r  r]   r]   rN   rL   rq   z5perform_fp8_activation_quantization.<locals>.<lambda>  r  rN   c                 4                          ddd          S r  r  r  s   rL   rq   z5perform_fp8_activation_quantization.<locals>.<lambda>  r  rN   )r   r  r   r(  r   r   resubrK  r   r   updatereplace_all_uses_with
erase_noder   ry  r   rz  r   r  reversedr6   r  )r  r  r  r!  quant_fwd_module_outputsfwd_noder  quant_bwd_inputr  quant_bwd_module_inputsbwd_input_locbw_inputscaled_fwd_module_outputsscale_bwd_inputs   ``            rL   #perform_fp8_activation_quantizationr  \  s    
 

 
 
 
	 	 	 	 :+_===
 

 
 
 
	 	 	 	 
 

 
 
 
	 	 	 	  */::h:GGJOPQR, 3 38=(()-r8=AAI !11)<< S S","2">">HM">"R"RS S S S S S S S S S S S S S S$>.9L ''666=AO !9:3?O 0++O<<<''	2226+	c-0 #'z'7'B'Bm'B'T'T"U"U/3 !899 	 	Hx((  ( %/$4$?$?8$?$L$LQ$O$TUV$W!1 	0 	0Hx},,%55mDD W W&0&6&B&B&B&V&VOW W W W W W W W W W W W W W W$++HM::: /:+,,,
 

 
 
 
	 	 	 	 	 	s$   !C;;C?	C?	!I<<J 	J 	ri   OrderedSet[fx.Node] | Nonec                   |rd |D             ng }d | D             }t           j        j        j        d                             dd          rd | D             }|j                            d          d	         j        d	         }d
 |j                            d          D             }d}	|D ]}
|
j        |v rt          |
          r|
j        |v r!t                              d|
j                   Dd|
j        d<   |
j        d         j        |
j        d<   d||
j                 j        d<   |
j        d         j        ||
j                 j        d<   d}	|	rt          ||||           d S d S )Nc                    g | ]	}|j         
S r]   r  r&  s     rL   r   z2enable_activation_quantization.<locals>.<listcomp>  s    ;;;t;;;rN   c                    i | ]
}|j         |S r]   r  r&  s     rL   
<dictcomp>z2enable_activation_quantization.<locals>.<dictcomp>  s    CCCd$)TCCCrN   rr  exclude_primalsFc                0    i | ]}d |j         v|j         |S )primalsr  r&  s     rL   r  z2enable_activation_quantization.<locals>.<dictcomp>  s/     
 
 
 $	8R8RDIt8R8R8RrN   r   r'  r   c                    i | ]
}|j         |S r]   r  r&  s     rL   r  z2enable_activation_quantization.<locals>.<dictcomp>  s)        	4  rN   r   z*Skipping quantization of static input %s: Tr  r   r  )r   ry  r   rz  r   r   r(  r   r   r  r=   debugr   rw  r  )r0  r  r  ri   r!  static_input_namessaved_values_namesr6  r  should_perform_fp8_quantrE   s              rL   enable_activation_quantizationr    s    '	;;:;;;; 
 DClCCC6+	c
U##

 
(4
 
 
 $)444AA!DI!L $.$4$?$?=$?$Q$Q    %" 
, 
,9***t/D/D*y...		F	RRR26DI./(,	%(8(>DIn%JNdi(-.FG@D	%@P@Vdi(-n='+$ 
+
$5	
 	
 	
 	
 	

 
rN   )ri   r   omit_aot_autograd_runtimesaved_sym_nodessaved_opaque_nodeslist[fx.Node] | Noner  %tuple[fx.GraphModule, fx.GraphModule]c                 # t          | |          \  }}	}
}| j                            d          }g t          t          |          }|rg ng t          t
          |          }g t          t          |          }g t          t          |          }g t          t          |          }|g }t          | j        ||z   z   |z   |z   |	|d|          }t          j                                        }|                    d          D ]}|j        s@t          |j                   t          ||j                   t          ||j                   I|r^t!          d |j        D                       r@t          |j                   t          ||j                   t          ||j                   t          |          r&t          |j                   |st#          d          t%                      }g }g }|D ]S}t'          |          }|r+|                    |           |                    |           >|                    |           Tt-          | j                  }t/          j        ||          D ]c}d	|j        vrt5          |j        d	                   |z
  }t7          |d
           D ]"}||vr|                    ||                    #||z  }d|                                 |                    ||z              |sg }g }g }D ]}t=          |j                            d	          t@                    r|                    |           E|j                            dd          r|                    |           v|                    |                                                                ||z              tC          |          #tE                    D ]K\  }}|#k    r@|j                            dd          s%t#          d| d# dtC                               Lt          | j        ||z   |z   |z   |z   |z   |
#fdtG          tC                    tC          |          z   tC          |          z   tC          |          z             D             z   d|          } t          | j        |z   |z   |z   |z   |z   |z   |	|d|          }n}t          | j        ||z   |z   |z   |
d tG          tC                    tC          |          z             D             z   d|          } t          | j        |z   |z   |z   |	|d|          }tH          j%        &                    | |           }!tH          j%        &                    | |          }"tN          j(                            dd          	 tS          |!|"||           |!|"fS )aj  Extract forward and backward graph modules from a joint graph.

    Args:
        ignore_must_be_in_fw_bw: When True, disables forward/backward placement
            enforcement in _extract_graph_with_inputs_outputs. Needed when the
            joint_module is not an original fwd+bwd joint graph (e.g. a backward
            graph being re-partitioned for dI/dW splitting).
        omit_aot_autograd_runtime: When True, skips postprocessing that is
            only needed when the resulting modules will be wrapped in a custom
            autograd.Function (the AOTAutograd path). This includes: tangent input
            handling, version-counter check sorting of saved tensors, opaque object
            (FakeScriptObject) separation, and fp8 activation quantization. Set this
            to True when the fwd/bwd modules will be executed directly without autograd.
    r!  r   r'  Nr   )r   c              3     K   | ]>}|j         t          j        j        j        j        u ot          |j                  d k    V  ?dS r   N)r   r   r   r   wait_tensorr   r   r  rl   s     rL   ro   z+_extract_fwd_bwd_modules.<locals>.<genexpr>"  s_       )
 )
  H	2>FF "AG!)
 )
 )
 )
 )
 )
rN   z'backward_state_inputs must not be emptyr   c                    | j         S rI   r  )ss    rL   rq   z*_extract_fwd_bwd_modules.<locals>.<lambda>J  s    16 rN   rr   saved_tensor_with_no_vc_checkFzi=z, no_vc_check_start_idx=z, len(saved_values)=c                ~    g | ]9}|k    r"|t                    k     rt          |          nt          |          :S r]   )r   r/   r.   )rm   r<  no_vc_check_start_idxr0  s     rL   r   z,_extract_fwd_bwd_modules.<locals>.<listcomp>  sa     
 
 
  ---!c,6G6G2G2G 4A666/22
 
 
rN   r   c                ,    g | ]}t          |          S r]   )r.   rm   r<  s     rL   r   z,_extract_fwd_bwd_modules.<locals>.<listcomp>  s0        +1--  rN   rr  )*r/  r   r(  filterr5   r6   r4   r3   r  r  r   distributedis_availabler  r5  r   allr   r$   r   addr   r   	itertoolschainr   r   rt   clearextendr   r   r   r   r  r:  r   _lazy_graph_module_make_graph_moduleinductor_configrz  r  )$r   r0  r  r  r!  ri   r   r  r+  r,  r-  r.  placeholdersprimal_inputstangent_inputsfwd_seed_offset_inputsbwd_seed_offset_inputsbackward_state_inputs	bwd_graphdistributed_enabledrE   saved_symbolssaved_sym_nodes_bindingsaved_sym_nodes_derivedsymbolsymbol_bindingsnew_symbolsr  saved_values_with_vc_checksaved_values_no_vc_checksaved_opaque_objectsr<  	fwd_graphr  r  r  s$    `                                 @rL   _extract_fwd_bwd_modulesr"    s   4 	!OOO CK/1B  %00M0BBL7fZ667M'Q-Qvk</P/P-Q  Jv&9<HHIIv&9<HHIGf%7FFG!2
	
	 	 !		!
 	 7  I  +88::$$$66 P Pz 	PL$)444OTY777.	::::
 ! 	PS )
 )
 Z)
 )
 )
 &
 &
 	P
 L$)444OTY777.	::::%% 	PL$)444( P$%NOOO /9llM     1 1*400 	1f%%%#**40000#**40000 3<3EFFO 7~VV % %	!!"49U#344}D)9)9::: 	? 	?A ''#**?1+=>>>>$
 25LLMMM$ b
 &("#% !  	8 	8D$)--..0@AA 8$++D1111>FF 8(//5555*11$777769QQRRR #$> ? ? !.. 	 	GAt)))y}}%DeLL (vQvv8Mvvcfgsctctvv   722"# !! 	
 
 
 
 
 
 %%.//0,--. /**+ 	
 
 
 $;+
 
 
	. 7"# !! 	
 %% $$ $;
 
 
		  722,&8 s<0033G3GGHH  
 $;
 
 
	 7$% $$ $;

 

 

	 &99,	RRJ&99,	RRJ044/	
 	
 	
 	''	
 	
 	
 z!!rN   )static_lifetime_input_indicesri   _joint_inputsr	   r#  list[int] | Nonec                  g }d}| j         j        D ]1}t          |          st          |          st	          |          r|}2|t          d          | j         j        D ],}t          |          s|                    |           ||u r n-t          d |D                       t          |           }t          |           }	|rRt          | j                   d         't          j        d           t          | |||          S t          | d          } t           j        st%          |            t'          |            t)          |            |g }t+          | ||          }
g }g }g }t,          j                                        d d}d d}d fd}| j         j        D ]x}|j        vr|j        dk    r&|j        d |                                 D             v r>|j        t,          j        j        j        j         t,          j        j!        j"        j         t,          j        j!        j#        j         t,          j        j!        j$        j         t,          j        j!        j$        j%        fv rtM          |          r|                    |            ||          r|j'        (                    d          tR          j*        k    r<tW          |          r|                    |           n|                    |           _ ||          rX|rt          d| d|j                   tW          |          r|                    |           n|                    |           tW          |          r|                    |            ||          s|j        dk    rt          d| d          fd|j,        D             }t[          d |D                       r|.                    |           Tt_          |          s|                    |           zta          tb          2                    |          3                                          }ta          tb          2                    |          3                                          }ta          tb          2                    |          3                                          }t           j4        rti          | j         |          }||
j5        }tm          | |||||          \  }}|j         7                    tp                     |j         7                    tp                     |r3|	r"ts          | ||tu          |                    \  }}tw          |          }t           j<        rddl=m<}  |||||           t}          |          }t}          |          }t          |d          }tu          |
j@                  dk    rt          |d          }||fS )!a  
    Partitions the :attr:`joint_module` in a manner that closely resembles the
    behavior observed in the original ``.forward()`` and ``.backward()`` of the
    callable, i.e., the resulting forward graph contains those operators that
    are executed in the original ``.forward()`` callable passed to
    :func:`aot_function`.

    The default partitioner collects the operators that are between the forward
    inputs and the forward outputs. This helps in finding the tensors which have
    to be stashed for the backward pass. These stashed tensors become the output
    of the generated forward graph. The remaining operators are then placed in
    the backward graph.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    Nzlast_node must not be Nonec              3  :   K   | ]}|j         d k    |j        V  dS r   Nr   r   r&  s     rL   ro   z$default_partition.<locals>.<genexpr>  s9       $ $tw(/B/B	/B/B/B/B$ $rN   r   zxTrying to unsafely apply AC to a non-functional graph with the default partitioner. Falling back to min-cut partitioner.)r!  r#  Tis_default_partitionrE   rF   rG   r;   c                    d| j         v p6t          | j                             d          t          j        j                  S )NrG  r   )r   r   r   r   _subclasses
FakeTensorr   s    rL   	is_tensorz$default_partition.<locals>.is_tensor#  s;    	) 
ZIMM%  %"3">.
 .
 	
rN   c                n    t          d | j        D                       ot          | j                  dk    S )Nc              3  @   K   | ]}|j         t          j        k    V  d S rI   )r   r   r   rm   r  s     rL   ro   z=default_partition.<locals>.is_multi_output.<locals>.<genexpr>*  s,      GGDx//GGGGGGrN   r   )r  r  r   r   s    rL   is_multi_outputz*default_partition.<locals>.is_multi_output(  s8    GGDJGGGGG $DJ!#	
rN   c                    |                      d          o-| j        dvo$ p!| j        t          j        j        j        j        uS )NF)impure_random)r   r   )	is_impurer   r   r   r   r   r  r   )rE   r  s    rL   r6  z$default_partition.<locals>.is_impure.  sW     NNN// 		 (' U;ei&@&L&TT	
rN   r   c              3      K   | ]	\  }}|V  
d S rI   r]   )rm   kr   s      rL   ro   z$default_partition.<locals>.<genexpr>C  s7       3
 3
!QA3
 3
 3
 3
 3
 3
rN   r   z.Trying to apply AC on a graph with impure op: z, r   z	Expected z to be a tensorc                &    g | ]}|j         v|S r]   r  )rm   rn   forward_node_namess     rL   r   z%default_partition.<locals>.<listcomp>o  s&    UUUAFBT4T4T14T4T4TrN   c              3  4   K   | ]}t          |          V  d S rI   r   rl   s     rL   ro   z$default_partition.<locals>.<genexpr>p  s(      77!{1~~777777rN   r  r  r!  ri   )is_impure_noder&   enable_activation_offloadingFr  rW   )Ar   r   r  r5   r4   r   r6   r   r$   r   r   r0   warningswarn#min_cut_rematerialization_partitioncleanup_recompute_tagsr   (unsafe_allow_optimization_of_collectivesforce_save_collectivesforce_save_effectful_opsforce_save_bw_mutation_srcclassify_nodesr   r  r  r   r   named_modulesr   r   r   _assert_scalarr   profiler_record_function_enter_new_record_function_enter_record_function_exit_RecordFunctionr   r   r   r%   	MUST_SAVEr1   r  r  r  r   r  r  fromkeysr  _sync_decision_cross_ranksri   r"  r   r   functionalize_rng_opsr   #reordering_to_mimic_autograd_enginer@  ,_activation_offloading.activation_offloadingr:   thread_graphsafe_rng_from_hopsrd   )r   r$  r!  r#  ri   forward_nodes	last_noderE   graph_has_recomputable_opsgraph_has_recomputable_rng_ops	node_infor0  r  r  r/  r3  r6  backward_usages	fw_module	bw_moduler@  r  r:  s                        @@rL   default_partitionra    s   @ MI"(  t$$ 	
4(8(8 	<OPT<U<U 	I9:::"(  4   	'  &&&9E # $ $+$ $ $   "6l!C!C%=l%K%K"! W 233A6B ML   7 /.K	    .lQUVVV: -|,,,\***|,,,$,(*%3_ I LO+88::
 
 
 


 
 
 

 
 
 
 
 
$ "( <& <&9...7j  TY 3
 3
&44663
 3
 3
 &
 &
 ;IN)1 I9AI5=I4<I4D	
 	
 	
 t 	 ""4(((?4   	9==%%)9)CCCd## *"))$////##D)))9T?? 		) $ZTZZT[ZZ   d## *"))$////##D)))$ 	%%d+++y 	D47o#=#= !BT!B!B!BCCCUUUUdjUUU7777777 
	 ""?333d## 	&%%%l3388::;;L4==99>>@@AAOdmm,>??DDFFGG( T1,2DlSS"*&/&K#3'-'$?  Iy O''7H'IIIO''7H'III! C) 	#8iC4H4H$ $ Iy 8	BB	 * 

	
 	
 	
 	
 	
 	
 	%$'		
 	
 	
 y))Iy))I.yeLLLI
9&''!++29$OOO	irN   g    .Arl  c                    | |j         z  S rI   )itemsize)rl  rw  s     rL   _tensor_nbytesrd    s    5>!!rN   c                j   ddd| j         v r| j         d         }t          |t                    rdS t          |t          t          f          rt          fd|D                       S t          |t                    r-t          fd	|                                D                       S t          |t          j	                  r |          S t          d
t          |           d|            | j        dk    s"| j        t          j        j        j        j        u rdS t          d|  d          )Nr   objectrG   r~   c                    t          | t          j                  sdS t          t	          |                                 d          | j                  S )Nr      fallback)r   r   rT  rd  r    rl  rw  r   s    rL   object_nbytesz_size_of.<locals>.object_nbytes  sC    !U\** 	1/		DIII17SSSrN   r   r&   c              3  .   K   | ]} |          V  d S rI   r]   )rm   rn   rl  s     rL   ro   z_size_of.<locals>.<genexpr>  s-      55A}}Q''555555rN   c              3  4   K   | ]\  }} |          V  d S rI   r]   )rm   rI  rn   rl  s      rL   ro   z_size_of.<locals>.<genexpr>  s1      @@DAq}}Q''@@@@@@rN   zUnknown metadata type z	 on node r   r   r   zO didn't have `val` metadata; we should always have `val` metadata on the nodes.)r   rf  rG   r~   )r   r   r   r  r   sumr  itemsr   rT  r   r   r   r   r   r   rL  r   )rE   r   rl  s     @rL   _size_ofrq    sV   T T T T
 	ic<(( 
	&1 dE]++ 	&5555555555T"" 	&@@@@CIIKK@@@@@@U\** 	& =%%%NDIINNNNOOOw*uy~/L/T T Tq
eeee  rN   c           	     .   ddl m}  |t                    }| j        D ]'}|j        dk    r||j        j        xx         dz  cc<   (t                              dt          |
                                t          j        d          d                     d S )Nr   )r   r   r&   %sTrs   reverse)collectionsr   r~   r   r   r   rX   r=   r  rt   rp  r   
itemgetter)r   r   cntrE   s       rL   
_count_opsry    s    ''''''%+c**C + +7o%%$%%%*%%%HHT6#))++8+>q+A+A4PPPQQQQQrN   !list[torch._ops.OpOverloadPacket]c                 v   g } t          t          j        j                  D ]}t	          t          j        j        |          }t          |t          j        j                  sA|                                D ]A}t	          ||          }t          j	        j
        |j        v r|                     |            nB| S rI   )dirr   r   r   r   r   r  OpOverloadPacket	overloadsr   	pointwiser   r   )r   	attr_nameopoverloadpacketoverloadop_overloads        rL   pointwise_opsr    s    -/C(( 
 
	"59>9==*EJ,GHH 	(2244 	 	H!"2H==Ky"k&666

+,,, 7
 JrN   r   tuple[Any, ...]	depth_maprg   list[tuple[fx.Node, int]]c                    fd| D             }t          |                                t          j        d          d          S )Nc                j    i | ]/}t          |t          j        j        j                  &||         0S r]   )r   r   r   rE   r   )rm   argr  s     rL   r  zsort_depths.<locals>.<dictcomp>  sE        #z#ux}?Q/R/RYs^  rN   r&   Trt  )rt   rp  r   rw  )r   r  
arg_depthss    ` rL   sort_depthsr    sY       '+  J *""$$(*=a*@*@$OOOOrN   gmc                4  	
 t          j                    
i 	| j                            d          D ]}
                    |	fd          	|<   d t          | j        j                  D             d	
fd	}t          t          t          | j        j                            }d
}t          j        }|D ]"}|j        D ]}|         |k     r
|         }|}#|| S t          | j        j                  d
|                  D ]:}|j        dk    r-|j        t          j        j        j        j        u r ||           ;t          | j        j                  |         d
         D ]} ||           t          j                             | 
          }|S )a  
    This pass finds the first bwd node in the graph (by looking at users of
    tangents) and then reorders the graph by walking from this node to all the
    way to the end of the graph. At each op in this traversal, we insert this op
    in a new graph and try to bring only the relevant subgraph from the other
    non-bwd edges relevant for this op. This closely mimics the behavior of
    autograd engine.

    Why is this pass required in the first place?

    This is an artifact of how partitioners work today. The starting point of
    partitioner is a joint graph, which is fwd and then bwd graph. In the case
    of checkpointing, we keep portions of fwd graph in their original place in
    the joint graph, while obtaining a bwd graph. As a result, the resulting bwd
    graph has copies of recomputed fwd subgraphs followed by the original bwd
    graph. If we run this naively, this leads to bad memory footprint, because
    the fwd subgraphs are live for way longer duration than necessary. This pass
    reorders the operations such that we prioritize the ops for the original bwd
    graph while only realizing those ops from the fwd graph that are necessary
    at any given point in the graph.
    r   r'  c                    |          S rI   r]   r   s    rL   rq   z5reordering_to_mimic_autograd_engine.<locals>.<lambda>  s    A rN   c                    i | ]\  }}||	S r]   r]   rm   r;  rE   s      rL   r  z7reordering_to_mimic_autograd_engine.<locals>.<dictcomp>  s    BBB93T3BBBrN   rE   rF   rG   r1  c                X   | g}t                      }t          |          dk    rO|                                } | |v s| v r0|                    |            || j        z  }t          |          dk    Ot          |fd          }|D ]}                     | fd          | <   d S )Nr   c                    |          S rI   r]   )rn   orders    rL   rq   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>+  s    %( rN   rr   c                    |          S rI   r]   r   s    rL   rq   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>-  r   rN   )r$   r   r  r  all_input_nodesrt   r   )rE   	cur_nodesinsertable_nodesr   r   r  s      rL   insert_node_in_graphzAreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph  s    F	0:)nnq  ==??D'''43;;  &&& --I )nnq   ""28J8J8J8JKKK$ 	D 	DD!++D2B2B2B2BCCCII	D 	DrN   Nr   rE   rF   rG   r1  )r   r   r   r(  r   r  r   r  r  r6   mathinfr  r   r   r   r   r   r   r   GraphModule)r  rE   r  r  first_node_in_bwdminimum_ordertangentr  new_gmr   r   r  s            @@@rL   rV  rV    s   . 

I"$C ##}#55 @ @''.>.>.>.>??D		BB	"(.(A(ABBBED D D D D D D D& &bhn==>>NHM! ) )M 	) 	)DT{]** %d$(!	)  	 RX^$$%?u->'?%?@ ' '7o%%$+9M9U*U*U  &&&RX^$$U+<%=%?%?@ # #T"""" X!!"i00FMrN   r_  torch.fx.GraphModuler`  fw_nodebw_nodedevicetorch.device	rng_countlast_fwd_inputlast_bwd_input#tuple[torch.fx.Node, torch.fx.Node]c                   |j         }|t          d          | j        }	|j        }
t          j        j        j        }| j                            |          5  | j                            d|           }t          |          |j
        d<   |}ddd           n# 1 swxY w Y   |j                            |          5  |j                            d|           }t          |          |j
        d<   |}ddd           n# 1 swxY w Y   t          |j                  }||d<   | j                            |          5  |	                    d||j        g|j        R |          }ddd           n# 1 swxY w Y   |                    |           |	                    |           t          |j                  }||d<   |
                    |          5  |
                    d||j        g|j        R |          }|                    |           |
                    |           ddd           n# 1 swxY w Y   ||fS )	a%  
    Note [CUDA Graph Safe RNG Functionalization]

    CUDA Graph capture doesn't work with get_rng_state and set_rng_state because these functions operate on CPU values,
    while CUDA Graph RNG capture uses on-device CUDA tensors. To solve this, we use graphsafe_set_state with a
    CUDA Generator registered to the CUDA Graph before capture begins. graphsafe_set_state updates the generator's pointer
    to reference a different GeneratorImpl, ensuring subsequent calls are correctly forwarded to the desired generator
    (and its cuda-tensor RNG state during graph capture).

    For each RNG operation's forward/backward pair:

    - We create two generators initialized with identical values
    - Each forward and backward call advances its respective generator equally
    - This keeps generators synchronized so forward and backward operations use matching RNG values

    When forward is called multiple times before backward (causing desynchronization):

    - We save the forward RNG state
    - We update the backward Generator's state before executing backward

    Before each CUDA Graph replay, replay_prologue updates captured RNG pointers with current states, ensuring backward Generator
    changes are reflected during replay.

    This function modifies both forward and backward computation graphs by:

    Creating RNG state placeholders for both passes
    Updating the forward node to use graph-safe RNG state
    Updating the backward node to use graph-safe RNG state

    For more details: https://github.com/pytorch/pytorch/issues/113541
    Nzdevice_idx must not be Nonefwd_rng_state_r   bwd_rng_state_	rng_stater   r   r   )indexr   r   r   _prims	rng_primsgraphsafe_run_with_rng_staterK  r   r7   r   r  r   create_noder   r   r  r  inserting_before)r_  r`  r  r  r  r  r  r  
device_idxfw_graphbw_graphr  fwd_rng_statebwd_rng_state	fw_kwargsfunctional_fw_node
bwd_kwargs
rng_outputs                     rL   %apply_graphsafe_rng_functionalizationr  K  s^   R J:;;;HH#(<#9#V  
	(	(	8	8 ' '!334PY4P4PQQ$?
$K$K5!&' ' ' ' ' ' ' ' ' ' ' ' ' ' ' 
	(	(	8	8 ' '!334PY4P4PQQ$?
$K$K5!&	' ' ' ' ' ' ' ' ' ' ' ' ' ' ' W^$$I*Ik		(	(	1	1 
 
%11(.07<00	 2 
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 !!"4555    gn%%J+J{		"	"7	+	+ % %))(.07<00	 * 
 

 	%%j111G$$$% % % % % % % % % % % % % % % >))sI   7BB!B?7DD	D (E44E8;E8AH55H9<H9num_sym_nodesc                6  ' t          j                    }dd}d d	'd!d} ||           } ||          } ||          }	i }
| j        j        D ]}t	          |          rtt          |j        d          r_t          j        j	        |j        j
        v rB|j        |vs	|j        |	vrV||j                 }||j                 }|	|j                 }||d|
|<   t          j        j        j        }t          j        j        j        }d }|j                            d          D ]}d|j        v r|} n|t#          d          g }t%          t'          |j                            d                              }t%          t'          |j                            d                              }t)          'fd|
                                D                       }|                    t          j        d                     t1          |          dk    }t          j        j        }t4          j        o| o|j         p|j        j        }t?          |
                                          D ]V\  }}|d         }|d         } '|          }|j        }|j        }|r'|%|j         dk    rtC          ||||||||          \  }}X|"                    |          5  |#                    d||j        g|j$        R |j%                  }|#                    dtL          j'        |dfi           } ||          |j(        d<   |#                    dtL          j'        |dfi           } tS          j)        |j(                  | _(        |*                    |            |+                    |           |,                    |           d d d            n# 1 swxY w Y   |"                    |          5  dt%          |           }!|-                    |!          }" ||          |"j(        d<   d d d            n# 1 swxY w Y   |"                    |          5  |#                    d||"|j        g|j$        R |j%                  } |*                    |            |+                    |           d d d            n# 1 swxY w Y   X|rt%          t]          |j                            d                              }#|#j$        d         }$t1          |$          |z
  }%|$d |%         t_          |          z   |$|%d          z   }&|j        0                    |&           |j        +                    |#           |1                                 |1                                 ||fS )"Ngmodr   rG   r  c                    i }| j         j        D ]I}|j        dk    r<t          |j        d          r't
          j        j        |j        j        v r
|||j	        <   J|S )Nr   r   )
r   r   r   r   r   r   r   r   r   r   )r  random_nodesrE   s      rL   get_rng_opsz*functionalize_rng_ops.<locals>.get_rng_ops  sd    +-J$ 	/ 	/D?**DK00 +I59III*.TY'rN   rE   rF   torch.device | Nonec                    d| j         vrdS | j         d         }t          |t                    s|f}|D ]5}t          |t          j                  r|j        j        dk    r	|j        c S 6t          j        d          S )zV
        Check the example value of the node outputs to find the device type.
        r   Ncudacpu)r   r   r   r   rT  r  r   )rE   
candidates	candidates      rL   
get_devicez)functionalize_rng_ops.<locals>.get_device  s     	!!4Yu%
*e,, 	'$J# 	, 	,I)U\22 ,#(F22$++++|E"""rN   r  rj  c                \   ddl m}  |            }|t          d          |5  | H| j        dk    r=|                    t
          j                                                  cd d d            S |                    t          j                              cd d d            S # 1 swxY w Y   d S )Nr   )detect_fake_modezfake_mode must not be Noner  )torch._guardsr  r   r   from_tensorr   r  get_rng_state)r  r  	fake_modes      rL   get_sample_rng_statez3functionalize_rng_ops.<locals>.get_sample_rng_state  s2   222222$$&&	 !=>>> 	@ 	@!fkV&;&; ,,UZ-E-E-G-GHH	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ (()<)>)>??	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@s   >B!/%B!!B%(B%r   )fwdbwdr   r'  r  zaCouldn't find tangent node in graph inputs. This is unexpected, please file a bug if you see thisc              3  :   K   | ]} |d                    V  dS )r  Nr]   )rm   	node_pairr  s     rL   ro   z(functionalize_rng_ops.<locals>.<genexpr>  sC        )2

9U#$$     rN   r  r&   r  r  r  r   r  r   r   rng_state_output_r   )r  r   rG   r  )rE   rF   rG   r  )r  r  rG   rj  )2r	  countr   r   r   r   r   r   r   r   r   r   r  r  run_and_save_rng_staterun_with_rng_stater(  r   r)  r  r$   valuesdiscardr  r   ry  r   graphsafe_rng_functionalizationfallback_randomtest_configs*graphsafe_rng_func_ignores_fallback_randomr  r   r  r  r  r   r   r   r   r   copyr  r  r   r   r*  r   r   	recompile)(r   r_  r`  r  uidr  r  joint_graph_rng_opsfw_graph_rng_opsbw_graph_rng_opsrecomputable_rng_ops_maprE   	base_noder  r  run_and_save_rngr  bw_tangent_start_nodefw_rng_state_outputsr  r  devicesmulti_cuda_devices
ind_config'use_rng_graphsafe_rng_functionalizationr  r  r  r  r  r  stater  
state_namebw_rng_state_nodefw_output_node
fw_outputssym_node_start_idxr   r  s(                                          @rL   rU  rU    sf   2 /

C	 	 	 	# # # #$	@ 	@ 	@ 	@ &+l33"{9--"{9--!"( S S4  	SV,,	S 	1T[5EEE
 y 000DIEU4U4U+DI6I&ty1G&ty1G:A'2R2R$Y/|-D/B **m*<<  	!!$(!E " $o
 
 	
 +-(9?#=#=#=#O#OPPQQN(9?#=#=#=#O#OPPQQN    6N6U6U6W6W    G OOEL''((( W) 'J. 	
""	
 ** R&Q , !**B*I*I*K*K L L P- P-	9E"E"G$$?? 4G	-"v%%-R	. 	.*NNN **733 !3 !3%-%9%9#$     #> &: 	& 	&" !,,#$,a0	 -   %9$8$@$@
5!%11#$*  2  
 #')GL"9"9
--j999##G,,,$++E222C!3 !3 !3 !3 !3 !3 !3 !3 !3 !3 !3 !3 !3 !3 !3H **+@AA M M<c<<
$,$8$8$D$D!0D0DV0L0L!&u-M M M M M M M M M M M M M M M
 **733 - -%11#& * ! 
 #> 2 
 

 --j999##G,,,- - - - - - - - - - - - - - -&  
3d9?#=#=#=#J#JKKLL#(+
 __}<***+())*+,,-. 	
 	w'''"">222is8   C'OO	O	(;P//P3	6P3	AR33R7	:R7	c                    | j         j        D ]Y}t          |j        t          j        j                  r3|j        j        dk    r#t          |          st          j
        |j        d<   ZdS )z
    By default, the partitioner is not allowed to recompute collectives
    unless they come from a user-annotated AC region.
    See Note [Recomputing collectives in the partitioner]
    r   r   N)r   r   r   r   r   r  r  r   r   r%   rR  r   )r   rE   s     rL   rG  rG    st     "( @ @t{EJ$9::	@%);;;"4(( < &6%?DIk"@ @rN   c                    dfd| j         j        D ]:}t          |          r)t          |          st	          |          s |           ;dS )a\  
    Force save outputs from with_effects nodes wrapping effectful ops.

    Effectful ops (registered via _register_effectful_op) should not be recomputed
    because they may have arbitrary global side effects (I/O, RNG state, collectives,
    etc.). We mark the tensor outputs of with_effects as MUST_SAVE to prevent
    recomputation of the effectful op.

    The with_effects node returns a tuple (token, result). We recursively find all
    leaf outputs extracted via getitem and mark them as MUST_SAVE. Since these are
    saved, the with_effects op doesn't need to be recomputed in backward.
    rE   rF   rG   r1  c                    | j         D ]h}|j        t          j        u rS |           t	          |j                            d          t          t          f          st          j
        |j        d<   id S )Nr   r   )r  r   r   r   r   r   r   r   r  r%   rR  )rE   r  mark_getitem_outputss     rL   r  z6force_save_effectful_ops.<locals>.mark_getitem_outputs  sx    J 	H 	HD{h...$$T***!$)--"6"6FF H-=-GDIk*		H 	HrN   Nr  )r   r   r   r   r  )r   rE   r  s     @rL   rH  rH    s    H H H H H H "( ' 'D!!	'"4((	' )..	'
 ! &&&' 'rN   c                   t                      }t          | j        j                  D ]}|j        dk    r|j        t          j        j        j	        j
        u }|rmt          |          r |                    |j        d                    t          |          r.|j        d         |v rt          j        |j        d         j        d<    d S d S )Nr   r   r&   r   )r$   r  r   r   r   r   r   r   r   r   r   r  r  r   r  r%   rR  r   )r   has_mutation_in_bwrE   is_copy_s       rL   rI  rI    s     5?LL+122  7h;%)."6">> 	+D11 5"&&ty|444*400 LTYq\EW5W5W1A1K	!!+. EE! rN   c                    | j         t          j        k    rdS | j        d         }t	          |          t
          j        urt          dt	          |                     d|j        vo
| j	        dk    S )NFr   z#expected parent to be fx.Node, got rG  r   )
r   r   r   r   r   r   r   r   r   r   )rE   parents     rL   is_getitem_of_multi_outputr     sk    {h&&&uYq\FF||27""Q4<<QQRRR+J?0JJrN   r+  c               X   | j         j        D ]}t          |          r|j        D ]S}t          |          rBd|j        v r9d|j        v r0|j        d         |j        d         k    rt
          j        |j        d<   T|j                            dd          r2t          d |j        D                       st
          j        |j        d<   d|j        vrWt          d |j        D                       r9t          |          rd|j
        d         j        v s|rt
          j        |j        d<   | S )a  
    If there are two consecutive checkpointed blocks with no operator in
    between, we would still want to stash the tensor at the boundary of
    checkpointed blocks. The following pass makes the last output node
    non-recomputable to allow for that.
    ac_graph_idr   has_backward_hookFc              3  4   K   | ]}t          |          V  d S rI   r   r2  s     rL   ro   z)cleanup_recompute_tags.<locals>.<genexpr>  sC       E E)-t$$E E E E E ErN   c              3  4   K   | ]}t          |          V  d S rI   r  r2  s     rL   ro   z)cleanup_recompute_tags.<locals>.<genexpr>  s*      @@TN4((@@@@@@rN   r   )r   r   r   r  r   r%   rR  r   r   r   r   )r   r+  rE   r  s       rL   rE  rE    so    "( +@ +@$ *	@
 H H"4((H%22%22	-049]3KKK-=-GDIk*y}}0%88 D E E15E E E B B D& *:)C	+&**@@TZ@@@@@ + +400	 + 6CdiPQlFW5W5W$ 6X &6%?DIk"rN   r]  min_cut_optionsdont_ban)tuple[list[fx.Node], OrderedSet[fx.Node]]c                   <=>?@ABCDEFGHIJK t                      t                      It          rZt          d | j        D                       }|t          d Ij        D                       z
  }t
                              d|           dbd	=dbd
>db=>IfdA	 dd lGn"# t          $ r}t          d          |d }~ww xY wdcAIfdCddCIfd}dcAfdBdeBIfd} Gj
                    Ht                      <dfdg<HIfd}	| j        D ]}
|
j        dk    r|
j        v ra|
j        vr,H                    |
j        dz   dt           j        d           n,H                    |
j        d z   dt           j        d!           yt%          |
          r,H                    |
j        d z   dt           j        d"           t'          |
          r |	|
d#           nt)          |
          r |	|
d$            ||
          }                    |
          r|r |	|
|           d%|
j        vod&|
j        vp.d%|
j        v o%t/          |
j        d%         t0          j                   }t5          |
          rt7          t9          |
                    }d }nm|rWt/          |
j                            d%          t<          t>          f          stA          |
          rd'}d }n#t           j        }d(}n ||
j!                  \  }}|rH|t           j        k    s|tD          k    r-H                    |
j        d z   |
j        dz   |d)|            n(H                    |
j        d z   |
j        dz   |*           |
j#        D ]5}H                    |
j        dz   |j        d z   t           j        d+           6dhAfd0}j$        rj%        D ]}fd1|j#        D             }fd2|j#        D             }tM          |          dk    r ||tO          |                    }tQ          |j#                  D ]}                    |          rz)                    |          |k    ra A||          rU|<v rAt
                              d3|)                    |          ||)                    |                      |	|           j*        rt                      }| j        D ]w}                    |          s)                    |          |fg})                    |          }tM          |          dk    rtW          j,        |          \  }}||v r0|-                    |           )                    |          |d4z   k    rctM          |          dk    rPt
                              d5||)                    |          )                    |                      |	|           nm|j#        D ]Q}                    |          r: A||          r.|<vr*tW          j.        |)                    |          |f           RtM          |          dk    y	  Gj/        Hd6d          \  }}n# Gj0        $ r}tc          td          j3                  }d }d @| j4        } 	 | r| 5                    d7d8d89          ntm          |           @to          d:d; @fd<=           tq          d>d?          }ts          |d@          5 }!|!:                    @           d d d            n# 1 swxY w Y   n# tv          $ r}dA| dB}Y d }~nd }~ww xY wdC<                    Gj=        j>        ?                    H                    ?to          d:dD ?fdE=           t          H          }"|"r
i }#d6g}$didG}%|"D ]!\  }&}'}(|$A                    |'           |&d6k    r5 |%|'          })|#B                    |)g           A                    |(           W|'dk    r5 |%|&          })|#B                    |)g           A                    |(            |%|&           |%|'          k    r5 |%|&          })|#B                    |)g           A                    |(            |%|&          }* |%|'          }+|#B                    |+g           A                    dH|*            #g },|#C                                D ];\  }-}.|,A                    dI|- dJ           |.D ]}/|,A                    dK|/            <dC<                    |,          }0dL<                    |$          }1t          H          \  }2JJrto          d:dM JfdN=           |rdO| dCnd}3|2r	|3dP|2 dCz  }3d}4|rdQ}4t          dR|0 dS|1 dC|3 |4           |t
                              dT           t
                              dUt          GHfdV                     t          H            d }~wtv          $ rW t
                              dT           t
                              dUt          GHfdW                     t          H            w xY w|\  }5Ft                      }6HfdX|5D             D ]'\  K}7|6F                    FKfdY|7D                        (t                      }8|6D ]_\  }9}:|9d dZ         |:d d[         k    r%t          d\|9d dZ          d]|:d d[                    |9d dZ         }-|8-                    |-           `t          |           Dd^ t          | j                  D             Et          Dfd_|8D             Efd`a          };|;<fS )jNc              3     K   | ]=}|j         d k    t          |j        d          "t          |j        j                  V  >dS )r   _overloadpacketN)r   r   r   r   r  r&  s     rL   ro   z solve_min_cut.<locals>.<genexpr>  sZ       &
 &
w/))gdkCT.U.U) +,,))))&
 &
rN   c              3  4   K   | ]}t          |          V  d S rI   )r   r  s     rL   ro   z solve_min_cut.<locals>.<genexpr>"  s9       4
 4
CFF4
 4
 4
 4
 4
 4
rN   z&Ops banned from re-materialization: %sarF   brG   r;   c                   |j         t          j        j        j        k    rdS |j        d         }t          j        j                            |          \  }}|D ]2}|j	        |         }| |u r dS t          |t                    r| |v r dS 3dS NFr   T)r   r   r   r  auto_functionalizedr   _higher_order_opsauto_functionalizeget_mutable_argsr   r   r  )r  r  
mutable_opmutable_arg_namesrI  r   r  s          rL   !can_fuse_into_auto_functionalizedz8solve_min_cut.<locals>.can_fuse_into_auto_functionalized'  s    8uy-AAA5VAY
 #6GG
 
	

 & 	  	 D(4.CCxxtt#t$$  8844urN   c                    |j         t          j        j        j        k    rdS |j        d         }|D ]/}|j        d         }|t          d          ||         }| |u r dS 0dS )NFtensors_to_cloner   zkwargs must not be NoneT)r   r   r   r  r	  r   r   )r  r  r  r   r   r  s         rL   .can_fuse_into_triton_kernel_wrapper_functionalzEsolve_min_cut.<locals>.can_fuse_into_triton_kernel_wrapper_functional;  s|    8uy-NNN5H%78% 	 	D(8,F~$%>???,CCxxtt urN   c                H   t          |          t          j        k    rdS  | |          rdS  | |          rdS | j        t          j        u r*| j        d         j        t          j        j	        j
        u rdS                     |           o                    |          S )NTr   F)r9   r   catr   r   r   r   r   r   r  r	  rM   )r  r  r  r  op_typess     rL   rM   z!solve_min_cut.<locals>.is_fusibleH  s     1))4,,Q22 	499!Q?? 	4H(((q	 y%FG G
 5""1%%@(*=*=a*@*@@rN   r   zANeed networkx installed to perform smart recomputation heuristicsrE   c                z                        |           rdS t          | g          }t          |          dk    r|                                }|j        D ]P}                    |          s ||          s dS                      |          r|                    |           Qt          |          dk    dS r  )rT   r$   r   r  r  ry   r  )rE   r  curr  rM   r]  r  s       rL   is_materialized_backwardsz0solve_min_cut.<locals>.is_materialized_backwardsb  s    D!! 	5v&&	)nnq  --//C	 ( ( //55  jjd>S>S  44##D)) (MM$''' )nnq   urN   r   c                @   | j         dk    rdS | j        t          j        u rdS | j                            dd          t          j        k    rdS t          j	        r
                    |           rdS | j        t          j        j        t          j        j        fv rdS j        r                    |           sdS n?                    |           rdS                     |           rdS t'          |           rdS j        r; |           r0t*                              d	| t/          | j                             d
S | j        dk     r| j        t          j        k    rdS j        r8t9          d | j        D                       }t=          |           }|dz  |k     rdS dS )zRReturns reason string if node should be banned from recomputation, None otherwise.r   Nr   zmarked MUST_SAVEznot in recomputable allowlistz	random opzcompute intensive opznon-builtin opzmaterialized backwards: %s %szmaterialized in backwardi  ztoo far from backwardc              3  h   K   | ]-}t          |t          j                  t          |          V  .d S rI   )r   r   r   rq  r  s     rL   ro   zBsolve_min_cut.<locals>.should_ban_recomputation.<locals>.<genexpr>  sM       % % !*Q2H2H%% % % % % %rN   r   zreduction op)r   r   r   r   r   r   r%   rR  r   recompute_viewsrT   r   lift_fresh_copyr   
lift_freshr   rV   rR   rP   r  r   r=   r  r   r  dist_from_bwmax_dist_from_bwr   ro  r   rq  )rE   input_tensors_sizeoutput_sizer!  r  r  s      rL   should_ban_recomputationz/solve_min_cut.<locals>.should_ban_recomputationp  s   7o%%4;(***49==d++/?/III%%! 	h&6&6t&<&< 	4;4/79PQQQ42 		(++D11 7667 !!$'' #"{,,T22 .--(.. ('' 7 	.<U<U=
 =
 	. II5tU4:=N=NOOO-- $$):V=T)T)T** + 	&!$ % %%)Y% % % " " #4..KQ!333%~trN   c                d      j         dk    rdS t           fd j        D                        S )Nr   Tc              3  0   K   | ]} |          V  d S rI   r]   )rm   r  rM   rE   s     rL   ro   z9solve_min_cut.<locals>.is_materialized.<locals>.<genexpr>  s/      EE$zz$--EEEEEErN   )r   r  r  )rE   rM   s   `rL   is_materializedz&solve_min_cut.<locals>.is_materialized  sA    7m##4EEEEE$*EEEEEEErN   ri   rb   tuple[float, str | None]c           
        t           j        r| |v rdS t          |           }t           j        r#                    |           rt
          j        dfS t          | j        d         t                    r.t          | j        d         t          j                  s	t          dfS t          |dt          t          | j        d          d          z  z            } |           r|dfS |d	z  dfS )
zReturns (weight, cannot_save_reason).

        cannot_save_reason is None for finite weights, or a string explaining
        why the node cannot be saved for infinite weights.
        r  zview op (recompute_views=True)r   z$SymFloat (non-SymInt symbolic value)g?r~  r&   N   )r    treat_parameters_as_free_to_saverq  r$  rT   r  r  r   r   r   r   r   INT_INFr~   rB  rD  r'  )rE   ri   mem_szr.  r  s      rL   get_node_weightz&solve_min_cut.<locals>.get_node_weight  s     3	3337$! 	>h&6&6t&<&< 	> 8===di&55 	Gdi.== G FFF cST%6!<!<a@@@A
 
 ?4   	$4<A:t##rN   r  reasonr   c                                        |           rdS | v rDt          | j        t          j        j                  o| j        j        dk    }t          j        s|sdS t          |           rdS d| j
        v r't          | j
        d         t          j                  rdS                     |                                d| j        dz   t          j        |rd| nd           d	S )
NFr   r   r   _inzcannot recompute: zcannot recomputecapacityr6  T)rT   r   r   r   r  r  r   r   rF  r   r   r   r  add_edger   r  r  )rE   r6  is_collectivebanned_nodesr  nx_graphr  s      rL   ban_recomputation_if_allowedz3solve_min_cut.<locals>.ban_recomputation_if_allowed  s   D!! 	58 4;
(=>> @K)-??  > m u $ 	5DI*TYu-=u~"N"N5
 	IX4:R0000@R	 	 	
 	
 	
 trN   r   _outsinkz;must be available for backward: input required for gradientr9  r8  z3must be computed in backward: required for gradientz+must recompute: marked by checkpoint policyzprimal inputzforward RNG seedr   rG          znon-tensor outputzcannot save: )r:  zdata dependencystart_nodesr`   	max_ranger~   c                   g }| D ]-}t          j        |
                    |          |df           .t          |          dk    rt          j        |          \  }}}|s
                    |          S |j        D ]l}
                    |          rU
                    |          |k    r1
                    |          | 	||          f}||vrt          j        ||           mt          |          dk    |S )z
        Finds the first unfusible node in the chain of nodes starting from
        `start_nodes` and returns its position.
        Tr   )heapqheappushr   r   heappopr  ry   )rC  rD  sorted_nodesrn   rI  rE   node_is_fusibler  r   rM   r]  s            rL   find_first_unfusiblez+solve_min_cut.<locals>.find_first_unfusibleh	  s4   
 9; 	O 	OAN<)*@*@*C*CQ)MNNNN,!##',}\'B'B$At_" 4 --d333
 
: 
:++D11 	: --d33i?? !..t44"
4..6C
 ,..|S999 ,!## rN   c                d    g | ],}                     |                              |          -S r]   )ry   r   rm   r  r]  s     rL   r   z!solve_min_cut.<locals>.<listcomp>	  sK       ++D11&&t,,  rN   c                >    g | ]}                     |          |S r]   )ry   rM  s     rL   r   z!solve_min_cut.<locals>.<listcomp>	  s<       I4L4LT4R4R  rN   z1used above/below fusible %s:(%s) -> %s -> %s:(%s)r~  ztoo long %s %s %s %sr   FTr  r  c                     dddS )Nmin_cut_failed_fx_graphr  r  r]   r]   rN   rL   rq   zsolve_min_cut.<locals>.<lambda>	  s    5 (% % rN   c                      S rI   r]   )fx_graph_strs   rL   rq   zsolve_min_cut.<locals>.<lambda>	  s    < rN   r  min_cut_failed_graphz.txtwz(failed to write: )
c                     dddS )Nmin_cut_failed_edge_listr  r  r]   r]   rN   rL   rq   zsolve_min_cut.<locals>.<lambda>	  s    2$! ! rN   c                      S rI   r]   )edge_list_strs   rL   rq   zsolve_min_cut.<locals>.<lambda>	  s    } rN   	node_namec                n    dD ]1}|                      |          r| d t          |                    c S 2| S )N)r8  r@  )endswithr   )r[  suffixs     rL   get_base_namez$solve_min_cut.<locals>.get_base_name
  sO    - 9 9F ))&11 9(CKK<88889  rN   zdepends on z  :z    - z -> c                     dddS )Nmin_cut_failed_svgr  r  r]   r]   rN   rL   rq   zsolve_min_cut.<locals>.<lambda>9
  s     4$,) ) rN   c                      S rI   r]   )svg_contents   rL   rq   zsolve_min_cut.<locals>.<lambda>=
  s    { rN   zFX graph dump: zMin-cut graph visualization: z[Production debugging: Use tlparse to extract debug artifacts (min_cut_failed_fx_graph, min_cut_failed_edge_list, min_cut_failed_svg)]
a  AOT Autograd failed to partition the joint forward-backward graph.

The partitioner determines which intermediate values to save from the forward pass vs recompute in the backward pass. This error means a value is required for backward, but cannot be saved AND cannot be recomputed.

This is a bug in PyTorch. Please file an issue at https://github.com/pytorch/pytorch/issues

Nodes involved in the conflict:
z

[For PyTorch developers: one of the above constraints is wrong. Either the node should be recomputable, saveable, or not required for backward.]

[Debug: min-cut path] z-Failed to compute min-cut on following graph:rs  c                 h    d                      j        j                                                S NrV  join	readwriteedgelistgenerate_edgelistnxr>  s   rL   rq   zsolve_min_cut.<locals>.<lambda>d
  &    		","7"I"I("S"STT rN   c                 h    d                      j        j                                                S rf  rg  rl  s   rL   rq   zsolve_min_cut.<locals>.<lambda>n
  rn  rN   c              3  ,   K   | ]}||         fV  d S rI   r]   )rm   rn   r>  s     rL   ro   z solve_min_cut.<locals>.<genexpr>v
  s,      88Q$888888rN   c              3  (   K   | ]}|v |fV  d S rI   r]   )rm   r   non_reachableus     rL   ro   z solve_min_cut.<locals>.<genexpr>w
  s1      AAa=.@.@q!f.@.@.@.@AArN   znode_in[:-3]=z != node_out[:-4]=c                    i | ]\  }}||	S r]   r]   r  s      rL   r  z!solve_min_cut.<locals>.<dictcomp>
  s    HHHic4cHHHrN   c              3  (   K   | ]}|         V  d S rI   r]   rm   rE   name_to_nodes     rL   ro   z solve_min_cut.<locals>.<genexpr>
  s(      22d	222222rN   c                    |          S rI   r]   )r   node_idxs    rL   rq   zsolve_min_cut.<locals>.<lambda>
  s    (1+ rN   rr   )r  rF   r  rF   rG   r;   rW   )rE   rF   rG   r   )rE   rF   ri   rb   rG   r/  )r  )rE   rF   r6  r   rG   r;   )rC  r`   rD  r~   rG   r~   )r[  r   rG   r   )Kr$   get_default_op_listr<   r   rD   r=   r  networkxImportErrorr   DiGraphr   rd   re   r;  r   r  r  r   r5   r4   ry   r   r   r   rT  r   rC  r   r   r   r   r1   ri   r3  r  r   rv   r   rB  r   r   r   rF  rH  r  rG  minimum_cutNetworkXUnboundedr;   r   handlersowning_moduler  r   r   _get_unique_pathopenwrite	Exceptionrh  ri  rj  rk  _find_infinite_capacity_pathr   
setdefaultrp  visualize_min_cut_graphr   r  r   get_name_to_noder  rt   )Lr   r]  r  r  joint_module_opsops_ignoreder+  r5  r?  rE   
ban_reasonis_non_tensor_nodeweightcannot_save_reasonr  rK  	used_nodeordersfw_usersfirst_unfusible_usevisited
start_nodefusiblestart_orderrI  r   	cut_value	partitionunbounded_excstructured_tracing_enabledfx_graph_filer   finf_pathnode_constraintsraw_path_nodesr_  	from_nodeto_noder6  base	from_baseto_baseconstraint_linesr[  constraintscconstraints_strraw_path_strsvg_pathlocal_files_msgtlparse_msg	reachablecutsetnbrs	cut_nodesnode_innode_outr0  r=  r  r  rZ  rR  rM   r.  r!  ry  r{  rr  rm  r>  r  rd  rs  sL    ```                                                        @@@@@@@@@@@@@@@@rL   solve_min_cutr    s[    <<"$$H 	H% &
 &
#)&
 &
 &
 
 

 ' 4
 4
$54
 4
 4
 *
 *
 
 	9;GGG   (   A A A A A A A A&   O
 
	
       9 9 9 9 9 9 9 9vF F F F F F&$ &$ &$ &$ &$ &$ &$P rz||H(2L         @ ! Y Y7h9...9555!!I&!XX	 "     !!I%!XP	 "    $ 	
 	E!D	     d 	C((~>>>> && 	C((/ABBB
 .-d33
##D)) 	;j 	;((z::: "E}DI'EUty SDIe4Del)S)S%S 	 t 	=..//F!% 	 	e$$}6F&G  9%%9 %)""%8"")8i;* *&F&
  	V6TX#5#579J9J	E!	F";'9;;	      di%/V1CfUUUJ 	 	D	F"	E!(	     	2      4 , ;"4 	; 	;I   %O  F
   !*  H 6{{Q&:&:8S[[&Q&Q#!)/22 ; ;D!0066;%22488;NNN&Jy$77 O  <//$O%%229==/ %22488   54T::: 1 #V'1||%+ !	V !	VJ++J77 ''
33Z@2G $00<<Kg,,""w//3'>>C    **3//+2CCCG))HH."!..s33!..z::   10555I V VD!0066V&JsD11V !44w1G1G1M1Mt0TUUU5 g,,""8b-r~h&II	99 V V V%))*<%=%=" %)#'"0	6
  &++!&tD ,    %%    0///    --CVLLMmS)) &Q%%%& & & & & & & & & & & & & & & 	6 	6 	65555MMMMMM	6 		","7"I"I("S"STT  -,,,	
 	
 	
 	
 099 ^	! 68&ZN! ! ! ! /7  *	7F%%g... (((=11D$//b99@@HHHH&&(=33D$//b99@@HHHH"]9--w1G1GGG(=33D$//b99@@HHHH !.i 8 8I+mG44G$//<<CC1i11   
 +-*:*@*@*B*B : :&	; ''(9Y(9(9(9:::$ : :A$++LQLL9999: #ii(899O!;;~66L %<H$E$E!Hk  ! !  3222    8EL3-3333"   P#O8#O#O#OO K) a 
 ! #! ! *6! ! #! ! !  !!" 	@AAATTTTT 	
 	
 	
 	 ))) 	 	 	@AAATTTTT 	
 	
 	
 	 )))	  )I}*4,,F8888i888 B B4AAAAAdAAAAAAA!+I# ! !3B3<8CRC=(( OOO"OO   CRCL	i    #K00LHH9[5F+G+GHHHH2222	2228M8M8M8M  L %%s   $B) )
C3CC$Y; ;i3$h+A\8
\, \8,\0	0\83\0	4\87h8
]]h]J=hA$i3r>  nx.DiGraph[str, dict[str, Any]]!list[tuple[str, str, str]] | Nonec                   t          dg          }t          dg fg          }|r|                                \  }}|                     |          D ]}||v r| |         |         }|                    dd          }|t
          j        k    s|t          k    rW|                    dd          }|||f}	||	gz   }
|dk    r|
c S |                    |           |	                    ||
f           |dS )zBFS from source to sink following only infinite-capacity edges.

    Returns a list of (from_node, to_node, reason) tuples representing the path,
    or None if no such path exists.
    r   r:  r   r6  unknownrA  N)
r$   r   popleft
successorsr   r  r  r3  r  r   )r>  r  queuerE   	edge_pathneighbor	edge_datar:  r6  new_edgenew_paths              rL   r  r  
  s     ($$G <A8R.AQ;R;RE
 3--//i ++D11 	3 	3H7"" x0I }}Z33H48##x7':':"x;; (F3$z1v%%#OOOH%%%h1222  3  4rN   	base_name	extensionc                   |  | }t           j                            |          s|S d}t           j                            |  d| |           r+|dz  }t           j                            |  d| |           +|  d| | S )zGet a unique file path, appending a counter if the file already exists.

    For example, if "min_cut_failed.svg" exists, returns "min_cut_failed_1.svg".
    r&   rI  )ospathexists)r  r  r  counters       rL   r  r  
  s    
 $$$D7>>$ G
'..I<<<<<
=
= 1 '..I<<<<<
=
= ..'.9...rN   tuple[str | None, str | None]c                    ddl }	 ddl}n-# t          $ r  t                              dd           Y dS w xY w|j                            |                                           }|                    |          d         }|	                                D ]}| |
                                         |                                         d         }|                    t          |                     |t          d          k    r|                    d	           |                                                    d
          }t%          dd          }t'          |d          5 }	|	                    |           ddd           n# 1 swxY w Y   ||fS )zVisualize the min-cut graph to an SVG file.

    Returns (path_to_svg, svg_content) tuple. Both are None if pydot is unavailable.
    r   NzMInstall pydot to visualize the min-cut graph for debugging: pip install pydotT)exc_info)NNr:  r  redutf-8min_cut_failed.svgrT  )r}  pydotr~  r=   r  nx_pydotto_pydot	to_stringgraph_from_dot_data	get_edges
get_sourceget_destination	set_labelr   rC  	set_color
create_svgdecoder  r  r  )
r>  rm  r  
dot_format	dot_graphedger  rd  r  r  s
             rL   r  r  
  s       [ 	 	
 	
 	
 zz %%h//99;;J))*55a8I##%% " "$//++,T-A-A-C-CDZPs6{{###U5\\!!NN5!!! &&((//88K   0&99H	h		 	               [  s    &55FFFc                 .   g t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j	        t           j
        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j         t           j!        t           j"        t           j#        t           j$        t           j%        t           j&        t           j'        t           j(        t           j)        t           j*        t           j+        t           j,        t           j-        t           j.        t           j/        t           j0        t           j1        t           j2        t           j3        t           j4        t           j5        t           j6        t           j7        t           j8        t           j9        t           j:        t           j;        t           j<        t           j=        t           j>        t           j?        t           j@        t           jA        t           jB        t           jC        t           jD        t           jE        t           jF        t          jH        t           jI        t           jJ        t           jK        t           jL        } t           jI        t           jJ        t           jM        g}|t           jN        t           jO        t           jP        t          jR        t           jS        t           jT        t           jU        t           jV        t           jW        g	z  }|}| g t          j        t          jX        t           jY        t           jL        t           jZ        t          j[        t          j@        t           j[        t           j\        t          jR        t           jV        t           j]        t           jN        t           jS        t           jO        t           j^        t           j_        t           j`        t           ja        t           jb        t           jc        t           jd        t           je        t           jf        t           jg        t           jh        t           ji        t           jT        t           jj        t           jk        t           jl        t           jm        t           jn        t          jo        t          jp        z  } | t           jq        t           jr        gz  } | |z  } | t                      z  } | t           jt        gz  } | d t          D             z  } t          |           }t          t          dt          f                  t           jy        t           jz        t           j{        g          }t           j|        t           j}        t           j~        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        g}||z  }t          |t          |          |t          |          |          S )Nc                ,    g | ]}t          |          S r]   )r   )rm   ms     rL   r   z'get_default_op_list.<locals>.<listcomp>l  s!     N N N1!3A!6!6 N N NrN   .)r   r  r  r  atan2rS  rB  rD  pow	remainderfmod__and____or____xor__
__lshift__
__rshift__eqnegegtleltrL  bitwise_notceilfloorfracnegreluroundsilutruncr=   log10log1plog2lgammaexpexpm1erferfccosacoscoshsinasinsinhtanatantanhatanhsqrtrsqrtrR  sigmoidsoftplus	thresholdthreshold_backwardclampwherelerpaddcmulgelugelu_backwardro  mean_grad_sum_to_sizesum_to_sizerM  totype_asr   r   squeeze	unsqueezersub_to_copyaliasr   slicetrN  broadcast_in_dimexpand
as_stridedpermuteselectrv  rO  clone	full_likevarstd_unsafe_viewr   broadcast_tensorsscalar_tensorones	new_zerosr%  arangetriuvar_meanisinfr   fullzerosempty
empty_likeargmaxmaximumiota'_low_memory_max_pool_offsets_to_indicesr  gatherr  
zeros_liker   r$   r   r	   native_dropout	rand_like
randn_likemmconvolutionconvolution_backwardbmmaddmm#_scaled_dot_product_flash_attention'_scaled_dot_product_efficient_attention_flash_attention_forward_efficient_attention_forwardupsample_bilinear2d
_scaled_mmr?   )default_recomputable_opsrecomputable_view_opsrC   rD   rB   rA   r@   s          rL   r|  r|  
  ss   L:L:L: 	L: 	
	L:
 	L: 	L: 	L: 	L: 	L: 		L: 	L: 	L: 	L: 	L: 	L:  	!L:" 	#L:$ 	%L:& 	'L:( 	)L:* 	+L:, 	-L:. 	/L:0 		1L:2 	
3L:4 		5L:6 	7L:8 		9L:: 	
;L:< 		=L:> 	
?L:@ 	AL:B 	
CL:D 	
EL:F 		GL:H 	IL:J 	KL:L 	
ML:N 	OL:P 		QL:R 	SL:T 		UL:V 		WL:X 	YL:Z 		[L:\ 		]L:^ 	_L:` 		aL:b 		cL:d 	
eL:f 		gL:h 	
iL:j 	kL:l 	mL:n 	oL:p 	qL:r 	sL:t 	
uL:v 	
wL:x 		yL:z 	{L:| 		}L:~ 	L:@ 	AL:B 		CL:D 	EL:F 	GL:H 		IL:J 	KL:L 	ML:N 	OL:P 	QL:R 	SL:T 		UL:V 	WL:Z "\4>4:F	


 
 %H $!	$!"$! 	
$! 		$!
 	$! 		$! 		$! 	$! 	$! 	$! 	$! 	$! 		$! 	$! 	
$!  	!$!" 	#$!$ 	%$!& 		'$!( 	)$!* 	+$!, 	-$!. 		/$!0 	1$!2 	
3$!4 	5$!6 		7$!8 	9$!: 	
;$!< 	
=$!> 	?$!@ 	A$!B 	C$!D 	
E$!F 	5G$! $L T[ 99(/!   N N N N NN!":;;HS#X./		dndo> J 	!
04%)  #Z/K())8  rN   c                2    i }| j         D ]}|||j        <   |S rI   )r   r   )r   ry  rE   s      rL   r  r    s-    ')L ' '"&TYrN   memorylist[float]runtimes
max_memoryall_recomputable_banned_nodes"tuple[float, list[int], list[int]]c                0   t           j        }|dk    rt          |||          S |dk    rt          |||          S |dk    rt	          |||          S |dk    rt          |||          S |dk    rkt                              d           t          j	        | |||          }t	          ||t          |                              t          |	                    S t          |t                    r ||| |||          \  }}	d
||	fS t          d|           )Ngreedyilpdpr)   dynamic_memory_budget_dpzdynamic_memory_budget_dp is an experimental solver. It does not guarantee performance improvements. Additionally, it is not guaranteed to be stable.)r   rW   recorded_knapsack_input_memories recorded_knapsack_input_runtimes)graph_info_provider)knapsack_algomax_mem_budgetrB  z,Not aware of memory budget knapsack solver: )r   activation_memory_budget_solverr*   r+   r(   r)   r=   warningr'   inialize_from_graphr,   get_knee_point_memory_budgetr   r   r   )
r   rS  rU  rV  r]  rW  SOLVERr`  saved_node_idxrecomp_node_idxs
             rL   #_optimize_runtime_with_given_memoryrj    sr    3Fvx<<<	5FHj999	468Z888	3	3	3-fh
KKK	-	-	-?	
 	
 	

 0C#*G-3-5	
 
 
 $7  **)) +  	
 	
 		
 
F0	1	1 T*0&KY8U+
 +
' ^_55R&RRSSSrN   no_dispatchr   rj  c                    t          | j                  }d	fdfd|D             }fd|                                 D             }|                     ||          S )
Ndtorch.SymInt | intrG   r~   c                &    t          |           S )Nri  )r    )rn  rj  s    rL   realize_symbolz8_remove_symbols_without_guarding.<locals>.realize_symbol  s     X6666rN   c                &    g | ]} |          S r]   r]   rm   r  rq  s     rL   r   z4_remove_symbols_without_guarding.<locals>.<listcomp>  s#    ...1^^A...rN   c                &    g | ]} |          S r]   r]   rs  s     rL   r   z4_remove_symbols_without_guarding.<locals>.<listcomp>  s#    444AnnQ444rN   )stride)rn  ro  rG   r~   )r  shaperu  new_empty_strided)r   rj  rv  ru  rq  s    `  @rL    _remove_symbols_without_guardingrx    s    MME7 7 7 7 7 7 /......E4444444FuV444rN   c                   	 t           j        }dd}|dk    rdS |dk    rnt                      5  dd	lm} t          j        | j         j        f          \  	|	                    	 fd
          }|cd d d            S # 1 swxY w Y   d S |dk    rddl
m} t          j        | j         j        f          \  	 |d          5 }  j        i 	 d d d            n# 1 swxY w Y   |                                }t          |d          S t          |t                     r |           S t#          d|           )Nr   r	   rG   c                v   t          | t          j                  rAt          | j        d         t          j                  rt          | j        d         d          S t          | t          j                  rAt          | j        d         t          j                  rt          | j        d         d          S t          | t          j                  r't          | j        d         t          j	                  rdS t          | t          j                  r't          | j        d         t          j
                  rdS | S )Nr   rh  ri        ?T)r   r   r   r   r   rT  rx  r   r    r   r   rk  s    rL   materialize_argz)estimate_runtime.<locals>.materialize_arg  s    a!! 		j&M&M 		3AF5MDQQQQ27## 	
16%=%,(O(O 	$QVE]TBBBB27## 	
16%=%.(Q(Q 	327## 	
16%=%-(P(P 	4HrN   testingr&   profiler   )benchmarkerc                      j          i S rI   )r   )r   r   rE   s   rL   rq   z"estimate_runtime.<locals>.<lambda>  s    ;4;3O3O3O rN   flops)FlopCounterModeF)displayz Not aware of runtime estimator: )r   r	   rG   r	   )r   *activation_memory_budget_runtime_estimatorrl  $torch._inductor.runtime.benchmarkingr  r   tree_mapr   r   benchmark_gputorch.utils.flop_counterr  r   get_total_flopsrB  r   r   r   )
rE   RUNTIME_MODEr|  r  msr  modecounted_flopsr   r   s
   `       @@rL   estimate_runtimer    s   DL
 
 
 
 y  q		"	"]] 	 	HHHHHH!??TY<TUULD&**+O+O+O+O+O+OPPB	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 
	 	 <<<<<<DK8PQQf_U+++ 	)tDK((((	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) ,,..=!$$$	L"8	9	9 N|D!!! LlLLMMMs$   ABB
BC&&C*-C*memory_budgetc                
    !"#$%&'( |dk    s|dk     rt          d|           t          t          j        t          j        t          j        t          j        t          j                  }t          j        rt          |dddd          }|dk    rj
        S t           |          \  }}|dk    r|S dBd j
                  % |          ##%k    r|S dC#%fd dD#%fdt          |ddd          }t           |          \  }} |          |k     r|S t          |d          t                     \  }}	 |          |k     r|S ddlm t          fdj
        D                       "dE"fd}
 |
|	          }d |D             &&fd|D             }t          |t           d          t#                    dk    r
j
        &z   S  fdD             $d D             (ddlm' dF$'(fd$!t          j        r=dG! (fd'} |d(           |d)          g}|d         dd          |d         dd          k    r|d         |d         fg}|r|                                \  }}|d         |d         z
  d*k     r+|                    |           |                    |           Y ||d         |d         z   d+z            }|dd          |dd          k    r|                    ||f           |dd          |dd          k    r|                    ||f           ||                                 dd lm} d, |D             }d- |D             }|                    d./           |                    ||d01           t9          |          D ])\  }}|                    |d2|||         fd3d4d56           *|                    d7           |                    d8           |                     d9           |!                    d           |"                                }|#                                 tI          j%                    }t          j&        "t          j&        }tI          j'        |d:           d;}tP          j)        *                                r?tP          j)        +                                r!d<tP          j)        ,                                 }tH          j-        .                    |d=| d>t_                       d?          }|0                    |           tb          2                    d@|            !| A          d         S )HNr&   r   zJThe valid ranges for memory budget are 0 <= m <= 1. The provided value is )r   r   r   r   r   F)r   r   r   r   r0  r`   rG   rC  c                L    t          t          t          |                     dz  S N    eA)ro  maprq  )r0  s    rL   estimate_activations_sizez:choose_saved_values_set.<locals>.estimate_activations_size"  s    3x..//#55rN   szc                    | dz  z
  z  S r  r]   )r  max_act_sizemin_act_sizes    rL   get_normalized_sizez4choose_saved_values_set.<locals>.get_normalized_size+  s    S\L899rN   activationsc                ,     |           z
  z
  z  S rI   r]   )r  r  r  r  s    rL   get_mem_ratioz.choose_saved_values_set.<locals>.get_mem_ratio.  s(    ))+66E<'
 	
rN   )r   r   r   )r   )get_node_storagec              3  .   K   | ]} |          V  d S rI   r]   )rm   rE   r  s     rL   ro   z*choose_saved_values_set.<locals>.<genexpr>L  s/      TT4 0 0 6 6TTTTTTrN   r=  rb   c                "    fd| D             S )Nc                |    g | ]8}|j         t          d           k     r |          vst          |          6|9S )r  )r'  r~   r  )rm   r<  r  input_storagess     rL   r   zRchoose_saved_values_set.<locals>.get_recomputable_banned_nodes.<locals>.<listcomp>Q  s`     
 
 
 S))$$Q''~==033 >  >==rN   r]   )r=  r  r  s    rL   get_recomputable_banned_nodesz>choose_saved_values_set.<locals>.get_recomputable_banned_nodesN  s3    
 
 
 
 
!
 
 
 	
rN   c                d    g | ]-}|j                             d d          t          j        k    +|.S )r   F)r   r   r%   rR  r  s     rL   r   z+choose_saved_values_set.<locals>.<listcomp>_  sA       6::k5))-=-GGG 	
GGGrN   c                    g | ]}|v|	S r]   r]   )rm   r<  must_save_nodess     rL   r   z+choose_saved_values_set.<locals>.<listcomp>d  s*     ! ! !0H0H0H0H0HrN   Trt  c                @    g | ]} t          |                    S r]   rq  )rm   r<  r  s     rL   r   z+choose_saved_values_set.<locals>.<listcomp>q  s8       -.HQKK((  rN   c                ,    g | ]}t          |          S r]   )r  r&  s     rL   r   z+choose_saved_values_set.<locals>.<listcomp>t  s.       #'  rN   rk  r  r]  r_   r   r   tuple[list[fx.Node], float]c                                5  t          |t          | d          |          \  }}}d d d            n# 1 swxY w Y   t                      }|D ].}	 |                    |                    # t          $ r Y +w xY w|                              st          d          t          ||
|          \  }}	t          r"t          ||||d D             |	  	         ||fS )Nr   z:dont_ban must be a subset of all_recomputable_banned_nodesc                ,    g | ]}t          |          S r]   r  r  s     rL   r   zNchoose_saved_values_set.<locals>.get_saved_values_knapsack.<locals>.<listcomp>  s+     ' ' '$%HQKK' ' 'rN   )	r   rW  saved_node_idxsrecomputable_node_idxsexpected_runtimememories_banned_nodes normalized_memories_banned_nodesruntimes_banned_nodesmin_cut_saved_values)
rj  rB  r$   r  BaseExceptionissubsetr   r  r<   r   )r  r]  r   r  r  r  r  r;  r0  rI  aggressive_optionsrW  r  rl  r  s             rL   get_saved_values_knapsackz:choose_saved_values_set.<locals>.get_saved_values_knapsacky  s    []] 	 	
 4%%M1%%- 	 &		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 )3) 	 	C:3?@@@@       !>?? 	 L   (	
 
a ! 	4'.K /'=!1' ')F' ' ' 2G&;%1    ---s!   '?AAA99
BBr  tuple[float, float, float]c                b     |           \  }}| t                    |z
   |          fS )N)r]  r   )ro  )r  r0  r  r  r  r   r]  r  s      rL   estimate_for_budgetz4choose_saved_values_set.<locals>.estimate_for_budget  sU    -F-FYK. . .*L* )**-==l++ rN   rB  r{  gMbP?r1  c                    g | ]
}|d          S )r1  r]   rm   items     rL   r   z+choose_saved_values_set.<locals>.<listcomp>      000DG000rN   c                    g | ]
}|d          S r&   r]   r  s     rL   r   z+choose_saved_values_set.<locals>.<listcomp>  r  rN   )
      )figsizeo)markerz.4fzoffset points)r   r  center)
textcoordsxytexthazMemory Budgetz Runtime of Recomputed Componentsz:Pareto Frontier of Memory Budget vs. Recomputation Runtime)exist_okr  _rank_memory_budget_paretorI  r  z%Generated Pareto frontier curve at %s)r  r]  r   )r0  r`   rG   rC  )r  rC  rG   rC  )r  r`   rG   rC  )r=  rb   rG   r`   )r  rC  r]  r_   r   r   rG   r  )r  rC  rG   r  )3r   r   r   ban_recompute_used_far_apart!ban_recompute_long_fusible_chains#ban_recompute_materialized_backwardban_recompute_not_in_allowlistban_recompute_reductionsaggressive_recomputationr   ra   r  torch._inductor.fx_utilsr  r$   rt   rq  r   torch.utils._mode_utilsrl  visualize_memory_budget_paretor  r   sortmatplotlib.pyplotpyplotfigureplotr  annotatexlabelylabeltitlegridgcfshowr  getcwdmemory_budget_pareto_dirmakedirsr   r  r  is_initializedget_rankr  rh  r2   savefigr=   rd  ))r   r]  r  r  runtime_optimized_saved_valuesrI  more_aggressive_optionsmore_aggressive_saved_values%aggressive_recomputation_saved_valuesr=  r  recomputable_banned_nodesr  optionsbisectslhsrhsmidpltx_valuesy_valuesr<  txtfigfig_dirrank_suffixfig_namer  rW  r  r  r  r  r  r  r  r  r  r  rl  r  s)   ``                         @@@@@@@@@@@@@@rL   choose_saved_values_setr    s   
 qMA--hYfhh
 
 	
 $$A#)#K%+%O & E8  O & 
!"'',).$)
 
 
 (5) )%"A --6 6 6 6 -,Y-=>>L,,-KLLL|##--: : : : : : :
 
 
 
 
 
 
 

 &##(%*	   '4Y 7' '# ! }122]BB++  %   ;HY 2; ;7)< }:;;mKK44999999TTTT9CSTTTTTN
 
 
 
 
 
 
  !> =l K K *  O
! ! ! !,! ! ! %+!x% % %! ())Q../11   2O   +H   4333331. 1. 1. 1. 1. 1. 1. 1. 1. 1.f , AG	 	 	 	 	 	 	 	 	 	 '&s++-@-@-E-EF1:abb>WQZ^++
GAJ/0G 
/";;==Sq6CF?T))NN3'''NN3'''))3q6CF?a*?@@qrr7c!""g%%NNC:...qrr7c!""g%%NNC:...  
/ 	''''''0000000000 	

7
###8C000  )) 	 	FAsLLhqk"*      	

?###

5666		NOOOggii


)++*65GK$////))++ 	B0A0P0P0R0R 	BA5#4#=#=#?#?AAK7<<TKTT:L:N:NTTT
 
 	H;XFFF %$#yk  	 	rN   list[torch.fx.Node]c                X   ddl m dd}dfd}t          j                                        rwt          j                                        rXt          j                                        d	k    r5 ||           r) ||           rt                      5               5  d
 |D             g}d t          t          j                                                  D             }t          j        	                    ||d                    t          |           g }i }t          |          D ]t\  }}	fd|	D             }
d}|
D ]B}t          |          }||z  }|t          j                                        k    r
|||j        <   C||d<   |                    |           ut          j        |t          j        j                                                  }t          j                            |t          j        j        j        j                   t-          t          j        |                                                    }d| d| t3          dd fd           fd||         D             }d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |S )Nr   )unset_fake_temporarilyr   r@  rG   r;   c                    | j         D ]7}t          |j        t          j        j                  r|j        j        dv r dS 8dS )N>   c10d_functionalr   TF)r   r   r   r   r  r  r   )r   rE   s     rL   has_collectivesz3_sync_decision_cross_ranks.<locals>.has_collectives   sP    % 	 	DUZ2  +'+RRRtturN   c                0   d                     d | j        D                       }t          j        |                    d                                                    }d t          t          j        	                                          D             t                      5               5  t          j                            |           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   t          fdD                       S )N/c              3  $   K   | ]}|j         V  d S rI   r  rm   r   s     rL   ro   zE_sync_decision_cross_ranks.<locals>.has_same_nodes.<locals>.<genexpr>  s$      >>qAF>>>>>>rN   r  c                    g | ]}d S rI   r]   rm   rI  s     rL   r   zF_sync_decision_cross_ranks.<locals>.has_same_nodes.<locals>.<listcomp>  s    NNNqdNNNrN   c              3  0   K   | ]}d          |k    V  dS r  r]   )rm   r   
all_inputss     rL   ro   zE_sync_decision_cross_ranks.<locals>.has_same_nodes.<locals>.<genexpr>  s,      ::!:a=A%::::::rN   )rh  r   hashlibsha256encode	hexdigestr:  r   r  get_world_sizerl  all_gather_objectr  )r   node_strra   r  r  s      @rL   has_same_nodesz2_sync_decision_cross_ranks.<locals>.has_same_nodes  s   
 88>>K,=>>>>> 8 899CCEENNE%*;*J*J*L*L$M$MNNN
]] 	D 	D2244 	D 	D//
FCCC	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D ::::z::::::s6   #C2.!CC2C	C2"C	#C22C69C6r&   c                    g | ]	}|j         
S r]   r  r  s     rL   r   z._sync_decision_cross_ranks.<locals>.<listcomp>  s    5551555rN   c                    g | ]}g S r]   r]   r  s     rL   r   z._sync_decision_cross_ranks.<locals>.<listcomp>  s%     : : :: : :rN   c                     g | ]
}|         S r]   r]   )rm   op_namery  s     rL   r   z._sync_decision_cross_ranks.<locals>.<listcomp>'  s    TTT|G4TTTrN   z
total size)r  r'  zpicked_rank_idx=z, saved_nodes of current rank=r  c                     dddS )N)aot_joint_graph_sync_decision_cross_ranksr  r  r]   r]   rN   rL   rq   z,_sync_decision_cross_ranks.<locals>.<lambda>=  s    G (% % rN   c                      S rI   r]   )sync_decision_cross_ranks_strs   rL   rq   z,_sync_decision_cross_ranks.<locals>.<lambda>A  s    #@ rN   r  c                     g | ]
}|         S r]   r]   )rm   rn   ry  s     rL   r   z._sync_decision_cross_ranks.<locals>.<listcomp>D  s*       $%Q  rN   )r   r@  rG   r;   )torch._subclasses.fake_tensorr  r   r  r  r  r  rl  r:  r  r  r  rq  r  r   r   ri  distributed_c10d_get_object_coll_device
all_reduceReduceOpMAXr~   argminr  r   )r   r0  r   r  objectssaved_ops_names_all_rankssaved_sizessaved_ops_with_sizesr;  saved_ops_namessaved_nodes
saved_sizerE   size_of_nodesaved_sizes_tensorpicked_rank_idxry  r  r  s                   @@@rL   rT  rT    s    EDDDDD   ; ; ; ; ; ; 	&&((1,,..1 ,,..22OK(( 3N;'' 3 ]] *	 *	2244 *	 *	555556G: :!%"3"B"B"D"DEE: : :% //0I7ST:VVV+K88L%'K35 (12K(L(L 	/ 	/$_TTTTOTTT
' G GD#+D>>L,.Je/88:::::F,TY75?$\2"":....!&(9QQSS" " " (("u'8'I'R'V )    "%,/A"B"B"G"G"I"IJJO -E  -E  -E  oC  -E  -E)  A@@@      )B?)S  LQ*	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	 *	X s7   J&GJ<JJ	JJ	JJ#&J#moduler  c                r   d}|rdnd}t          t          | j                            d                              }| j                            dt          j        j        j                  D ]}t          | |j	        d         j
                  }t          |t          j                  rg }|j                            d          D ]}||j        v r| j                            |          5  | j                            | d|           }	|d	z  }|j        d
         |	j        d
<   |	}|                    |	           ddd           n# 1 swxY w Y   |r| j                            |          5  | j                            dt          j        j        j        g |j	        |R i           }
|                    |
d           ddd           n# 1 swxY w Y   |j                            d          }|r"|\  }}g |d |D             R }||f|
j        d<   | j                            |           | S )u  
    Graph-safe RNG lets torch.compile use CUDA Graphs for graphs with RNG ops.
    For graphs without HOPs, the partitioner adds placeholder nodes
    fwd_rng_state_* and bw_rng_state_* to the forward and backward graphs. At
    runtime, the AOTDispatcher retrieves these RNG states and passes them to the
    compiled graphs.

    This works well for no-HOP graphs. With HOPs, the partitioner runs
    recursively: it first partitions the HOP (producing forward/backward HOP
    subgraphs) and then stitches them back into the outer joint graph. For HOPs
    that contain RNG ops, the outer joint graph now includes HOP subgraph
    modules with extra RNG placeholders. We must thread these placeholders
    through the outer module partitioned forward and backward graphs—this
    function does exactly that. It collects the RNG placeholder nodes from the
    HOPs and creates corresponding placeholders in the outer forward and
    backward graphs.

    There is a catch: for a short period, the joint graph is in a “bad” state.
    The HOP subgraphs expect additional inputs (because of the new
    placeholders), but the outer graph call sites don't yet provide them. We
    can't fix this in the joint graph because the joint graph's input signature
    is fixed (primals, tangents). As a compromise, we keep the joint graph in
    somewhat of a bad state for some time and, once the outer forward and
    backward graphs are partitioned, insert the corresponding RNG placeholders
    and wire up the calls.
    r   r  r  r   r'  r   )r   r   rI  r&   r   NT)propagate_metaeager_input_valsc                (    g | ]}|j         d          S )r   )r   )rm   inps     rL   r   z2thread_graphsafe_rng_from_hops.<locals>.<listcomp>  s    DDDc#(5/DDDrN   )r)  r  r   r(  r   r   r  invoke_subgraphr   r   r   r   r   r  r   rK  r   r   r   r  r  r   r  )r+  r  r  
rng_string
last_inputhop_noder   new_rng_inputsplaceholder_noder  new_hop_node_with_fixed_args
eager_vals
eager_argseager_kwargsnew_eager_argss                  rL   rX  rX  K  s   < I$/D_Jhv|66-6HHIIJJJL++59#9#I ,   ,2 ,2 68=#3#:;;h// (	2,.N$,N$=$=$=$O$O 9 9 !1!666  55jAA 9 9$*L$<$<)77I77% %	 "Q	0@0Ee0L	u-%.
&--i8889 9 9 9 9 9 9 9 9 9 9 9 9 9 9  2\11(;; 	 	39<3K3K'	.>9(-9.99	4 40 224T 3   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &]../ABB
 	/9,J&#&DD^DDD& &N
 '$M056HI ''111Ms&   .AEEE3AGG	G		list[int]c           	        t          | j                  t                      | j        j        D ]n}|j        dk    rd|j        v r                    |           n$t          |          r                    |           |v r                    |j	                   ot          t          t          | j        j                            }t          t          t          | j        j                            }||z   }t          | |          \  }}}	}
                                }                    d |D                        t!          | j        |||	d          }t          fd|j        D                       t          fd| j        j        D                       }t          fdt#          |          D                       }d	}i }| j        j        D ]}|v r
|||<   |d
z  }t%          |||||          S )Nr   tangentsr  c              3  4   K   | ]}||j         dk    |V  d S )Nr   r'  )rm   r  s     rL   ro   z!classify_nodes.<locals>.<genexpr>  s;        !-ADH4D4D4D4D4D4D rN   r   c              3  H   K   | ]}|j         d k    |j                 V  dS r(  r)  rx  s     rL   ro   z!classify_nodes.<locals>.<genexpr>  sC       8 87h 	TY8 8rN   c              3  ,   K   | ]}|v|v
|V  d S rI   r]   )rm   rE   rd   rv   s     rL   ro   z!classify_nodes.<locals>.<genexpr>  sF       6 6(((T9J-J-J 	-J-J-J-J6 6rN   c              3  *   K   | ]\  }}|v 	|V  d S rI   r]   )rm   r<  pr#  s      rL   ro   z!classify_nodes.<locals>.<genexpr>  s;       - -a!7T2T2T2T2T2T2T- -rN   r   r&   )r  r   r$   r   r   r   r  r   r  r  r  r  r5   r4   r/  r  r  r  r_   )r   r#  r!  rE   r  r  ra   r+  r,  r-  r.  re   forward_only_graphrf   ri   fw_cntrh   ry  rd   rv   s    `               @@@rL   rJ  rJ    s   
 $L$677L-7\\"( 1 17m##
dk(A(A!!$''''!$'' 	(!!$'''$$$$$TZ000
L,>,DEEFFM!&)<l>P>V"W"WXX33F OOO CK/1B )--//        <FK1BI  .8 8 8 8 8&,8 8 8 . .
 ,6 6 6 6 6 6 &,6 6 6 , ,O
 #- - - - -..- - - # # FH"(  $$$#HTNaKF#  rN   r  )r#  compilerc               
   | j                                          |                                  | j         }t          j        rt          |          }|| _         | j         }t          |           }t          |           }	|rt          | d          } t          j	        st          |            t          |            t          |            |g }t          | ||          }
t          |
j                  dk    rt!          | ||||
j                  S t%          | j         j                  D ]}|j        dk    rt+          d          |_        "|
                    |          sd|_        ?t+          d          |_        |j        D ]$}t3          |j        |j        dz             |_        %t          j        }|j        D ]?}t7          |j                            d	d          t<                    r|j        d	         } n@t?          ||
|
          }t          j         rtA          ||          }d dtC          tE          fd|                    }tC          tE          tF          |                    }tC          tE          d |                    }tI          | |||||
j                  \  }}|r$|	r"tK          | ||t          |                    \  }}tM          |          }t          j'        rddl(m'}  |||||
j                   tS          |          }tS          |          }tU          |d          }tU          |d          }tV          rtY          d |D                       }t[          d |D                       dz  }t\          /                    d|           t\          /                    d|           ta          d |j         j        D                       }ta          d |j         j        D                       }||z  }tc          t*                    }|j         j        D ]G}|j2        |v r<tg          |j4        d          r'|tk          |j4        j6                  xx         dz  cc<   Ht\          /                    dt          |          t          |          t          |                     tY          |7                                tq          j9        d          d          }t\          /                    d|           ||fS )!ax  
    Partitions the joint graph such that the backward recomputes the forward.
    Recomputing helps in trading off memory bandwidth with computation.

    To create the fwd and bwd graph, we copy the joint graph, manually set the
    outputs to just original forward or backward outputs. And then we run the
    resulting graphs through dead code elimination.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.
        _joint_inputs: The inputs to the joint graph. This is unused.
        compiler: This option determines the default set of recomputable ops.
            Currently, there are two options: ``nvfuser`` and ``inductor``.
        recomputable_ops: This is an optional set of recomputable ops. If this
            is not None, then this set of ops will be used instead of the
            default set of ops.
        num_fwd_outputs: The number of outputs from the forward graph.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    Fr*  Nr   )r!  r#  ri   r   r  r&   r  )r  rn   rF   rG   r;   c                    t          | j                            d          t          j                  o5t          | j                  dk    ot          d | j        D                       S )Nr   r   c              3  Z   K   | ]&}|j         t          j        j        j        j        u V  'd S rI   )r   r   r   r   rL  r   )rm   rs  s     rL   ro   zWmin_cut_rematerialization_partition.<locals>._is_assert_only_symbool.<locals>.<genexpr>G  s3      WW!AH	 = EEWWWWWWrN   )r   r   r   r   r   r   r  r  rn   s    rL   _is_assert_only_symboolzDmin_cut_rematerialization_partition.<locals>._is_assert_only_symboolC  s\    qvzz%((%-88 XAGq XWWqwWWWWW	
rN   c                :    t          |           o |            S rI   r<  )rn   rK  s    rL   rq   z5min_cut_rematerialization_partition.<locals>.<lambda>L  s!    k!nnG-D-DQ-G-G)G rN   c                B    t          |            ot          |            S rI   )r   r1   rJ  s    rL   rq   z5min_cut_rematerialization_partition.<locals>.<lambda>Q  s    [^^+EN14E4E0E rN   r=  r?  rA  Tc                J    g | ] }t          |          t          |          f!S r]   )rq  r   r  s     rL   r   z7min_cut_rematerialization_partition.<locals>.<listcomp>|  s)    KKKSVV4KKKrN   c              3  4   K   | ]}t          |          V  d S rI   r  r  s     rL   ro   z6min_cut_rematerialization_partition.<locals>.<genexpr>  s(      'J'J'J'J'J'J'J'JrN   z'Theoretical Activations Stored: %.2f GBz,Theoretical Per Activation Storage Sizes: %sc              3  :   K   | ]}|j         d k    |j        V  dS r   Nr)  r&  s     rL   ro   z6min_cut_rematerialization_partition.<locals>.<genexpr>  9       %
 %
47o;U;UDI;U;U;U;U%
 %
rN   c              3  :   K   | ]}|j         d k    |j        V  dS rQ  r)  r&  s     rL   ro   z6min_cut_rematerialization_partition.<locals>.<genexpr>  rR  rN   r  z# remat/fw/bw: %d/%d/%drt  zCount of Ops Rematerialized: %sr   ):r   r   r  r   cser8   r   r   rE  rF  rG  rH  rI  rJ  r   rd   ra  ri   r  r   r   r~   r'  ry   r  rD  activation_memory_budgetr   r   r   rC  r  rT  r  r  r1   r"  rU  rV  r@  rW  r:   rX  r<   rt   ro  r=   r  r$   r   r   r   r   r   r  rp  r   rw  )r   r$  rF  r!  r#  r   	cse_graphr   r[  r\  r]  rE   r  r  r0  r  r  r_  r`  r@  sorted_sizestotal_activations_size_gbfw_module_nodesbw_module_nodesremat_nodescountsrematerialized_opsrK  s                              @rL   rD  rD    s   D **,,,D z ' &&	&$K!5l!C!C%=l%K%K"! X-lQVWWW: -|,,,\***|,,,$,(*%3_ I 9&''1,, +*G(1(M
 
 
 	
 +122 R R7h #CD))$// 	R !D #CD
 R R$'(94;Lq;P$Q$Q!!R 3M!  dimmOT::EBB 	 Io6ME	 +#  L ( M1+|LL
 
 
 
 GGGG	
 	
 O
 f^\BBCCEE|TT L
 4'-'$-$I  Iy " ) 	#8iC4H4H$ $ Iy 4I>>I * 

	
 	
 	
 	
 	
 	
 	%$1		
 	
 	
 y))Iy))I.yeLLLI.ydKKKI HKKlKKKLL %('J'J\'J'J'J$J$JS$P!:<UVVV 	?NNN$ %
 %
"+/"7%
 %
 %
 
 
 % %
 %
"+/"7%
 %
 %
 
 
 &7!,S!1!1O) 	> 	>DyK''GDKAR,S,S's4;677888A=888%    		
 	
 	
 $LLNN 3A 6 6
 
 
 	24FGGGirN   fx_graphTtracedfnamefigname
clear_metaprogstr | list[str] | Noneparse_stack_tracedot_graph_shapec                   |rDt          j        | j                  }t          j        | |          } | j        j        D ]	}i |_        
t          j        	                    |          \  }	}
|
sdt          j        z   }
t                              d|	|
           t          j        | |||          }|                                }t#          |d|
                    d          z             }|	 |
 }| ||           d S  |||           d S )Nru  zWriting FX graph to file: %s%s)re  rf  write_)rc  )r  deepcopyr   r   r  r   r   r  r  splitextr   torch_compile_graph_formatr=   r  r#   FxGraphDrawerget_main_dot_graphr   lstrip)r_  r`  ra  rb  rc  re  rf  r   rE   r  extgr   write_methods                 rL   
draw_graphrr    s'     M&,//		22L& 	 	DDII  ''ID# 6F55HH-tS999"+'		 	 	A 	
A1hC899LNSNNE|UU&&&&&&rN   rW   )r   r   rG   r;   )rE   rF   rG   r~   )r   rF   rG   r   )rE   rF   rG   r   )rE   rF   r   r   rG   r   )NF)r   r   ra   r`   r   r`   r   r   r   r   r   r;   rG   r   )r   r   r!  r~   rG   r"  )r0  r`   r   r   rG   r1  )r6  r7  rG   r~   )r>  r?  r   )r   r@  rE   rA  rB  rC  rD  rC  rE  r~   rG   rA  )r   r@  rE   rA  r\  rA  r^  r_  rQ  rC  r`  rC  rE  r~   rG   rA  )ri  rj  rG   rC  )rG   rp  )rE   rA  rG   r;   )rG   r_  )rw  r_  rG   r  )r   )r   r@  r!  r~   rG   r1  )r   r@  rG   r1  )
r  r   r  r   r  r  r!  r~   rG   r1  )Nr   )r0  r`   r  r   r  r   ri   r  r!  r~   rG   r1  rI   )r   r   r0  r`   r  r`   r  r  r!  r~   ri   r  r   r;   r  r;   rG   r  )r   r   r$  r	   r!  r~   r#  r%  ri   r  rG   r  )rl  r~   rw  r_  rG   r~   )r   r   rG   r1  )rG   rz  )r   r  r  rg   rG   r  )r  r   rG   r   )r_  r  r`  r  r  rA  r  rA  r  r  r  r~   r  rA  r  rA  rG   r  )
r   r   r_  r   r`  r   r  r~   rG   r  )r   r   rG   r1  )r   r   r+  r;   rG   r   )
r   r   r]  r_   r  r   r  r  rG   r	  )r>  r  rG   r  )r  r   r  r   rG   r   )r>  r  rG   r  )rG   r?   )r   r   rG   r  )r   r   rS  rT  rU  rT  rV  rC  r]  r_   rW  r`   rG   rX  )r   rj  rj  r~   rG   rj  )rE   rF   rG   rC  r  )r   r   r]  r_   r  rC  rG   r`   )r   r@  r0  r  rG   r  )r+  r   r  r;   rG   r   )r   r   r#  r<  r!  r~   rG   r_   )r  )r   r   r$  r	   rF  r   r!  r~   r#  r%  rG   r  )r^  TNFN)r_  r  r`  r   ra  r   rb  r;   rc  rd  re  r;   rf  r   rG   r1  )
__future__r   r  r   r	  rF  r	  loggingr  r   r  os.pathr  rB  rv  r   r   collections.abcr   dataclassesr   r   typingr	   r
   r   torch._inductor.inductor_primstorch.distributedtorch.fxr   torch.utils._pytreeutils_pytreer   torch._dynamo.utilsr   r   ;torch._functorch._activation_checkpointing.ac_logging_utilsr   $torch._functorch._aot_autograd.utilsr   torch._inductorr   r  !torch._inductor.custom_graph_passr   r   "torch._library.fake_class_registryr   torch._library.utilsr   torch._loggingr   r   torch._logging._internalr   r  r   %torch.fx.experimental._backward_stater   "torch.fx.experimental.proxy_tensorr   r   torch.fx.experimental.sym_noder   r   %torch.fx.experimental.symbolic_shapesr   r   r   r    r!   r"   torch.fx.passesr#   torch.utils._ordered_setr$   torch.utils.checkpointr%   r  -_activation_checkpointing.graph_info_providerr'   "_activation_checkpointing.knapsackr(   r)   r*   r+   ,_activation_checkpointing.knapsack_evaluatorr,   _aot_autograd.descriptorsr-   r.   r/   _aot_autograd.functional_utilsr0   _aot_autograd.graph_compiler1   _aot_autograd.logging_utilsr2   _aot_autograd.utilsr3   r4   r5   r6   r7   compile_utilsr8   r9   r:   r}  rm  sympydebug_partitionerr<   r\   	getLoggerrX   r=   r   r   rN  r?   r_   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r   r   r/  r5  r=  r]  rh  ro  r{  r  r  r  r  r  r  r  r"  ra  r~   r3  rd  rq  ry  cacher  r  rV  r  rU  rG  rH  rI  r   rE  r  r  r  r  r|  r  rj  r  rl  rx  r  r  rT  rX  rJ  rD  rr  r]   rN   rL   <module>r     s   " " " " " " "               				  				  * * * * * * * * $ $ $ $ $ $ * * * * * * * * % % % % % % % %  % % % %           $ $ $ $ $ $ $ $ $ < < < < < < < <      A @ @ @ @ @ 5 5 5 5 5 5        @ ? ? ? ? ? + + + + + + 7 7 7 7 7 7 7 7 . . . . . . A A A A A A ? ? ? ? ? ? H H H H H H H H L L L L L L L L                ) ( ( ( ( ( / / / / / / 3 3 3 3 3 3       L L L L L L            L K K K K K         
 A @ @ @ @ @ 7 7 7 7 7 7 ; ; ; ; ; ;              I H H H H H H H H H  LLL %6  6 6 6 6'g'11 1 1 1 1y~	 > > > > > > > >2                B                           I I I I o   2- - - -   H  $)r r r r rj   X X X XC C C CB B B BJ J J JK K K K   5 5 5 5J J J J$       K K K K K\C! C! C! C!L9 9 9 9"   G G G G45 5 5 5	 	 	 	FG FG FG FG FGRMG MG MG MGh 	X X X X X~ ?C'
 '
 '
 '
 '
\ 04	k" ?C$)&+k" k" k" k" k" k"f 7;>Bb  b  b  b  b  b J #c((" " " "   :R R R R    "P P P PK K K K\[* [* [* [*|a  a  a  a H@ @ @ @' ' ' '<   .K K K K5 5 5 5x ,0	v	& v	& v	& v	& v	&r   B/ / / /$! $! $! $!Ne e e eP   -T -T -T -T` 0 / / / / /5 5 5 5*N *N *N *N` z	 z	 z	 z	 z	zN N N NbO O O Od> > > >H |  7;|  |  |  |  |  | D #'#"&' ' ' ' ' ' 'rN   