
    IЦib                      % S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKJr  S SKJr  S SKJrJrJ r   S SK!J"r"  S SKJ#r#  S SK$J%r%  S S	K&J&r&J'r'  S S
K(J)r)  S SK*J+r+J,r,J-r-J.r.J/r/J0r0J1r1J2r2J3r3J4r4J5r5J6r6J7r7  S SK8r8S SK9J:r;  S SK8J<r<J=r=  S SK>J?r?J@r@JArAJBrB  S SKCJDrDJErEJFrF  S SKGJHrH  S SKIJJrJJKrK  S SKLJMrMJNrN  S SKOJPrP  S SKQJRrR  S SKSJDrT  SSKUJVrV  SSKWJXrX  SSKYJZrZ  SSK[J\r\  \6" S5      r]\5(       a$  S SK^J_r_  SSK`JaraJbrb  SSKcJdrdJere  SSKUJfrfJgrg  SSKhJiri   S S KjJkrkJlrlJmrmJnrnJoroJprpJqrqJrrr  S S!KsJtrt  S S"KuJvrvJwrwJxrx  S S#KyJzrzJ{r{  S S$K|J}r}J~r~JrJr  S S%KJr  S S&KJrJrJr  S S'KJrJrJr  \5(       a  S S(KJr  S S)KJr  S S*KJr  S S+KJrJr  \GR(                  GR+                  \5      r\GR(                  GR1                  \GR(                  GR1                  \5      5      r\GR(                  GR5                  \S,5      r\GR8                  S-:H  r\DGR<                  " 5       (       a  S S.KJr  S S/KJr  S S0KJrJrJrJr  OSS1 jrSS2 jrSS3 jrSS4 jr\8GRP                  GRS                  \S55      rS6r\GR8                  S-:H  r\GRZ                  " \5      rSS7 jrSS8 jr\GRb                  " S5      SS9 j5       r " S: S;5      r " S< S=\5      r " S> S?\5      rSS@ jrSSA jrSSSB jjr S       SSC jjr S       SSD jjr   S           SSE jjrSSF jr  S         SSG jjr\GR|                   " SH SI5      5       rSSJ jrSSK jr " SL SM\GR                  5      r        SSN jr\GRb                  " S5      SSO j5       rSSP jr\GR|                   " SQ SR5      5       r " SS ST\5      r " SU SV5      r          SSW jrSSX jr " SY SZ5      rSS[ jr\GRb                  " S5      SS\ j5       r\~ " S] S^5      5       r " S_ S`5      r\~\GRb                  SSa j5       5       rSSb jr        SSc jr        SSd jrSqSe\Sf'   SSg jr\~ " Sh Si5      5       r          SSj jr\~ " Sk Sl\5      5       r\~ " Sm Sn\5      5       r\~ " So Sp\5      5       rSSq jrSSr jr\~ " Ss St5      5       r " Su Sv5      rSSw jrSSx jrSSy jrSSz jrSS{ jr S         SS| jjr " S} S~5      r\~ " S S5      5       r\~ " S S5      5       r " S S5      r " S S\5      r " S S\5      rg)    )annotationsN)bisect_right)copy)c_void_pCDLLcdll)	timedelta)partial)Path)timetime_ns)
ModuleType)AnyCallablecastDict	GeneratorListNoReturnOptionalSequenceTupleTYPE_CHECKINGTypeVarUnion)SymIntTensor)countersdynamo_timedget_chromium_event_loggerget_metrics_context)configexcmetrics)cuda_env)rocm_compile_commandrocm_compiler)CustomGraphPassCustomGraphPassType)has_frozen_params)log_cache_bypass)r"      )create_cache)autotune_cache)AutotuneCacheBundler)TritonBundlerT)KeysView)_CompileFxKwargsCompiledFxGraph)CompiledFxGraphConstants
OutputCode)
JsonDataTyRemoteCache)	InputType)_set_gpu_runtime_env_transform_cuda_paths
CppBuilder
CppOptionsCppTorchDeviceOptionsget_compiler_version_info&get_name_and_dir_from_output_file_pathnormalize_path_separator)pick_vec_isa)_module_to_triton_kernel_reload_python_module _reload_python_module_in_subproc)	cache_dirdefault_cache_dir)ALIGN_BYTESclear_on_fresh_inductor_cacheis_linux
is_windows)trace_structured)extract_tensor_metadata
FakeTensorTensorMetadata)has_hinthint_intShapeEnv)FutureGraphLowering)ChoiceCaller)HalideInputSpec
HalideMetaz_inductor/script.ldwin32)build_paths)_run_build_command)log_global_cache_errorslog_global_cache_statslog_global_cache_valsuse_global_cachec                     g N argskwargss     X/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torch/_inductor/codecache.pyr\   r\              c                     g ra   rb   rc   s     rf   r]   r]      rg   rh   c                     g ra   rb   rc   s     rf   r^   r^      rg   rh   c                     gNFrb   rb   rh   rf   r_   r_      s    rh   output_codeiX  c                   [         R                  R                  c  SO,S[         R                  R                  R                  SS5       3nS[        R
                  R                   [        R
                  R                   3nU SU 3n[        R                  R                  [        5       U5      n[        R                  R                  X@5      n[        R                  " USS9  U$ )	Ncpucu. py_Texist_ok)torchversioncudareplacesysversion_infomajorminorospathjoinrF   makedirs)namecu_strpython_versionbuild_foldercpp_wrapper_dircpp_wrapper_build_directorys         rf   cpp_wrapper_cache_dirr      s     ==% 	%--$$,,S"567 
 #**001#2B2B2H2H1IJN$%Qvh/Lggll9;=O"$'',,"EKK+d;&&rh   c                 @    [         R                  R                  c  S$ S$ )N
cubin_path
hsaco_path)rw   rx   hiprb   rh   rf   get_cpp_wrapper_cubin_path_namer      s     ==,,4<F,Frh   c                    U b>  [        [        R                  R                  U [        R                  5       S   5      5      $ S $ )Nhash)r   r   r   r   	CacheBase
get_system)global_cache_dirs    rf   get_global_cache_path_implr      sA     ' 	RWW\\*I,@,@,B6,JKL rh   c                      \ rS rSr\\R                  " S5      S
S j5       5       r\\\R                  " S5      SS j5       5       5       r	\SS j5       r
SS jrS
S jrSS jrS	rg)r      Nc                     SSK Jn   U " 5       n SS 0SU0S.n[        R                  R                  [        R                  R                  5       5      n[        R                  R                  b3  UR                  US   S'   [        R                  R                  US   S'   O3UR                  US   S'   [        R                  R                  US   S	'    [        R                  " [        R                   " US
S9R#                  S5      5      R%                  5       US'   U$ ! [         a    S n GN f = f! [        [        4 a    0 n Nrf = f)Nr   )
triton_keyr   triton)devicerx   r   rx   ry   r   T)	sort_keysutf-8r   )triton.compiler.compilerr   ModuleNotFoundErrorrw   ry   get_device_propertiescurrent_devicerx   r   gcnArchNamer   AssertionErrorRuntimeErrorhashlibsha256jsondumpsencode	hexdigest)r   triton_versionsystemdevice_propertiess       rf   r   CacheBase.get_system   s3   	"; (\N	!4.n&F !&

 @ @

))+! }}!!-+<+A+Ax (,1MM,>,>y!&)+<+H+Hx (+0==+<+<y!%(
 !JJv.55g>

)+ 	v 7 # 	"!N	"& - 	F	s)   D  BD3 "2D3  D0/D03EEc                     [        [        R                  R                  [	        5       S[
        R                  5       S   5      5      $ )Ncacher   )r   r   r   r   rF   r   r   rb   rh   rf   get_local_cache_pathCacheBase.get_local_cache_path   s0     BGGLLgy7K7K7Mf7UVWWrh   c                 4    [        [        R                  5      $ ra   )r   r"   r   rb   rh   rf   get_global_cache_pathCacheBase.get_global_cache_path   s    )&*A*ABBrh   c                6    [         R                  5       U l        g ra   )r   r   r   selfs    rf   __init__CacheBase.__init__   s    **,rh   c                    U R                  5       nUR                  5       (       d  0 $ [        U5       n[        R                  " U5      nS S S 5        US   $ ! , (       d  f       WS   $ = fNr   )r   is_fileopenr   load)r   local_cache_pathlocal_cache_fplocal_caches       rf   get_local_cacheCacheBase.get_local_cache   sb    446''))I"#~))N3K $7## $#7##s   A
A)c                    U R                  5       n[        [        U5      [        R                  " U R
                  US.SS9SS9  g )N)r   r      )indentT	make_dirs)r   write_atomicstrr   r   r   )r   r   r   s      rf   update_local_cacheCacheBase.update_local_cache  s<    446 !JJ$++DQO	
rh   )r   returnDict[str, Any])r   r   )r   Optional[Path]r   None)r   r   r   r   )__name__
__module____qualname____firstlineno__staticmethod	functools	lru_cacher   rI   r   r   r   r   r   __static_attributes__rb   rh   rf   r   r      s~    "  "H "X  # X C C-$
rh   r   c                  (    \ rS rSrSS jrSS jrSrg)
LocalCachei  c                R    U R                  5       nUnU H  nXB;   a  X$   nM    g    U$ ra   )r   )r   keysr   	sub_cachekeys        rf   lookupLocalCache.lookup  s6    $$&	C|!J		  rh   c                   U R                  5       nUnUSS  H  nUR                  U0 5        XE   nM     XUS   '   U R                  U5        g )Nr   )r   
setdefaultr   )r   valuer   r   r   r   s         rf   	set_valueLocalCache.set_value  sX    $$&	":C  b)!I  $$r(&rh   rb   N)r   r   r   Optional[Dict[str, Any]])r   r   r   r   r   r   )r   r   r   r   r   r   r   rb   rh   rf   r   r     s    
	'rh   r   c                  f    \ rS rSr\R
                  " S5      SS j5       r          SS jrSrg)PersistentCachei(  Nc                    U R                  5       nUb  UR                  5       (       d  0 $ [        U5       n[        R                  " U5      nS S S 5        US   $ ! , (       d  f       WS   $ = fr   )r   r   r   r   r   )r   global_cache_pathglobal_cache_fpglobal_caches       rf   get_global_cache PersistentCache.get_global_cache)  sh     668$,=,E,E,G,GI#$99_5L %G$$ %$G$$s   A
A,c                D  ^^^^^ [         R                  " 5       m[        [        U R                  TTT5      n[        [
        U R                  TTT5      n[        [        U R                  TTT5      n0 mSSUUUUU4S jjjn[        R                  (       d  [        R                  (       Ga.  [        R                  (       a  U R                  5       O0 n	U" U	5      (       d  [        5       (       a  U" U R                  5       US9(       d  Ub   U" T5      m[        U4S jT 5       5      (       d   eU	R                  T0 5        U	T   R                  T0 5      R                  T0 5        TR!                  5        H   u  pXT   T   T   U
R#                  5       '   M"     U R'                  U	5        T V
s0 s H  oR#                  5       TU
   _M     nn
U" U5        T$ [        5       (       a  U" U R                  5       US9  T$ ! [$         a  nU" U5        UeSnAff = fs  sn
f )a  
Check to see if we have benchmarked the given choice callers. For each
choice caller:

    1. Check global_cache[op][inputs][choice][precision], return benchmark if cached.
    2. Check local_cache[op][inputs][choice][precision], return benchmark if cached.
    3. If benchmark is not None:
        a. `max_autotune_gemm=True`: benchmark the choice, update
            local_cache[op][inputs][choice], and return the benchmark.
        b. `max_autotune_gemm=False`: don't benchmark the choice, return nothing.
Nc                   > SnT H]  nUR                  5       nX@R                  T0 5      R                  T0 5      R                  T0 5      ;   a  U T   T   T   U   T	U'   M[  Sn  O   U(       a  U" US9  U$ )z2Check if `cache` contains data for all the choicesTF)cached)hash_keyget)
r   callbackhitchoicechoice_hashchoicesinputsop	precisiontimingss
        rf   check_cache+PersistentCache.lookup.<locals>.check_cacheM  s    C!$oo/))B"3"7"7"C"G"G	SU"VV&+Bi&7	&B;&OGFO  C " $Jrh   )r   c              3  ,   >#    U  H	  oT;   v   M     g 7fra   rb   ).0r   r   s     rf   	<genexpr>)PersistentCache.lookup.<locals>.<genexpr>k  s     GwV0ws   ra   )r   r   r   r   r   bool)rw   get_float32_matmul_precisionr
   r]   r   r^   r\   r"   max_autotunemax_autotune_gemmautotune_local_cacher   r_   r   allr   itemsr   r   r   )r   r   r   r   	benchmark	log_statslog_vals
log_errorsr   r   r   timingetimings_to_logr   r   s    ```          @@rf   r   PersistentCache.lookup2  s   $ 668	2DKKVYW	0$++r69U#T[["fi

 	 	  &":":":4:4O4O$..0UWK  ,,$&&#D$9$9$;iP)'0GGwGGGGG**2r2O..vr:EEiQST*1--/PVB/	:6??;LM +: ''4 FM"EL6OO%wv6W  " (  --/)D ! $ qMG"s   BH 2H
H
HHrb   r   )
r   zList[ChoiceCaller]r   r   r   r   r  z4Optional[Callable[[Any], Dict[ChoiceCaller, float]]]r   zDict[ChoiceCaller, float])	r   r   r   r   r   r   r   r   r   rb   rh   rf   r   r   (  s[    % %N#N N 	N
 HN 
#Nrh   r   c                     [         R                  R                  [        5       S5      n [         R                  R	                  U 5      (       d  [         R
                  " U SS9  U $ )NlocksTru   )r   r   r   rF   existsr   )lock_dirs    rf   get_lock_dirr    s>    ww||IK1H77>>(##
Ht,Orh   c                    [         R                  " [        R                  " U 5      R	                  5       5      S S R                  S5      R                  5       $ )N3   r   )base64	b32encoder   r   digestdecodelower)datas    rf   sha256_hashr"    s@    GNN40779:3B?FFwOUUWWrh   c                    [        U [        5      (       a  U OU R                  S5      nUS:w  a  US-   UR                  S5      -   nS[        U5      -   $ )Nr   rr   s   ||c)
isinstancebytesr   r"  )codeextrahashing_strs      rf   	code_hashr*    sM    $T511$t{{77KK{!E)ELL,AA[)))rh   c                V   U(       aP  [         R                  R                  U5      (       a  UnOT[         R                  R                  [	        5       U5      nO+[         R                  R                  [	        5       U SS 5      n[         R                  R                  X0 SU 35      nXU4$ )Nr,      rq   )r   r   isabsr   rF   )basename	extensionspecified_dirsubdirr   s        rf   get_pathr2    sy     77==''"FWW\\)+}=Fik8Aa=977<<*Ai[ 9:DT!!rh   c                t    US:X  a  [        X5      $ US;   a  [        [        U 5      5      $ [        SU 35      e)Nr'  )cubinhsacospvzUnknown hash type )r*  reprr   )contentr(  	hash_types      rf   get_hashr:    sB     F((--g''
-i[9
::rh   c                    [        U R                  5       X#5      n[        XQU5      u  pgnUS:H  n	[        R                  R                  U5      (       d
  [        XSS9  Xh4$ )Nr'  Tr   )r:  stripr2  r   r   r  r   )
r8  r/  r(  r9  r0  r   r.  r1  r   encode_utf_8s
             rf   writer>    sV     :C%cmDHd"f,L77>>$Td3>rh   c                     [        U S5      S   $ )zL
Write the `text` to a file and return the path computed based on the hash.
txtr,   r>  )texts    rf   
write_textrC    s     ua  rh   c                   [        U[        [        45      (       d   S5       e[        U 5      nU(       a  UR                  R                  SSS9  UR                  S[        R                  " 5        S[        R                  " 5        S3-  n[        U[        5      (       a  SOSnUR                  Xc(       a  SOS S	9 nUR                  U5        S S S 5         UR                  US
9  g ! , (       d  f       N= f! [         a@  n[        (       d  e [        R                   " XTS9  [        R"                  " U5         S nAg S nAff = f)Nz6Only strings and byte arrays can be saved in the cacheT)parentsrv   rq   z.tmpwwbr   )encoding)target)srcdst)r%  r   r&  r   parentmkdirr   getpid	threading	get_identr   r>  renameFileExistsError_IS_WINDOWSshutilcopy2remove)	path_r8  r   r=  r   tmp_path
write_modefe_file_exists	            rf   r   r     s    #u  @?@  ;D$6{{qQy/B/B/D.ETJJH"7C00dJ	z|G	NRS	 
O
t$ 
O	N  { 	,
		(s$   <C'C8 '
C58
E6D==Ec                  .    \ rS rSr% SrS\S'   S\S'   Srg)	TensorMetadataAndValuesi  z_
TensorMetadata plus the elements as a list of raw values.
Used for hashing inlined constants.
rO   tensor_metadata	List[Any]valuesrb   Nr   r   r   r   __doc____annotations__r   rb   rh   rf   r]  r]    s    
 $#rh   r]  c                    U $ ra   rb   xs    rf   _identrg    s    Hrh   c                j    [        U 5      n[        U S5      (       d  [        R                  " USSS9nU$ )zg
Extracts the tensor metadata and removes fields of the TensorMetadata
that are not needed for caching
_is_inductor_staticr   N)storage_offsetstorage_bytes)rM   hasattrdataclassesrz   )tmetas     rf   %extract_tensor_metadata_for_cache_keyrp    s5    
 #1%D1+,,""4NKrh   c                     ^  \ rS rSrSr  S       SU 4S jjjr    SS jr    SS jrSS jrSS jr	    SS jr
SS	 jrSS
 jrSS jrSrU =r$ )FxGraphCachePickleri  a&  
Custom pickler to customize the pickling of some objects (Tensors), only for the
purpose of computing a hash for keying into the FxGraphCache. Tensors contain
objects that don't pickle and/or vary between runs, and we want to capture the
data that allow us to compute a stable, but safe hash.
c                  > [         R                  " 5       U l        [        TU ]  U R                  5        X l        [        R                  R                  5       U l        U R                  R                  [        [        R                  " U R                  5      [        R                  [        R                  " U R                   5      [        R"                  [        R                  " U R$                  5      [        R&                  R(                  R*                  R,                  [        R                  " U R.                  5      05        U(       a7  [        R                  " U R0                  5      U R                  UR2                  '   SU l        g)a
  
Create an FX graph pickler. If include_non_inlined=True, then pickling will
include the _values_ for all Tensors. (Note that any tensors are constants
attached as attributes to the GraphModule). Otherwise, pickling will include
only the metadata for these tensors.
TN)ioBytesIO_streamsuperr   include_non_inlinedcopyregdispatch_tabler   updaterN   r   r
   _reduce_fake_tensorrw   r   _reduce_tensorr   _reduce_symintfxexperimental_backward_stateBackwardState_reduce_unsupported_reduce_graph_module	__class__fast)r   gmrx  has_user_defined_triton_kernelsr  s       rf   r   FxGraphCachePickler.__init__	  s	    zz|&#6 %4499;""I--d.F.FGi//0C0CDi//0C0CD%%55CCYEVEV,,F			
 +090A0A))1D- 	rh   c                *    [        U5      n[        U44$ )z'
Custom reducer to pickle FakeTensors.
)rp  rg  )r   rn  metadatas      rf   r|  'FxGraphCachePickler._reduce_fake_tensor/  s     9;$$rh   c                r   SSK Jn  UR                  (       a  [        S5      e[	        U5      nUR
                  " U5      (       d  U R                  (       aZ  [        5       nUR                  5       n[        5       U-
  nUS:  a  [        R                  " SUS S35        [        [        X55      44$ [        U44$ )zz
Custom reducer to pickle Tensors.  If we see tensors, we know they're constants
stored as attributes on the GraphModule.
r,   rT   zmkldnn tensors unpickleableg      ?z0FX graph cache copying of a large constant took z.1zs. Please file an issue.)graphrU   	is_mkldnnBypassFxGraphCacherp  can_inline_constantrx  r   tolistwarningswarnrg  r]  )r   rn  rU   r  startr`  elapseds          rf   r}  "FxGraphCachePickler._reduce_tensor8  s     	);; %%BCC 9;,,Q//43K3K FEXXZFfunG}Fwrl S, ,
 4XFHII $$rh   c                &    [         [        U5      44$ )z#
Custom reducer to pickle SymInts.
)rg  r   r   ss     rf   r~  "FxGraphCachePickler._reduce_symint\  s     Q	""rh   c                    [        S5      e)zc
Custom reducer to handle any objects that we don't support and therefore
raise to bypass caching.
zReduce unsupported)r  r  s     rf   r  'FxGraphCachePickler._reduce_unsupportede  s    
 !!566rh   c                    UR                  5       u  nu  p4US   n[        R                  " SSU5      n[        R                  " SSU5      nXSS'   X#U44$ )a  
Custom reducer for graph module to handle irrelevant data for user
defined triton kernels
Essentially what we are doing here is a huge hack where user defined
triton kernel contain a dynamo time side table and the arguments to the
call_function are indicies into this side table. These arguments are not
for hashing purposes since we included the source code into the cache
key and the numbers are prone to give false negatives due to ordering.
_codezkernel_idx = \d+rr   zconstant_args_idx = \d+)
__reduce__resub)r   r  fnr!  importsr'  s         rf   r  (FxGraphCachePickler._reduce_graph_modulel  sY     !mmoOTG}vv)2t4vv0"d;W'?""rh   c                    U R                  U5        U R                  R                  5       U R                  R                  S5        U R                  R	                  S5        $ ! [
        [        4 a%  n[        R                  SSS9  [        S5      UeSnAff = f! U R                  R                  S5        U R                  R	                  S5        f = f)z,
Pickle an object and return a byte string.
r   zFailed to pickle cache keyTexc_infoN)
dumprv  getvalueseektruncate	TypeErrorAttributeErrorlogwarningr  )r   objr  s      rf   r   FxGraphCachePickler.dumps  s    
	%IIcN<<((* LLa LL!!!$ >* 	JKK4tKD$%ABI	J LLa LL!!!$s#   *A# #B3 BBB 8Cc                :    U R                  U5      n[        U5      $ )z5
Serialize an object and return a hash of the bytes.
)r   r"  )r   r  serialized_datas      rf   r:  FxGraphCachePickler.get_hash  s     **S/?++rh   c                b  ^  SU 4S jjn/ n[        U5      R                  5        GH  u  pE[        U[        5      (       aU  [	        [        U5      5       H;  nT R                  XV   5      nUR                  SU SU SU SU" XV   5       35        M=     Mp  [        U[        5      (       aO  UR                  5        H9  u  pT R                  U	5      nUR                  SU SU SU SU" U	5       35        M;     M  T R                  U5      nUR                  SU SU SU" U5       35        GM     U$ )z
Get a printable string describing in more detail all the attributes
comprising an object. Useful for debugging when one graph hashes
to a different value than another.
c                2  > [        U [        R                  5      (       a  [        [	        U 5      5      $ [        U [
        5      (       a  g[        U 5      TR                  ;   a*  [        TR                  [        U 5         " U 5      S   5      $ [        U 5      $ )Nz<bytes>r,   )r%  rw   r   r   rp  r&  typerz  )r  r   s    rf   get_str0FxGraphCachePickler.debug_lines.<locals>.get_str  st    #u||,,@EFFC'' cd1114..tCy9#>qABB3xrh   [z] z]: z: r  r   r   r   )	varsr  r%  listrangelenr:  appenddict)
r   inpr  linesattrr  iihkvs
   `         rf   debug_linesFxGraphCachePickler.debug_lines  s   		  c*ID#t$$C/Bcg.ALL1QCr$qC8H7I!JK * C&&IIKDAa(ALL1QCr$q3wqzl!CD ( MM#&q2dV2gcl^<= + rh   )rv  rz  r  rx  )TF)r  torch.fx.GraphModulerx  r  r  r  r   r   )rn  r   r   z.Tuple[Callable[[T], T], Tuple[TensorMetadata]])rn  r   r   zNTuple[Callable[[T], T], Tuple[Union[TensorMetadata, TensorMetadataAndValues]]])r  r   r   z#Tuple[Callable[[T], T], Tuple[str]])r  r   r   r   )r  r  r   z&Tuple[Any, Tuple[Dict[str, Any], str]])r  r   r   r&  r  )r  FxGraphHashDetailsr   	List[str])r   r   r   r   rb  r   r|  r}  r~  r  r  r   r:  r  r   __classcell__)r  s   @rf   rr  rr    s     %)05	$ $ "$ *.	$
 
$ $L%%	7%"%"% 
X"%H#7#&#	/#&% , rh   rr  c                   [        [        R                  " X5      S S9 H  nUR                  R	                  UR
                  S 5      nUc   eUR                  nUc   e[        US5       nUR                  UR
                  R                  S5      5        UR                  UR                  5       5        S S S 5        UR                  (       d  M  [        UR                  UR
                   S3U5        M     g ! , (       d  f       NJ= f)Nc                    U R                   $ ra   )r   re  s    rf   <lambda>!build_code_hash.<locals>.<lambda>  s    rh   r   rbr   rq   )sortedpkgutiliter_modulesmodule_finder	find_specr   originr   r{  r   readispkgbuild_code_hashsubmodule_search_locations)rootsprefixhasherlibspecmodulerZ  s          rf   r  r    s     g**59?OP  **388T:!!!&$1MM$))**734MM!&&(#   999D;;		{!_fU Q
  s   +A
C99
D	c                    [        SSS9   [        R                  " 5       (       d  S
S jn U " [        5      sSSS5        $ SSKJn  UR                  S5      R                  5       R                  S	5      sSSS5        $ ! , (       d  f       g= f)zK
Compute a key that contains relevant information about torch source files
inductor_codecache_torch_keyTlog_pt2_compile_eventc                `   Sn[         R                  R                  [        5      nU Vs/ s H"  n[         R                  R	                  X#5      PM$     nn[
        R                  " 5       nUR                  [        R                  R                  S5      5        [        U /SU5        U H]  n[         R                  R                  U5      (       d  M)  [        US5       nUR                  UR                  5       5        S S S 5        M_     UR                  5       $ s  snf ! , (       d  f       M  = f)N)z"codegen/aoti_runtime/interface.cppz'codegen/aoti_runtime/implementation.cppcodegen/cpp_prefix.h	script.ldr   rr   r  )r   r   dirname__file__r   r   r   r{  rw   __version__r   r  r  r   r  r  )rootextra_filesinductor_rootrf  r  r   rZ  s          rf   get_code_hash torch_key.<locals>.get_code_hash  s     !# 9GRS{!rww||M={S )e//66w?@F3'Dww~~d++!$-"MM!&&(3 .- ( }}& T .-s   )D D
D-	Nr   parutilztorch/src_hash.txtascii)r  r   r   r&  )	r   r"   	is_fbcode_TORCH_PATHlibfb.pyr  get_file_contentsrstripr   )r  r  s     rf   	torch_keyr    sk    
 
4D	Q!!'* !-1 
R	Q4 	%(()=>EEGNNwW9 
R	Q	Qs   +A= 3A==
Bc                 H    [         R                  R                  [        5      $ ra   )r   r   r  r  rb   rh   rf   get_inductor_rootr    s    77??8$$rh   c                  $    \ rS rSr% SrS\S'   Srg)OrderedSetHolderi  zV
See FxGraphHashDetails. Holds a sorted list to support stable hashing
of set kwargs.
r_  r  rb   Nra  rb   rh   rf   r  r    s    
 rh   r  c                      \ rS rSrSrSrg)r  i  zA
Exception to indicate that the FxGraphCache should be bypassed.
rb   N)r   r   r   r   rb  r   rb   rh   rf   r  r    s    rh   r  c                  N    \ rS rSrSrS/r          SS jr    S	S jrSrg)
r  i   zn
Object to capture all the details for a compiled FX graph relevant to computing
a safe and stable cache key.
graph_idc           	        Xl         X l        [        R                  U l        0 U l        [        UR                  5       5       HY  u  pVXPR                  ;  d  M  [        U5      [        L a#  [        [        U5      5      U R                  U'   MK  X`R                  U'   M[     SSKJnJnJn	  SSKJn
  / U l        UGb  UR%                  5        GH   n['        U[(        R*                  R,                  5      (       d  M/  [.        R0                  " UR2                  R5                  SUS9UR2                  R5                  SU	S95       H  nSSKJn  UR;                  UR<                  S   5      n['        X5      (       a  UR>                  nU
" U5      nURA                  UR<                  S   5      nU R"                  RC                  UU45        M     GM     X@l"        [(        RF                  " 5       [(        RH                  " 5       [(        RJ                  RL                  RN                  4U l(        [(        RR                  RT                  RV                  RX                  [(        RR                  RT                  RV                  RZ                  [(        RR                  RT                  RV                  R\                  4U l/        [a        5       U l1        [d        Rg                  5       U l4        [j        Rl                  " 5       U l7        U Rq                  [j        Rr                  5      U l9        U Rq                  [j        Rt                  5      U l:        g )	Nr   )kernel_side_table triton_kernel_wrapper_functionaltriton_kernel_wrapper_mutation)9user_defined_triton_kernel_transitive_closure_source_codecall_function)r   rI  )	Autotuner
kernel_idxconstant_args_idx);r  example_inputscconfigcache_key_tag	fx_kwargsr  r  EXCLUDED_KWARGSr  setr  *torch._higher_order_ops.triton_kernel_wrapr  r  r  torch._inductor.codegen.wrapperr	  user_defined_triton_sourcemodulesr%  rw   r  GraphModule	itertoolschainr  
find_nodestriton.runtime.autotunerr  
get_kernelre   r  get_constant_argsr  inputs_to_check$are_deterministic_algorithms_enabled-is_deterministic_algorithms_warn_only_enabledutilsdeterministicfill_uninitialized_memory!deterministic_algorithms_settingsbackendsry   matmul
allow_tf32&allow_fp16_reduced_precision_reduction&allow_bf16_reduced_precision_reductioncuda_matmul_settingsr  torch_versionr   r   system_infor"   save_config_portableinductor_config_get_custom_pass_detailpost_grad_custom_pre_passpost_grad_custom_post_pass)r   r  r  r  r  r  r  r  r  r  r	  r  noder  kernelkernel_sourceconstant_argss                    rf   r   FxGraphHashDetails.__init__	  s    ,$22
 -/9??,-DA,,,7c> )9(CDNN1%()NN1% .	
 	

	
 68'>**,!&%((*>*>??%OOLL++*3S ,  LL++*3Q , 	D C.99$++l:STF!&44!' R" "
 %6$G$G$78%M 33::&6- '<  / 668??AKK%%??2
. NN&&11NN&&MMNN&&MM%
! '[$//1%::<)-)E)E,,*
& +/*F*F--+
'rh   c                `    U(       d  g [        U[        5      (       d   eUR                  5       $ ra   )r%  r(   uuid)r   custom_passs     rf   r0  *FxGraphHashDetails._get_custom_pass_detailh  s,     +7777!!rh   )r  r+  r%  r  r  r  r/  r  r2  r1  r-  r,  r  N)
r  r  r  Sequence[InputType]r  r3   r  Sequence[int]r   r   )r:  r)   r   zOptional[Any])	r   r   r   r   rb  r  r   r0  r   rb   rh   rf   r  r     sa     "lO]
 ]
 ,]
 $	]

 ']
 
]
~"."	"rh   r  c                *   [        U 5      (       + n[        XX#5      n[        UR                  5      S:g  n[	        XU5      nSUR                  U5      -   nUR                  U5      n	SR                  U	5      n
[        R                  SU SU
 35        X4$ )z5
Generate a unique hash of the FX graph for caching.
r   rZ  
z$FX graph cache hash details for key z:
)
r*   r  r  r  rr  r:  r  r   r  debug)r  r  r  r  rx  detailsr  picklerr   r  	debug_strs              rf   compiled_fx_graph_hashrD  q  s     033 YPG&)'*L*L&MQR&R#!
!@G
   )
)C%%g.K		+&III4SEYKHIrh   c                   [         R                  R                  5       (       a#  [         R                  R                  5       (       d  g[	        U S-  5      n[
        R                  " 5       (       aI  [         R                  R                  S5      n[        R                  SUU5        U[	        X-  S-  5      -  n[        R                  SU5        [        R                  R                  [        US95        U$ )zq
Ephemerally increases the NCCL timeout when compiling for a distributed job
Returns amount of seconds increased
r   g    eAz>pytorch/remote_cache:ephemeral_timeout_fudge_factor_percentagezNEphemeral NCCL timeout increase fudge factor %d and original increase value %dd   zIncreasing NCCL timeout by %d)seconds)rw   distributedis_availableis_initializedintr"   r  _utils_internaljustknobs_getval_intr  infodistdistributed_c10d"_add_ephemeral_timeout_for_all_pgsr	   )time_saved_nsincreased_timeout_secfudge_factors      rf   .add_ephemeral_timeout_increase_for_distributedrU    s    
 ))++53D3D3S3S3U3U 45,,AAL
 	\!	

 	%:%IC%O!PPHH,.CD<</0 ! rh   c                  \   \ rS rSrSr\SS j5       r\SS j5       r\SS j5       r\SS j5       r	\            SS j5       r
\            SS j5       r\SS	 j5       r\            SS
 j5       r\SS j5       r\                SS j5       r\SS j5       rSrg)FxGraphCachei  a  
Supports caching and reusing compiled Fx graphs.

The overall strategy is as follows:
- This cache stores entries on disk. When saving an entry, we can't
  serialize callables (that could be C++, Triton, etc.), so we serialize
  their own disk cache location. We then recreate the compiled artifact
  after fetching from disk.
- For indexing the cache, we gather the fields relevant to identifying an
  FxGraph (the graph module, graph inputs, system settings etc.) into an
  FxGraphCacheDetails object, pickle it, and compute a hash for the key.
  See FxGraphCachePickler.
- Among the metadata we store, we also include a guards expression that's
  appropriate for validating any symbols for Tensor arguments that have
  symbolic bounds. On cache lookup then, we evaluate those guards in the
  current context to validate that a cached entry can be served.
- A given graph could have multiple compiled versions, corresponding to
  different sets of guards. Therefore, we store cache entries in the form:
      <temp dir>/<fx graph hash>/<serialized metatdata>
- On lookup, we compute the key from the graph details, iterate over all
  leaf files in the corresponding subdirectory, deserialize the entry, and
  evaluate its guards expression. If the evaluation succeeds, we have a
  cache hit. If it fails, we compile the graph and store a new entry.
- Finally, on a cache hit, we need to make sure any guards that would
  have been created during compilation are added to the current context.
c                 R    [         R                  R                  [        5       S5      $ )zC
Get the toplevel temporary directory for storing compiled graphs.
fxgraph)r   r   r   rF   rb   rh   rf   _get_tmp_dirFxGraphCache._get_tmp_dir  s    
 ww||IK33rh   c                n    [         R                  R                  [        R	                  5       U SS U 5      $ )z1
Return the disk location for a given cache key.
r,   r,  )r   r   r   rW  rZ  r  s    rf   _get_tmp_dir_for_key!FxGraphCache._get_tmp_dir_for_key  s*    
 ww||L557Qq3GGrh   c                    U  Vs/ s H8  n[        U[        R                  5      (       d  M$  [        U5      (       d  M6  UPM:     sn$ s  snf )zw
Get the backed SymInt objects from the input list. Note that we can never
have guards that depend on unbacked symint.
)r%  rw   r   rP   )r   r  s     rf   _filter_backed_symints#FxGraphCache._filter_backed_symints  s1     "Q6aZ5<<%@Xa[6QQQs   #AAAc                     [         R                  R                  R                  5       n U (       d  gU R                  R
                  $ )z7
Helper to get the shape env from the tracing context.
N)rw   _guardsTracingContexttry_get	fake_mode	shape_env)ctxs    rf   _get_shape_envFxGraphCache._get_shape_env  s2    
 mm**224}}&&&rh   c                  ^ ^^^^ [         R                  5       nUc   e[         R                  U5      nU Vs/ s H  n[        U5      PM     nnSU UU4S jjn	Sn
[	        5       nU	" 5        Hk  nUR
                  (       d  Un
  OW[        UR                  UR
                  U5      5      n[        R                  ST UR
                  UU5        U(       d  Mi  Un
  O   U
c  SU4$ U
R                  =n(       a  [        R                  " U5      nU=nby  [        U5      US'   [        5       nSUR                  5       ;   a  UR!                  SUR"                  S9  [%        UR"                  5      S:  a  ['        5       R)                  SS	5         U
R+                  U5      m[.        R0                  " 5       nU
R2                  m[4        R6                  " UTS
9  U
R
                  (       aM  [        UR                  U
R
                  U5      5      nUSL d   e[        R                  ST UR8                  5        [:        R<                  R?                  U
R@                  5        [B        S==   U
RD                  -  ss'   [F        R                  ST5        [F        R                  ST5        [I        SU4S jU4S jS9  X4$ s  snf ! [,         a    SU4s $ f = f)z
Lookup a compiled graph in the cache by key. On a hit, return the
deserialized CompiledFxGraph object. On a miss, return None.
Nc               3  2  >#    T(       a  [         R                  T5      n [        R                  R	                  U 5      (       aq  [        [        R                  " U 5      5       HN  n [        [        R                  R                  X5      S5       n[        R                  " U5      v   S S S 5        MP     T(       a~   TR                  T5      =nbh  [        U[         5      (       d   eUS   n[        U["        [$        45      (       d   e[&        R(                  " U5      n[        R*                  " U5      v   g g g ! , (       d  f       M  = f! [         a    [        R                  SSS9   GM  f = f! [         a    [        R                  SSS9   g f = f7f)Nr  z,fx graph cache unable to load compiled graphTr  r!  )rW  r]  r   r   r  r  listdirr   r   pickler   	Exceptionr  r  r   r%  r  r   r&  r  	b64decodeloads)	r1  r   rZ  
cache_datar!  r8  r   localremote_caches	         rf   iterate_over_candidates;FxGraphCache._lookup_graph.<locals>.iterate_over_candidates  sL    %::3?77>>&)) &rzz&'9 :!%bggll6&@$!G1&,kk!n 4 "H!G !; 
&2&6&6s&;;
H)*d;;;;)&1)$e===="("2"24"8$ll733 I  "H!G( KK N)- (  ! KKFQU   ss   A#F')ED;)E1F=A;E3 8F;
E
	EF
EE0+F/E00F3FFFFzEfx graph cache key %s evaluating guards [%s] with values %s => hit=%striton_bundler_metainductor_compile)cached_kernel_namesr   num_triton_bundlesr,   r'  Tz*fx graph cache key %s post-load guards: %sinductorOutput code written to: %szOutput code: 
%sinductor_output_codec                    > ST 0$ )Nfilenamerb   )artifact_paths   rf   r  ,FxGraphCache._lookup_graph.<locals>.<lambda>a  s
    Z/rh   c                    > T $ ra   rb   r{  s   rf   r  r  b  s    trh   
payload_fn)r   z&Generator[CompiledFxGraph, None, None])%rW  ri  r`  rQ   r  guards_exprr  evaluate_guards_expressionr  r@  _triton_bundler0   read_and_emitr   r    	get_stackadd_event_datary  r  r!   	incrementafter_deserializationOSErrorr.   inductor_meta_from_configsource_coder/   begin_compileguardsr$   CachedMetricsHelperapply_deltasmetrics_deltasr   counter_deltasoutput_code_logrL   )r   r  rs  rt  	constantsrg  symintsr  hintsru  r  
cache_info	candidater   bundlerw  ro  loggerinductor_metacheckr  r'  s   ` ``                @@rf   _lookup_graphFxGraphCache._lookup_graph  s    !//1	$$$55nE&-.g!g.	 	: %)V
02I((! 44Y5J5JERC IIW%% s!- 30 =##)))6)"/"="=f"E++847I
0124%)9)9);;))*@X@X *  t//014')334H!L	$!77	BM '@@B  **=tD 44U5F5FPE D= =II<c9CSCS 	##001E1EF 4 44:MJ148"/#	

   [ /X  	$ ##	$s   J5
J: :K
Kc                   SSK Jn  [        X5      (       d   S[        U5       S35       e[	        U5      nUR                  5         [        R                  5       nUc   e[        R                  U5      nUR                  U5      n	UR                  XS9Ul         [        R                  " U5      n
 U(       a  [        R%                  U 5      n[&        R(                  R+                  U5      (       d  [&        R,                  " USS9  [&        R(                  R/                  U[1        U
5      5      n[3        XSS9  U(       a[  [5        UR6                  =(       d    SS-  5      n[8        R:                  " U
5      R=                  S5      US.nUR?                  X5        gg! [         a+    [        R!                  SSS	9  ["        S
   S==   S-  ss'    gf = f! [         a+    [        R!                  SSS	9  ["        S
   S==   S-  ss'    gf = f)z-
Store a serialized CompiledFxGraph on disk.
r,   )r4   zserialization for z NYIN)placeholdersr  z1fx graph cache unable to serialize compiled graphTr  r|  fxgraph_cache_pickle_errorru   r   r   g    .Ar  )r!  time_taken_msz!fx graph unable to write to cachefxgraph_cache_write_error) 
compile_fxr4   r%  r  r   prepare_for_serializationrW  ri  r`  get_pruned_guardsproduce_guards_expressionr  rn  r   ro  r  r  r   r]  r   r   r  r   r   r"  r   rK  _time_taken_nsr  	b64encoder  put)r   compiled_graphr  rs  rt  r4   disk_compiled_graphrg  r  r  r8  r1  r   r  rr  s                  rf   _save_graphFxGraphCache._save_graphf  s    	0
 
 	;^ 45T:	; 
 #>2557 !//1	$$$55nE,,W5*3*M*M  +N +
'	ll#67G	C%::3?ww~~f--KK6
 ww||FK,@ATd; #%8%G%G%L1QT$T U",,W5<<WE%2*
   1 '  	KKCd   Z !=>!C>	4  	CKK;dKKZ !<=B=	Cs%   F 1C)G 2GG2H	H	c                   [         R                  [         R                  4 HA  nU(       d  M  [        U[        5      (       a  UR                  5       (       a  M8  [        S5      e   [        U 5      (       a/  [        R                  R                  S5      (       d  [        S5      e[         R                  R                  (       a  [        S5      eSSKJn  UR                  (       a  [         R#                  S5        [        e[$        R'                  5       c   [         R#                  S	5        [        S
5      eU R)                  5        GH  n[        U[        R*                  R,                  5      (       d  M/  UR.                  R0                   H  n[        UR2                  [        R4                  R6                  5      (       aE  UR2                  R9                  5       (       d&  [        SUR2                  R;                  5        35      eUR<                  S:X  d  M  [        [?        XR2                  5      [        R@                  RB                  5      (       d  M  [        S5      e   GM      g)z{
Check some conditions that would preclude caching and raise BypassFxGraphCache
to bypass in case caching is not possible.
z!Unsupported post grad custom passz,pytorch/inductor:allow_freezing_with_cachingz$Skipping graph with frozen constantszORuntime constant folding can introduce constants that aren't static across runsr   )CompilerBisectorz$dont cache graph when bisect enabledNzfx graph cache no shape envzNo shape envz!Can't cache HigherOrderOperator: getattrzCan't cache torchbind objects)"r"   r1  r2  r%  r(   r9  r  r*   rw   rL  justknobs_checkaot_inductoruse_runtime_constant_folding!torch._inductor.compiler_bisectorr  bisection_enabledr  r@  rW  ri  r  r  r  r  nodesrI  _opsHigherOrderOperator	cacheabler   r   r  _CScriptObject)r  pr  r  r3  s        rf   _check_can_cacheFxGraphCache._check_can_cache  s    22F4U4UVAq*Q88()LMM W
 R  )>)>)N)N:*
 *
 %%KLL;;$% 
 	G--II<=$$ &&(0II34$^44 jjlFfehh&:&:;;**t{{EJJ,J,JKK KK1133,;DKK<L<L<N;OP  77i'JB,ehh.C.C- - --LMM + #rh   c                D    [         R                  U 5        [        XX#5      u  pVXV40 4$ ! [         ak  n[        S   S==   S-  ss'   [
        R                  SU5        U(       a  [        S[        U5      5        S[        U5      [        5       S.nSU4s SnA$ SnAff = f)	a  
Checks that the inductor input is cacheable, then computes
and returns the cache key for the input.
Returns (key_info, cache_info) where:
- key_info is (hash_key, debug_lines), and
- cache_info will contain debug info in the event of BypassFxGraphCache.

NB: It is possible to have this function return a union instead. But
I personally believe it is more annoying/difficult to read in that format.
r|  fxgraph_cache_bypassr,   z%Bypassing FX Graph Cache because '%s'bypass_fx_graphbypass)cache_statecache_bypass_reasoncache_event_timeN)
rW  r  rD  r  r   r  rN  r+   r   r   )	r  r  r  r  remoter   r  r  r  s	            rf   prepare_keyFxGraphCache.prepare_key  s    $	$))"-5I C !2%% " 
	$Z !78A=8HH<a@ !2CF;''*1v$+IJ
 ##
	$s   #* 
BA BBBc                 H    Sn [        U [        R                  " 5       SS5      $ )z;
Attempts to load the remote cache, returns None on error.
zfx-graph-v1FbRemoteFxGraphCacheRemoteFxGraphCache)r-   r"   r  )cache_ids    rf   get_remote_cacheFxGraphCache.get_remote_cache   s+    
 !" 	
 	
rh   c                   [         R                  XX4U5      u  px0 UEU U[        5       S.EnUb  [        R	                  SU 5        [
        S   S==   S-  ss'   SUS'   U(       a4  [        5       R                  SS5        [        5       R                  S	U 5        UR                  =n	b6  XS
'   [        5       R                  SU	S-  5        [        U	5      =n
S:w  a  XS'   Xx4$ U(       a4  [        5       R                  SS5        [        5       R                  SU 5        [        R	                  SU 5        [
        S   S==   S-  ss'   SUS'   Xx4$ )z
Lookup the graph with the given key, and return results and metadata.
Doesn't do any logging on its own, because AOTAutograd handles a cache miss
differently from FXGraphCache.
)r   
componentsr  zfx graph cache hit for key %sr|  fxgraph_cache_hitr,   r   r  "inductor_fx_remote_cache_hit_count!inductor_fx_remote_cache_hit_keysrR   distributed_ephemeral_timeout_usi  r   ephemeral_timeout_increase#inductor_fx_remote_cache_miss_count"inductor_fx_remote_cache_miss_keyszfx graph cache miss for key %sfxgraph_cache_missmiss)rW  r  r   r  rN  r   r!   r  
add_to_setr  rU  )r   r  r  rs  rt  is_backwardr  r  r  rR  ephemeral_increases              rf   load_with_keyFxGraphCache.load_with_key  s}    &2%?%?i&
"

% '		

 %HH4c:Z !45:5(-J}%#%//0TVWX#%007 "0!>!>>K.;?+#%//68M +Y%+ & 	
 @R;< )) #%//91 $%008# HH5s;Z !56!;6(.J}%))rh   c                 v     [         R                  " [        R                  5       5        g! [         a     gf = f)z
Clear out the on-disk cache.
N)rT  rmtreerW  rZ  FileNotFoundErrorrb   rh   rf   clearFxGraphCache.clearJ  s.    
	MM,3356  		s   (+ 
88rb   Nr   r   )r   r   r   r   )r   r<  r   zList[torch.SymInt])r   zOptional[ShapeEnv])r   r   r  r<  rs  r  rt  !Optional[RemoteCache[JsonDataTy]]r  r5   r   0Tuple[Optional[CompiledFxGraph], Dict[str, Any]])r   r   r  r6   r  r<  rs  r  rt  r  r   r   )r  r  r   r   )r  r  r  r<  r  r3   r  r=  r  r  r   z6Tuple[Optional[Tuple[str, List[str]]], Dict[str, Any]])r   r  )r   r   r  r  r  r<  rs  r  rt  r  r  r  r  r5   r   r  r   )r   r   r   r   rb  r   rZ  r]  r`  ri  r  r  r  r  r  r  r  r   rb   rh   rf   rW  rW    s   : 4 4 H H R R ' ' |!|!+|! |! 8	|!
 ,|! 
:|! |!| =C=C"=C ,=C 	=C
 8=C 
=C =C~ 2N 2Nh "& "&+"& $"& '	"&
 "& 
@"& "&H 

 

 :*:*:* ,:* 	:*
 8:* :* ,:* 
::* :*x  rh   rW  c                "   [        SSS9   [        R                  " U 5      n [        R                  " U5         S S S 5        g ! [        R
                   a&  n[        R                  " XR                  5      UeS nAff = f! , (       d  f       g = f)Nrun_command_and_checkTr  )	r   shlexsplit
subprocess
check_callCalledProcessErrorr#   CppCompileErroroutput)cmd_cmdr  s      rf   r  r  U  sp    	-T	Jkk$	<!!#& 
K	J ,, 	<%%c884!;	<	 
K	Js(   B AA=!A88A==B  
Bc                    U R                  S5      (       a  [        R                  R                  U 5      $ U R                  S5      (       a  [        R                  R                  U 5      $ U S4$ )zDReturns the path where the AOT Inductor compiled kernels are stored..soz.pt2rr   )endswithr   r   r  )r   s    rf   split_aot_inductor_output_pathr  ^  sQ     }}Uww}}T""	v		ww}}T""Rxrh   c                      \ rS rSr% 0 rS\S'   \" \R                  5      r\	S	S j5       r
\	S
S j5       r\	SS j5       rSrg)CudaKernelParamCacheii  zDict[str, Dict[str, str]]r   c                    [        UUU[        [        R                  R                  5      S   S9u  pVXb[        5       '   X R                  U'   g )Nr   )r9  r0  )r>  r  r"   r  output_pathr   r   )clsr   paramsr4  bin_typert   r   s          rf   r  CudaKernelParamCache.setn  sO    8##//	
 59.01		#rh   c                :    U R                   R                  US 5      $ ra   )r   r   )r  r   s     rf   r   CudaKernelParamCache.get|  s    yy}}S$''rh   c                6    U R                   R                  5       $ ra   )r   r   )r  s    rf   get_keysCudaKernelParamCache.get_keys  s    yy~~rh   rb   N)
r   r   r   zDict[str, str]r4  r   r  r   r   r   )r   r   r   zOptional[Dict[str, str]])r   zKeysView[str])r   r   r   r   r   rc  r   r  cache_clearclassmethodr  r   r  r   rb   rh   rf   r  r  i  sU    ')E$)u{{+K    ( (    rh   r  c                  @    \ rS rSr\            SS j5       rSrg)AotCodeCompileri  c           
       ^^^^6^7^8^9^:^; Un[         R                  S:X  a  [        S5      e[        5         [	        5       n[        SS[        UTTR                  S9S9n[        UR                  5       5      n	[        R                  " 5       =(       a    TS:H  =(       a    TR                  m9T9m;[        [        R                  R                  5      u  n
n[        TSU	U
S	9u  nm:[        R                  R                   (       a  UR#                  T:5        [$        R'                  S
T:5        [)        SU:4S jU4S jS9  [*        R,                  R/                  [*        R,                  R1                  T:5      S   U5      m8S:U8UU9UU;4S jjnSSKJn  [7        5       nU" [*        R,                  R/                  XS-   5      [8        S9nU   U(       a{  [*        R,                  R;                  T:5      S   S-   n[=        US5       nUR                  U5        SSS5        [        R                  R                   (       a  UR#                  U5        [        R                  R>                  nTUS'   [*        R,                  R;                  T:5      S   S-   n[        R                  R>                  RA                  5        H7  u  nn[C        U[D        5      (       a  [C        U[D        5      (       a  M2   S5       e   [=        US5       nUR                  [F        RH                  " [        R                  R>                  5      5        SSS5        [        R                  R                   (       a  UR#                  U5        U(       a  [        R                  R                  O$[*        R,                  R;                  T:5      S   S-   n[*        R,                  R;                  T:5      S   S-   n[K        U4S jTRL                  RO                  5        5       5      m7S;S jm6[        R                  RP                  (       a6  SR/                  U6U7U4S jTRL                  RO                  5        5       5      nOSn[S        U5      n[        R                  " 5       (       + =(       a    US :  n[        R                  RT                  (       a  S!n[W        T:5      u  nn[        UTTR                  S!T;US"9n[        UT:UUS#9nUR                  5       n URY                  5       n[Z        R]                  S$U 5        [        R                  R^                  (       dj  T9(       aX  [*        R,                  R;                  T:5      S   S-   n[a        T:UU R1                  5       5        [*        Rb                  " US%5        O[e        U 5        [        R                  R^                  (       aG  [*        R,                  R;                  T:5      S   S&-   n!URg                  U!5        UR#                  U!5        U(       d  Un"Sn#O{[i        [j        [l        Rn                  " S[l        Rp                  " [l        Rr                  5      Rt                  S'5      Rw                  5       5      n#[x        Rz                  " S(US)-   U#5      n"U" U"[         R                  5      n$[l        R|                  R~                  (       a
  [        5       O	[        5       n%U%R                  R                  5        V&s/ s H1  n&U&R                  R                  S5      (       d  M%  U&R                  PM3     n'n&S*R/                  U'5      n'[W        U5      u  n(n)[        UTTR                  T;S+9n*[        U(UU$U'/U)U*S#9n+U+R                  5       n,U+RY                  5       n[Z        R]                  S,U,5        [=        T:S-5       nUR                  S.5        UR                  S/U  S.35        UR                  S0U, S.35        SSS5        [        R                  R^                  (       a  [*        R,                  R;                  T:5      S   S1-   n-U*Rg                  U-5        UR#                  U-5        U(       a  [*        R,                  R;                  T:5      S   S2-   n.[=        U.S35       n/U/R                  U5        U/R                  [x        Rz                  " S4U#5      5        SSS5        UR#                  U.5        UR#                  U$5        UR#                  U'5        GOT9(       a{  U(       a  [        R                  R                  O$[*        R,                  R;                  T:5      S   S-   n[a        UU$/UU,R1                  5       5        [*        Rb                  " US55        O[e        U,5        UU$[*        R,                  R;                  U$5      S   S6-   4 H  n0[*        R                  " U05        M     U(       a  SSKFn1U1R                  5       n2[u        S7U25      n3[=        US85       n4U4R                  5       n5U4R                  S9U3U5U3-  -
  -  5        U4R                  U5        U4R                  [x        Rz                  " S4U#5      5        SSS5        [        R                  R                   (       a  UR#                  U5        SSS5        [        R                  R                   (       a  U$ W$ ! , (       d  f       GN= f! , (       d  f       GN= fs  sn&f ! , (       d  f       GN= f! , (       d  f       GNI= f! , (       d  f       N= f! , (       d  f       N= f)<zk
Returns the .so path, or returns a list of files that were generated if
config.aot_inductor.package=True.
rY   z.AotCodeCompiler not yet supported for inductoroi)vec_isadevice_typeaot_moder   sourcesBuildOptionro   cpp)r(  r0  r}  
graph_dumpc                    > SST S.$ )Ninductor_aot_coder  )r   r  r  rb   )
input_paths   rf   r  )AotCodeCompiler.compile.<locals>.<lambda>  s    +&rh   c                    > T $ ra   rb   )r  s   rf   r  r    s    {rh   r  r   c                   > US:X  aW  TR                   [        TR                  R                  5       5      -  (       a  [	        U 5      S:  a  [        S5      eSnOSnSnOUS:X  a  SnS	nO[        S
U 35      e[	        U 5      S:  nSU S3nUS[         S3-  nUSU S3-  nXS S3-  nU(       d  U  H  nUSU S3-  nM     U (       d  US-  nOUS-  nUS[	        U 5      S-
   S3-  nUSU S3-  nXS S3-  n[        USTS9u  px[        U5      u  n	n
[        TS:w  a  TOSTR                  STS9n[        U	UU
US 9nUR                  5       nUR                  5       nT(       aW  [        R                   R#                  U5      S!   S"-   n[%        XUR'                  5       5        [        R(                  " US#5        O[+        U5        U(       a  [-        US$5       nUR/                  S!5        UR1                  S5      nUR3                  S%5      nUS&:w  d   eUR/                  U5        S!nU[	        U 5      :  a*  UR                  U US  5      nUU-  nU[	        U 5      :  a  M*  S S S 5        U$ U$ ! , (       d  f       U$ = f)'Nlinux 5wzPModels with buffer mutation included doesn't support constants greater than 2GB!z.ldata, "aw"z.lrodata, "a"rr   darwinz__DATA,__datart   zUnsupported platform: i   z
	.section	r?  z		.balign z	.globl	z_binary_constants_bin_start
z_binary_constants_bin_start:
z	.byte z
	.space 1
z	.quad 0x1234567899abcdef
z	.space    z.globl	z_binary_constants_bin_end
z_binary_constants_bin_end:
S)r0  xpuro   T)r  r  compile_onlyuse_absolute_pathr   r  
output_dirr  r   .o  zr+bs   ͫxV4r   )mutated_buffersr  r  r   r  
ValueErrorr   rH   r>  r@   r>   r  r<   get_command_lineget_target_file_pathr   r   splitextcompile_filer  chmodr  r   r  r  find)constsplatformsection_attrsymbol_prefixis_large_consts
consts_asmr$  rt   consts_sobject_output_nameobject_output_dirobject_build_optionsobject_buildercompile_cmdconsts_orZ  hdr	start_idxposrcconsts_specified_dirr  fbcode_aot_cpu_rer  r$  s                       rf   _compile_consts0AotCodeCompiler.compile.<locals>._compile_consts  s   7"((3u/C/C/E+FF 6{]2(n  $2L#2L "X%. #"%;H:#FGG!&kD0O'~R8JJ{m266JJ}o5RSSJO+IJJJ"AHQCr"22J   .0J<<
	#f+/):"==
H]O3NOOJO+GHHJ2KA 7x@"!#8 ,7%+?KU!"3$  (' ,0	N )99;K%::<H 77++H5a84?X1B1B1DE5)%k2(E*aFF1I&&,C #)L MI$?*?FF9%CF+WWVCD\2r	 F+ + O8O +* Os   BI..
I=FileLock.locktimeoutz.jsonrF  NAOTI_DEVICE_KEYz_metadata.jsonz"Metadata must only contain stringsr  r'  c              3  ~   >#    U  H2  nUTR                   ;  d  M  TR                  U5      R                  v   M4     g 7fra   )folded_constantsget_original_value_of_constantis_cuda)r  r   r  s     rf   r  *AotCodeCompiler.compile.<locals>.<genexpr>H  s:      2Du555 C44T:BB2s   ="=c                *   SS jnSS K nU R                  5       S:X  a  gU R                  (       aS  [        R                  R
                  R                  U 5      n[        R                  R
                  R                  U 5      nO>U R                  5       R                  5       nUR                  5       nUR                  5       nUR                  UUR                  UR                  U-  5      5      n[        UR                  5      nU(       a  U$ U" U5      $ )Nc                l    U R                  [        U 5      [        -   S-
  [        -  [        -  S5      nU$ )Nr,       )ljustr  rH   )	raw_bytespadded_bytess     rf   _pad_to_alignmentEAotCodeCompiler.compile.<locals>._to_bytes.<locals>._pad_to_alignmentO  s6    #,??Y+59kIKW$L ('rh   r   rh   )rU  r&  r   r&  )ctypesnumelr  rw   opsmkldnndata_ptr_nbytesuntyped_storagero   nbytesr   POINTERc_ubyter&  contents)	rn  all_cudarW  rY  r]  r`  t_cpu	raw_arrayrU  s	            rf   	_to_bytes*AotCodeCompiler.compile.<locals>._to_bytesN  s    ( 779>;;$yy//88;H"YY--55a8F--/335E$~~/H"\\^F"KKNN6>>F#:;	 ")"4"45	$,yN2CI2NNrh   rh   c              3  x   >#    U  H/  nUTR                   ;  d  M  T" TR                  U5      T5      v   M1     g 7fra   )rM  rN  )r  r   rg  rd  r  s     rf   r  rP  m  s>      . 65#9#99 TIeBB4H(SS 6s   ::r  T)r  r  r  r#  r$  use_mmap_weightsr%  zaot compilation command: %sr(  z_compile_flags.json)r,   qqr    )r  r  r  r$  zaot linkage command: %sar?  z// Compile cmd
// z// Link cmd
// z_linker_flags.jsonz_serialized_weights.binrG  qi  z.Si @  za+b    )r1  r&  r2  r   r   r   )rn  ztorch.Tensorrd  r  r   r&  )Ir{   r2  r   r:   rB   r<   r>   r  r7  r+  r"   r  r  r  r  r>  packager  r  rN  rL   r   r   r   r  filelockrG  r  LOCK_TIMEOUTr-  r   r  r  r%  r   r   r   r  r  r   package_constants_in_sor  force_mmap_weightsr@   r,  r  r@  package_cpp_onlyr.  r/  r  save_flags_to_filer   rK  rw   randintiinfoint64maxitemstructpackrx   r   ROCmCodeCacheCUDACodeCacher   r`  r  rV  resourcegetpagesizetell)<r  r  r  serialized_extern_kernel_nodesr  additional_filesgenerated_filespicked_vec_isavec_isa_cmd_gencpp_commandspecified_output_pathspecified_so_namer   rD  rG  r  lockextern_kernel_nodes_jsonrZ  r  	meta_jsonr  r  	output_sooutput_oserialized_weightsconsts_sizerj  r8  r9  r:  r;  r<  compile_flagsaot_constantsmagic_numberr=  gpu_codecacheentry	kernels_ooutput_namer&  so_build_options
so_builderlink_cmdlinker_flagsweight_file	f_weightso_filer  
page_size_	page_sizef_soso_sizerg  rd  rB  rC  r  r$  s<    `` `                                                 @@@@@@rf   compileAotCodeCompiler.compile  s	    +<<7"OPP%$-&'
 ?;;=> J;%#7JENN 	 .
 +6+>+>+J+JK	
!/	
Z &&"":.9:F
 +	
  "ww||BGGMM*,Ea,H#NR	 R	h 	&>XW}=|T-+-77+;+;J+G+JW+T(2C8AGG:; 9 &&..#**+CD**33H*5H&' ((4Q7:JJI++44::<1!!S))j/ / 878  =
 i%

6#6#6#?#?@A & ""**&&y1 % ##//WW%%j1!4u<  ww''
3A6=H !OO002 HO< ""::%(XX . % 4 4 6. &" &)"01K $*#3#3#55U+:U""55#' 
 7zB"!#8&'!"3!1$  ('",0	N )99;K%::<HII3[A&&77$!ww//
;A>EH X{7H7H7JKHHXu-)+6""33 " 0 0 <Q ?BW W$77F&&}5# 2 #q%++ekk*B*F*FMRRT  !'D+/< P&}cllCH#(==#4#4-/ 
 +007799E$$--d3 "!!9  
 +I&LY&W#K4&'"3	  $ !8Y7%,	J "224H"779III/: j#&!-k]"=>*8*B78 '
 ""33!ww//
;A>AUU 33LA&&|4
 $((4Q7:SS   k40I!(:;!C(FG 1 $**;7&&x0&&y1 % - ++77WW--j9!<uD 
 !(H!5y(..BRSHHY.)(3 GG$$X.q1D8F IIf% $#!)!5!5!7J #E: 6Ii/4"&))+

49w7J+J#KL

#56

6;;sL#AB 0 &&..#**95y | && #"C 98  &%d6 '&" 10F 0/i Ts   9l8<kCl8$l89>k,7Ol89$k>!k>1Bl89<l5B&l88lEl8A"l'78l8
k)	$l8,
k;	6l8
l	l8
l$	l8'
l5	1l88
mrb   N)r  rU   r  r   r  Optional[str]r  r   r  r  r   zUnion[List[str], str])r   r   r   r   r	  r  r   rb   rh   rf   r  r    sV    DD D )6	D
 D $D 
D Drh   r  c                     [        [        5      R                  S-  n U R                  5        nUR	                  5       n[        US5      u  p4S S S 5        [        W5      $ ! , (       d  f       N= f)Nr  r  )r   r  rL  r   r  r>  rA   )r   rZ  r8  rt   r  s        rf   cpp_prefix_pathr    s[     >  #99D	&&(
 
 $H-- 
s   A
A-c                     [        5       n [        R                  " 5       (       a#  S[        R                  R                  U 5       S3$ SU  S3$ )Nz
#include "")r  r"   r  r   r   r.  r  s    rf   
cpp_prefixr  #  sI     H BGG,,X67q99H:Q''rh   c                f    [        S5         [        XU5      sS S S 5        $ ! , (       d  f       g = f)Nr.  )r   _compile_file)r  r  r  s      rf   r.  r.  /  s"     
n	%Zc: 
&	%	%s   "
0c           	         [        U [        5      (       a  U /OU nU Vs/ s H>  n[        R                  " 5       (       a  [        R
                  R                  U5      OUPM@     nn [        R                  " 5       (       Ga  [        5       n[        R
                  R                  U5      n[        R
                  R                  U5      n[        R
                  R                  [        S5      n	[        R                  " 5        n
[        R                  " U[        R
                  R                  X5      5        [        R                  " [        [        R
                  R                  U
S5      5        [        X55       H9  u  p[        R                  " U[        R
                  R                  X5      5        M;     [        R
                  R                  U
S5      n[        R                   " X5        [#        X*U5      n[        R
                  R%                  U5      (       a  [        R&                  " U5        [        R                  " X5        S S S 5        g [(        R*                  " U[(        R,                  S9  g s  snf ! , (       d  f       g = f! [(        R.                   ak  nUR0                  R3                  S5      nSU;   =(       d    SU;   nU(       a  [4        R6                  S:X  a  SnUU-  n[8        R:                  " UU5      UeS nAff = f)	Nincluder  )stderrr   z'omp.h' file not foundlibompr  a  

OpenMP support not found. Please try one of the following solutions:
(1) Set the `CXX` environment variable to a compiler other than Apple clang++/g++ that has builtin OpenMP support;
(2) install OpenMP via conda: `conda install llvm-openmp`;
(3) install libomp via brew: `brew install libomp`;
(4) manually setup OpenMP and set the `OMP_PREFIX` environment variable to point to a path with `include/omp.h` under it.)r%  r   r"   r  r   r   r.  r  r   r  tempfileTemporaryDirectoryrT  r   _LINKER_SCRIPTzipcopytreer[   r  rV  r  check_outputSTDOUTr  r  r  r{   r2  r#   r  )r  r  r  input_pathsipinput_filesheader_pathheader_namer  torch_includes_pathtmp_dirr  rZ  dest_include_pathoutput_file_pathr  r  openmp_probleminstructions                      rf   r  r  6  s#    #-Z"="=:,:KEPEPr 0 0 2 2:[  (6)+K''**;7K''**;7K #%'',,{I"F,,.'Kg)KLNBGGLL+,NO9DAKK277<<#;< :$&GGLL)$D! 3G#5cK#P 77>>+..IIk*,: /. ##C
0A0AB9 /.  (( 6)1V;Qx6?Qcllh62  k!F!!#v.A56sE   AI8'BJ EI=J #J =
JJ J L"A&LLzOptional[CDLL]_libgompc                B  ^ S	U4S jjmU Vs/ s H  nT" U5      PM     nnU R                  S5      (       d
   U S-   5       eS n[        U R                  S5      5       H,  u  pVUS:X  a  [        R                  " U5      n[        XF5      nM.     [        U5      (       d
   U S-   5       e[        5       n[        UR                  R                  U5       H&  u  pUR                  (       d  M  XUR                  '   M(     U(       a  U[        U5      * S 2	 U" U0 UD6n
[        U
[        [         45      (       a  U
 Vs/ s H  oc  ["        R$                  " / 5      OUPM     n
n[        U
5       H.  u  p[[        U["        R&                  5      (       a  M&   U S-   5       e   ["        R(                  R*                  R-                  U
5      $ [        U
["        R&                  5      (       d
   U S-   5       e["        R(                  R*                  R/                  U
5      $ s  snf s  snf )
Nc                   > [        [        U 5      5      S:X  a)  [        R                  R                  R                  U 5      $ [        U [        [        45      (       a  [        U 5      " U4S jU  5       5      $ U $ )Nz<class 'PyCapsule'>c              3  4   >#    U  H  nT" U5      v   M     g 7fra   rb   )r  rm  convert_args     rf   r  9custom_op_wrapper.<locals>.convert_arg.<locals>.<genexpr>s  s     9S[^^Ss   )	r   r  rw   r  _aoti&alloc_tensor_by_stealing_from_void_ptrr%  r  tuple)argr  s    rf   r  &custom_op_wrapper.<locals>.convert_argn  sX    tCy>2288>>HHMMdE]++99S999Jrh   z
torch.ops.z, can not be called through custom_op_wrapperrq   r   z, can not be loaded through custom_op_wrapperz returns a list of non-tensorsz returns a non-tensor)r  r   r   r   )
startswith	enumerater  	importlibimport_moduler  callabler  r  _schema	arguments
kwarg_onlyr   r  r%  r  r  rw   tensorr   r  r  #unsafe_alloc_void_ptrs_from_tensors!unsafe_alloc_void_ptr_from_tensor)r   rd   r  converted_argsfuncr  r  re   func_argconv_argresultrr  s               @rf   custom_op_wrapperr  k  s    377$3k#&$N7==&& 
;;& D"((3-(6**1-Dt )
 D>>N2 NNN> VF!$,,"8"8.I$,8==! J CK<>*>,V,F&4-((@FG1i%,,r"Q6Gf%DAa..U5U0UU. &xx~~AA&II&%,,//M6M1MM/xx~~??GG= 80 Hs   H9$Hc                      \ rS rSr% 0 rS\S'   \" \R                  5      r0 r	S\S'   \SS j5       r
\SS j5       r\   S         SS
 jj5       r\SSS jj5       rS	rg)CppCodeCachei  0Dict[str, Callable[[], Union[CDLL, ModuleType]]]r   r   cpp_compile_command_flagsc                .    [         R                  " U 5      $ ra   )r   LoadLibrary)r   r   s     rf   _load_library_inner CppCodeCache._load_library_inner  s    %%rh   c           	         U R                  X5      nX#l        U$ ! [        [        4 a  nS[	        U5      ;   aX  [
        R                  R                  S5      (       a4  [        R                  " S5      q
U R                  X5      nX#l        Us S nA$ S[	        U5      ;   a;  [        U S[        R                  " 5        S[        R                  " 5        S35      Uee S nAff = f)Ngompz/usr/lib64/libgomp.so.1z(failed to map segment from shared objectz3.  The most common reason this may occur is if the zl folder is mounted with noexec (e.g., by default Docker mounts tmp file systems as noexec).  Please remount zi with exec enabled, or set another temporary directory with TORCHINDUCTOR_CACHE_DIR environment variable.)r  r   ImportErrorr  r   r   r   r  r   r  r  r  
gettempdir)r  r   r   r  r  s        rf   _load_libraryCppCodeCache._load_library  s    	,,T7FJMW% 	QBGGNN3L$M$M  ++,EF00; 
9SVCcLXM`M`MbLc d33;3F3F3H2I J]]
  	s"    C"A!CC"ACC"Nrb   c           	     Z  ^ ^^^^^ 0 T R                   EU[        5       US.En[        5         [        SS[	        S0 UD6S9n[        UR                  5       5      n[        USUS9u  mnTT R                  ;  Ga  SSK	J
n	  [        R                  R                  [        5       TS	-   5      n
[        U5      u  p US S
 S-   nS mS m[	        S0 UD6n[        UUUUS9n[         R"                  " [$        U
UUU5      m['        [(        R*                  " 5       (       a  UOUR-                  5       5      mSUU UUUU4S jjnUb@  U	" U
[.        S9   [        R                  R1                  T5      (       d  U" T5      mS S S 5        UT R                  T'   T R                  T   $ ! , (       d  f       N,= f)N)r  r  extra_flagsr  r  r  r  r(  r   rF  rH  sor%  c                 z   > Tc6  Tb  TR                  5         T" 5       n U b   eTR                  TT5      mTc   eT$ ra   )r  r  )r  binary_pathr  futurer   r  	worker_fns    rf   load_fn(CppCodeCache.load_async.<locals>.load_fn  sI    ;)&[F!>)>++K=C?*?
rh   rI  rb   r   r   )r  rB   r:   r<   r>   r7  r+  r>  r   rq  rG  r   r   r   r  r@   r   r
   _worker_compile_cpprA   r"   r  r,  rr  r  )r  r  r  	submit_fnr  compile_commandcommand_genvec_isa_cmdr  rG  	lock_pathr  r&  fb_output_pathcpp_build_optioncpp_builderr  r  r  r   r  r  s   `                @@@@@rf   
load_asyncCppCodeCache.load_async  s   
++
&#~&	
 	 c/D/W/W
 ;779:U+FZcii)\^S7]CI&LZ&X#K (_t3N,0FC4GG$ "%,	K "))#I 3##%%  557K	 	 $i>77>>+66!*9!5 ? %CIIcNyy~ ?>s   	-F
F*c                .    U R                  X5      " 5       $ ra   )r  )r  r  r  s      rf   r   CppCodeCache.load  s    ~~k799rh   )r   r   r   r   r   zUnion[CDLL, ModuleType])ro   Nrb   )
r  r   r  r   r  r   r  Sequence[str]r   r   )ro   )r  r   r  r   r   r   )r   r   r   r   r   rc  r   r  r  r  r  r	  r  r  r   r   rb   rh   rf   r  r    s    >@E;@u{{+K02~2& &  ,  !%'PP P 	P
 #P 
P Pd : :rh   r  c           	        SSK Jn  U" U [        S9   [        R                  " 5       (       a  UOUR                  5       n[        R                  R                  U5      (       dZ  [        R                  " 5       (       a0  [        UU[        R                  " UR                  5       5      5        OUR                  5         S S S 5        g ! , (       d  f       g = f)Nr   rF  rI  )rq  rG  rr  r"   r  r,  r   r   r  r.  r  r  r+  build)r  r  fb_input_pathr  rG  r  s         rf   r  r    s     "	)\	2$..00Nk6V6V6X 	 ww~~k**!!!"KK < < >? !!# 
3	2	2s   B+C
Cc                      \ rS rSr% 0 rS\S'   \" \R                  5      rSSS.r	Sr
SrS	r\R                  " S
5      r\SS j5       r\    S             SS jj5       r\SS j5       rSrg)CppPythonBindingsCodeCachei)  r  r   FTinclude_pytorchsharedr4  zkernel(%s);Py_RETURN_NONE;rr   a  
        // Python bindings to call %s():
        #define PY_SSIZE_T_CLEAN
        #include <Python.h>
        #include <sstream>
        #include <cstdlib>

        #ifndef _MSC_VER
        #if __cplusplus < 202002L
        // C++20 (earlier) code
        // https://en.cppreference.com/w/cpp/language/attributes/likely
        #define likely(x)       __builtin_expect(!!(x), 1)
        #define unlikely(x)     __builtin_expect(!!(x), 0)
        #endif
        #else
        #define likely(x) (x)
        #define unlikely(x) (x)
        #endif

        // This is defined in guards.cpp so we don't need to import PyTorch headers that are slooow.
        // We manually link it below to workaround issues with fbcode build.
        static void* (*_torchinductor_pyobject_tensor_data_ptr)(PyObject* obj);

        template <typename T> static inline T parse_arg(PyObject* args, size_t n) {
            static_assert(std::is_pointer_v<T>, "arg type must be pointer or long");
            return static_cast<T>(_torchinductor_pyobject_tensor_data_ptr(PyTuple_GET_ITEM(args, n)));
        }
        template <> inline int64_t parse_arg<int64_t>(PyObject* args, size_t n) {
            auto result = PyLong_AsSsize_t(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == -1 && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return result;
        }
        template <> inline uintptr_t parse_arg<uintptr_t>(PyObject* args, size_t n) {
            auto result = PyLong_AsVoidPtr(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == reinterpret_cast<void*>(-1) && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return reinterpret_cast<uintptr_t>(result);
        }

        %s

        static PyObject* %s_py(PyObject* self, PyObject* args) {
            try {
                if(unlikely(!PyTuple_CheckExact(args)))
                    throw std::runtime_error("tuple args required");
                if(unlikely(PyTuple_GET_SIZE(args) != %s))
                    throw std::runtime_error("requires %s args");
                %s
            } catch(std::exception const& e) {
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return nullptr;
            } catch(...) {
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return nullptr;
            }
        }

        static PyMethodDef py_methods[] = {
            {"%s", %s_py, METH_VARARGS, ""},
            {NULL, NULL, 0, NULL}};

        static struct PyModuleDef py_module =
            {PyModuleDef_HEAD_INIT, "%s", NULL, -1, py_methods};

        PyMODINIT_FUNC PyInit_%s(void) {
            const char* str_addr = std::getenv("_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR");
            if(!str_addr) {
                PyErr_SetString(PyExc_RuntimeError, "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR must be set");
                return nullptr;
            }
            std::istringstream iss(str_addr);
            uintptr_t addr = 0;
            iss >> addr;
            _torchinductor_pyobject_tensor_data_ptr =
                reinterpret_cast<decltype(_torchinductor_pyobject_tensor_data_ptr)>(addr);
            PyObject* module = PyModule_Create(&py_module);
            if (module == NULL) {
                return NULL;
            }
            #ifdef Py_GIL_DISABLED
                PyUnstable_Module_SetGIL(mod, Py_MOD_GIL_NOT_USED);
            #endif
            return module;
        }
        c                   [        [        R                  R                  R                  R
                  5      [        R                  S'   U SU R                   3n [        R                  U   $ ! [         a     Of = f[        R                  R                  X15      nUc   e[        R                  R                  U5      nU[        R                  U'   UR                   R#                  U5        U$ )N'_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTRrq   )r   rw   r  _dynamor  '_torchinductor_pyobject_tensor_data_ptrr   environentry_functionr{   r  KeyErrorr  utilspec_from_file_locationmodule_from_specloaderexec_module)r  r   r   module_namer  r  s         rf   r  .CppPythonBindingsCodeCache._load_library_inner  s    @CHH##KKA


<= Qs1123	;;{++ 		~~55kH006#)K 's   A. .
A;:A;Nrb   c                  ^ ^
^ SR                  S [        U5       5       5      nT R                  T R                  T R                  (       a  T R                  U-  OST R                  [        U5      [        U5      T R                  U-  T R                  T R                  T R                  T R                  4
-  nT R                  X(-   UUUS9m
SmSU U
U4S jjn	U	$ )z
Wrap a C++ function in fast Python bindings.

Args:
    argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
    source_code: C++ source code containing a ENTRY_FUNCTION() function

Returns:
    A python version of ENTRY_FUNCTION()
, c              3  X   #    U  H   u  pS UR                  SS5       SU S3v   M"     g7f)z
parse_arg<zconst rr   z>(args, )N)rz   )r  nargtypes      rf   r  BCppPythonBindingsCodeCache.load_pybinding_async.<locals>.<genexpr>  s4      
1
 267xs!D1s   (*rr   )r  r  Nc                 r   > Tc  T" 5       m[        T[        5      (       d   e[        TT R                  5      $ ra   )r%  r   r  r  )r  
get_resultr  s   rf   r  ?CppPythonBindingsCodeCache.load_pybinding_async.<locals>.future  s5    ~#!&*555563#5#566rh   r  )r   r  suffix_templater  extra_parse_argr  call_entry_functionr  )r  argtypesr  r  num_outputsr  r  	parseargssuffixr  r(  r  s   `         @@rf   load_pybinding_async/CppPythonBindingsCodeCache.load_pybinding_async  s    ( II 
'1
 
	 $$141D1DC+-"MM##i/(
 
 ^^ #	 $ 

 	7 	7 rh   c                0    U R                   " U0 UD6" 5       $ ra   )r1  r  rd   re   s      rf   load_pybinding)CppPythonBindingsCodeCache.load_pybinding  s    ''88::rh   )r   r   r   r   r   r   )ro   r   Nrb   )r-  r  r  r   r  r   r.  rK  r  r   r  r	  r   r   )rd   r   re   r   r   r   )r   r   r   r   r   rc  r   r  r  r  r  r,  r+  textwrapdedentr*  r	  r  r1  r5  r   rb   rh   rf   r  r  )  s    >@E;@u{{+K !!
 N6OooU	WOr    
 !%'22 2 	2
 2 2 #2 
2 2h ; ;rh   r  c                  ~    \ rS rSr% 0 rS\S'   \" \R                  5      rSSS.r	Sr
Sr\R                  " S5      rS	rg
)CppWrapperCodeCachei  r  r   Tr  inductor_entry_cppzreturn inductor_entry_cpp(%s);a3	  
        #include <torch/csrc/inductor/aoti_torch/c/shim.h>

        static inline std::vector<AtenTensorHandle> unpack_tensor_handle_list(PyObject* pyvec) {
            std::vector<AtenTensorHandle> result;
            size_t result_len = PyList_GET_SIZE(pyvec);
            result.reserve(result_len);
            for (size_t i = 0; i < result_len; i++) {
                // AtenTensorHandle is essentially a pointer
                void* elem = PyCapsule_GetPointer(PyList_GET_ITEM(pyvec, i), NULL);
                result.push_back(reinterpret_cast<AtenTensorHandle>(elem));
            }
            return result;
        }

        static inline PyObject* pack_tensor_handle_list(const std::vector<AtenTensorHandle>& cppvec) {
            size_t result_len = cppvec.size();
            PyObject* result = PyList_New(static_cast<Py_ssize_t>(result_len));
            for (size_t i = 0; i < result_len; i++) {
                PyObject *elem =
                    cppvec[i] == nullptr
                        ? Py_None
                        // Store AtenTensorHandle as PyCapsulate
                        : PyCapsule_New(reinterpret_cast<void*>(cppvec[i]), NULL, NULL);
                PyList_SET_ITEM(result, i, elem);
            }
            return result;
        }

        template <> inline std::vector<AtenTensorHandle> parse_arg<std::vector<AtenTensorHandle>>(PyObject* args, size_t n) {
            return unpack_tensor_handle_list(PyTuple_GET_ITEM(args, n));
        }

        PyObject* inductor_entry_cpp(std::vector<AtenTensorHandle>&& input_handles) {
            // For outputs, we only allocate a vector to hold returned tensor handles,
            // not allocating the actual output tensor storage here
            std::vector<AtenTensorHandle> output_handles(%s);
            try {
                inductor_entry_impl(input_handles.data(), output_handles.data());
                if (PyErr_Occurred()) {
                    return nullptr;
                }
                return pack_tensor_handle_list(output_handles);
            } catch(std::exception const& e) {
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return nullptr;
            } catch(...) {
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return nullptr;
            }
        }
        rb   N)r   r   r   r   r   rc  r   r  r  r  r  r,  r7  r8  r+  r   rb   rh   rf   r:  r:    sI    >@E;@u{{+K! *N:oo3	5Orh   r:  c                  8   \ rS rSr% 0 rS\S'   \" \R                  5      rSr	S\S'   \
R                  " S5      r\\
R                  " S5      -   r\\
R                  " S	5      -   r\
R                  " S
5      r\SS j5       r\SS j5       r\\R(                  " S5      SS j5       5       r\SS j5       r\\R(                  " S5      SS j5       5       r\\R(                  " S5      SS j5       5       r\ S       SS jj5       r\SS j5       r\SS j5       rSrg)HalideCodeCachei	  z0Dict[str, Callable[[], Union[ModuleType, CDLL]]]r   Nr  _standalone_runtime_patha  
        #include "{halideruntime_h}"
        #include "{headerfile}"
        #include <stdexcept>
        #include <cmath>

        namespace c10 {{
            inline long div_floor_integer(long a, long b) {{
                if ((a<0) != (b<0)) {{
                    const auto quot = a / b;
                    const auto rem = a % b;
                    return rem ? quot - 1 : quot;
                }}
                return a / b;
            }}
        }}
        z
        void kernel({argdefs}) {{
            {buffers}
            int err = halide_kernel({buffer_names});
            if(err != 0) throw std::runtime_error("halide_kernel failed");
        }}
        a{  
        #include <cuda.h>
        static const halide_device_interface_t* cuda_interface = halide_cuda_device_interface();

        void kernel({argdefs}, uintptr_t stream) {{
            {buffers}
            int err = halide_kernel(reinterpret_cast<void*>(stream), {buffer_names});
            if(err != 0) throw std::runtime_error("halide_kernel failed");
        }}
        a  
        #include "{}"
        #include <cuda.h>

        static int acquire_context(void* user_context,
                                   void** cuda_context_out,
                                   bool create) {{
            return cuCtxGetCurrent(reinterpret_cast<CUcontext*>(cuda_context_out));
        }}

        static int release_context(void* user_context) {{
            return 0;
        }}

        static int get_stream(void* user_context,
                              void* cuda_context,
                              void** stream_out) {{
            *stream_out = user_context;
            return 0;
        }}

        static int register_halide_hooks() {{
            halide_set_cuda_acquire_context(&acquire_context);
            halide_set_cuda_release_context(&release_context);
            halide_set_cuda_get_stream(&get_stream);
            return 0;
        }}

        int inductor_register_halide_hooks_result = register_halide_hooks();
        c                   UR                   c   eUR                  b,  [        UR                   5      [        UR                  5      :X  d   eUR                  c   eUR                  =(       d    UR
                   SUR                   3nU(       a  SU S3nSnSnSnOSnSnSU S3nS	n/ n	[        UR                   UR                  5       H  u  pU	R                  S
U
 SU S35        M     SU S3SU SSR                  U	5       S3U SU S3U SU S3U SU S3U SU S3U SUR                  5        S3U S[        U	5       S3U SU S3U S3/
$ )Nz + zreinterpret_cast<uint64_t>(r#  cuda_interfacenullptrhalide_buffer_flag_device_dirty0zreinterpret_cast<uint8_t*>(halide_buffer_flag_host_dirtyzhalide_dimension_t(0, r!  zhalide_buffer_t ;zhalide_dimension_t z_dims[] = {z};z
.device = z.device_interface = z.host = z	.flags = z.type = z.dimensions = z.dim = z_dims;z.padding = nullptr;)
shapestrider  offsetalias_ofr   r  r  r   halide_type)r  r   r  ry   r]  r   device_interfacehostflagsdimssizerG  s               rf   _codegen_bufferHalideCodeCache._codegen_bufferi	  s   yy$$$zz%#cii.C

O*KKKzz%%%ll.chh/s3::,?28*A>F/D5EF(0
!<D3E		3::6LDKK0bBC 7 tfA&!$|DIIdO3DCHfJvha(f()9(:!<fHTF!$fIeWA&fHS__./q1fN3t9+Q/fGD6(f'(
 	
rh   c           	        UR                  5       nUSUR                  ;   L d   eSUR                  ;   d   e/ n/ n[        UR                  5       H  u  pgUR	                  5       (       a:  UR                  SU 35        UR                  U R                  SU 3Xs5      5        MT  SUR                  ;  d   eUR                  UR                  5        M     SR                  U Vs/ s H  nSU 3PM
     sn5      R                  5       nU(       a  U R                  OU R                  n	U	R                  U R                  U(       a  SOS	5      US
R                  S UR                   5       5      US
R                  U5      S9n
U
$ s  snf )Nuser_context
no_runtimez&hl_buf_hl_buf_*r?      HalideRuntimeCuda.hzHalideRuntime.hr!  c              3  ~   #    U  H3  nUR                   b  M  UR                  5        SUR                   3v   M5     g 7f)Nrl  )rI  bindings_typer   )r  rm  s     rf   r  0HalideCodeCache._codegen_glue.<locals>.<genexpr>	  s7      &A:: 01??$%Qqvvh/&s   =&=)halideruntime_h
headerfileargdefsbuffersbuffer_names)rO  rI  r  r-  	is_bufferr  extendrP  ctyper   r   lstripglue_template_cudaglue_template_cppformatfind_header)r  ro  r]  rO  r_  r`  r  r  lineglue_template	glue_codes              rf   _codegen_glueHalideCodeCache._codegen_glue	  se   ,,.>T[[8999t{{***.FA}}##hqcN3s22WQC=#OP#))+++##CHH- / ))w?wttD6]w?@GGI29..s?T?T!((OO)0%6G "II  
 <0 ) 
	 ! @s   $Fc                    [        SS[        5       S9nUR                  5       n[        SR	                  U R
                  U R                  U R                  U/5      R                  S5      5      $ )NOIr  r?  r   )	r<   r=   r+  r"  r   rf  re  standalone_runtime_cuda_initr   )r  r  command_lines      rf   config_hashHalideCodeCache.config_hash	  sp     !"

 #335II))**44 	 fWo	
 		
rh   c                   [         R                  R                  R                  S5      nUb  UR                  (       d  [        S5      e UR                  S   n[        R                  " U5       GH  nUR                  S5      (       d  M   [        R                  " S[        R                  R                  X45      /5      n[        R                  " SUR!                  S5      5      nU(       d  M  [        R                  R                  [        R                  R#                  UR%                  S5      5      U 5      n[        R                  R'                  U5      (       d  M  [        R                  R#                  U5      s  $    [        U5      e! [        R                   a     GM9  f = f! [(         a  n[        U5      UeS nAff = f)	Nhalidez$halide python bindings not installedr   r  lddz(/.*)/libHalide.sor   r,   )r  	machinery
PathFinderr  r  r   r   rm  r  r  r  r   r   SubprocessErrorr  searchr  abspathgroupr  ro  )	r0  errmsgr  r{  fileoutmr   r  s	            rf   _search_for_file HalideCodeCache._search_for_file	  sM   ""--77A<t>>EFF	.44Q7F

6*==''!(55"BGGLL$>?
 		"7G9LMAq!ww||BGGOOAGGAJ,GP77>>$//#%77??4#88 + 6"" &55 ! !  	.v&A-	.sO   
=F, 5F+F, 0A.F, " F, F, F)$F, (F))F, ,
G6GGc                6   SU R                  5        S3nS[        R                  ;   aW  [        R                  R	                  [        R                  S   U5      n[        R                  R                  U5      (       a  U$ SU S3n[        R                  X5      $ )Nlibautoschedule_r  
HALIDE_LIBCan't find z3, set env HALIDE_LIB to the directory containing it)r   r   r  r   r   r  r=  r  )r   sofiler   r~  s       rf   find_libautoschedule$HalideCodeCache.find_libautoschedule	  s~     $DJJL>52::%77<<

< 8&ADww~~d##&!TU 	 //??rh   c                ,   S[         R                  ;   aW  [         R                  R                  [         R                  S   U 5      n[         R                  R	                  U5      (       a  U$ S[         R                  ;   aw  [         R                  R                  [         R                  R                  [         R                  S   SU  35      5      n[         R                  R	                  U5      (       a  U$ SU  S3n[        R                  SU  3U5      $ )NHALIDE_INCLUDEr  z../include/r  z7, set env HALIDE_INCLUDE to the directory containing it)r   r  r   r   r  r|  r=  r  )r   r   r~  s      rf   rh  HalideCodeCache.find_header	  s     rzz)77<<

+; <dCDww~~d##2::%77??RZZ5TF7KLD ww~~d##$VW 	 //+dV0DfMMrh   c                d  ^^ [        [        [        U[        U R	                  5       U45      S9S5      S   5      n[
        R                  " USS9  S m[        US-  5      n[        US-  5      n[        US-  5      n[        US	-  5      n[        US
-  5      n	[
        R                  R                  U5      (       + n
/ nU
(       a  [        XR5        [        R                  USSSU SSSS/
nUR                  (       a,  UR                  SU R                  UR                  5      /5        UR                  UR!                  5       5        UR#                  [$        R&                  " [(        R*                  U5      5        UR,                   Vs/ s H!  oR.                  b  M  UR1                  5       PM#     nnUR3                  5       (       a  UR#                  S5        U R5                  UU R7                  X5      X`R9                  5       4U
(       a  UR"                  OS UR3                  5       (       a  SOSS9mU
(       af  UR#                  [$        R&                  " [:        U5      5        [$        R&                  " [<        X5      nU(       a  U" U5      R>                  mOU" 5         SUU4S jjnU$ s  snf )Nr  rv     Tru   zgenerate_kernel.pyzhalide_kernel.azhalide_kernel.hdoner  -gr4  -oz-fhalide_kernelz-ezstatic_library,h,schedulez-p	uintptr_try   ro   )r  r  r  c                 .   > T(       a  T" 5         T " 5       $ ra   rb   )bindings_futurewait_for_compiles   rf   r   3HalideCodeCache.generate_halide_async.<locals>.load4
  s     ""$$rh   )r   Callable[[], Any]) r   r2  r*  r7  rs  r   r   r   r   r  r   r{   
executable	schedulerrb  r  rd   r  r   r
   r  r  r-  rI  rZ  rO  r1  rl  build_standalone_runtimetouch_worker_task_halider  )r  ro  r  r  dirpathgenfilelibfiler]  donefilelockfileneed_compilejobsr  r  binding_typestaskr   r  r  s                    @@rf   generate_halide_async%HalideCodeCache.generate_halide_async	  sN     1489  
 	Gd+g 445g 112#445
w'(w'(77>>(33.)+C ~~

D#":":4>>"JKLJJtyy{#KK	))**?*?EF ,0==
+8CLLC= 	 
 <<>>  -22d/ ">">"@A%1dkkt"&,,..e 3 
 KK	))%:;$$%8(ID#,T?#9#9 	% 	%
 5
s   
J-J-c                0    U R                   " U0 UD6" 5       $ ra   )r  r4  s      rf   generate_halideHalideCodeCache.generate_halide;
  s    (($9&9;;rh   c           
        U R                   (       a:  [        R                  R                  U R                   5      (       a  U R                   $ [        R
                  R                  5       (       a  SOSnSnUS:X  a  SOSnU R                   (       a;  [        R                  R                  U R                   5      (       a   e[        5       nO
[        5       n[        U5      SU SU R                  5        3-  n[        R                  " USS	9  [        US
-  5      n[        US-  5      n[        US-  5      n[        US-  5      n	[        XR-  5      n
[        R                  R                  U5      (       Gd#  SS KnSS KnUR                  U[         5         [        R                  R                  U5      (       d  [#        US5       nUS:X  a9  UR%                  U R&                  R)                  U R+                  S5      5      5        S S S 5        UR-                  XR/                  U5      5        [1        U
5      u  p[3        UX/U[5        US9S9n[6        R8                  " [:        R<                  " UR?                  5       5      5        [A        U5        S S S 5        [        R                  R                  U
5      (       d   eXl         U
$ ! , (       d  f       N= f! , (       d  f       NM= f)Nry   ro   zlibStandaloneHalideRuntime.soz	host-cudarL  zhalide-runtime--Tru   r  r  z	hooks.cppzstandalone_halide_runtime.ar   rF  rX  r  r%  )!r>  r   r   r  rw   ry   rI  rG   rF   r   rs  r   r   rq  rv  rG  rr  r   r>  rq  rg  rh  compile_standalone_runtimeTargetr@   r<   r>   r  r  r  r  r+  r  )r  r  libnamerI  baser  r  r  hookfileafiler  rq  hlrZ  r   r&  halide_cmd_gens                    rf   r  (HalideCodeCache.build_standalone_runtime?
  sJ   ''BGGNN((-
 -
 /// %

 7 7 9 9fu1 +v 56''ww~~c&B&BCCCC
 %&D;Dt*#//:K9LMM
Gd+w'(w'(w,-G;;<W&'ww~~h''""8\:ww~~h//h,&&0GG # @ @ G G$'OO4I$J!" - 11%69JK'Mf'U$D%/!!) 1#-$9(3%	&N ))N$C$C$EF (O1 ;2 ww~~f%%%%'-$3 -, ;:s&   1KA KBK
K	K
K&rb   )r   r   r  rW   ry   r  r   r  )ro  rX   r]  objectr   r   r  )r0  r   r~  r   r   r   r   r   r   r   ra   )ro  rX   r  r   r  r   r   r  )rd   r   re   r   r   r  )r   r   r   r   r   rc  r   r  r  r>  r7  r8  r  rf  re  rq  r	  rP  rl  r   r   rs  r  r  rh  r  r  r  r   rb   rh   rf   r=  r=  	  s   >@E;@u{{+K.2m2__	F& 	"   (//		#  $,??	$ B 
 
B  > 
  
$ # #. 	@  	@ N  N  BFBB,/B<?B	B BH < < 7 7rh   r=  c                h   SSK Jn   U" U [        5         U H
  nU" 5         M     S S S 5        g ! , (       d  f       g = f! [        R                   Ga\  n[
        R                  R                  S5      S:X  Ga2  [        USS5      tpVn[
        R                  R                  U5      R                  S5      (       a  [        U5      R                  5       nSn	UR                  U	5      S	:X  d   e " S
 S5      n
U
" 5       XwR                  S5      S	-   '   [         R"                  " [         R$                  " SS/UQ< S35      S5      nUR'                  X5      n[        SS5       nUR)                  UR+                  5       5        S S S 5        O! , (       d  f       O= f[-        SU 35      Uee S nAff = f)Nr   rF  HALIDE_REPRO1r  )rr   rr   rr   pythonz    hl.main()r,   c                      \ rS rSrSS jrSrg) _worker_task_halide.<locals>.Outi
  c                    g)Nr  rb   r   s    rf   __repr__)_worker_task_halide.<locals>.Out.__repr__
  s    $rh   rb   Nr  )r   r   r   r   r  r   rb   rh   rf   Outr  
  s    %rh   r  r  z                        import sys, tempfile
                        with tempfile.TemporaryDirectory() as out:
                            sys.argv = zrepro.pyz?
                            hl.main()
                        rW  rF  zwrote repro.py: )rq  rG  rr  r  rz  r   r  r   r  r   r.  r  r   r  countindexr7  r   r8  rz   r>  rd  r   )r  r  rG  jobr  r  scriptr  r'  mainr  replfds                rf   r  r  z
  so   ! h-  .-- %% ::>>.)S0#*1e\#B FSww'228<<F|((*&zz$'1,,,% % ,/5IIdOa'(OO( *4(:c(:'= > 
 ||D/*c*bHHT[[]+ +**"%5aS#9:A9sG   A  /A  
=A  A   F1DF," F	F,
F	F,,F1c                8    [        U S5      R                  5         g )Nrm  )r   closer  s    rf   r  r  
  s    3rh   c                  
   \ rS rSr% / rS\S'   0 rS\S'   \SSS jj5       r\   S         SS jj5       r	\  S         SS	 jj5       r
\SSS
 jj5       r\\R                  " S5            SS j5       5       rSrg)PyCodeCachei
  zList[ModuleType]r  z Dict[str, List[Tuple[Any, ...]]]linemapsc                    [        USUS9$ Nrs   r  rA  )r  r  r(  s      rf   r>  PyCodeCache.write
  s    [$e44rh   Nc                @    [        USUS9u  pVU R                  XVX45      $ r  )r>  load_by_key_path)r  r  r(  linemapattrsr   r   s          rf   r   PyCodeCache.load
  s'     +t59	##Cw>>rh   c                D   Uc  / n[        X5      n[        [        U6 5      U R                  U'   Ub%  UR	                  5        H  u  pg[        XVU5        M     U(       d'  U(       d   [        R                  " [        X5      Ul	        U R                  R                  U5        U$ ra   )rD   r  r  r  r  setattrr   r
   rE   _reload_in_subprocr  r  )r  r   r   r  r  modr  r  s           rf   r  PyCodeCache.load_by_key_path
  s     ?G#C. "#w-0T" & 5%.%6%60#&C" 	3
rh   c                    U(       aG  U R                    H7  n UR                  (       d   e[        R                  " UR                  5        M9     U R                   R                  5         g! [         a     Mc  f = f)zj
Clear the in-memory module cache. If purge=True, also delete all the
corresponding on-disk source files.
N)r  r  r   rV  r  r  )r  purger  s      rf   r  PyCodeCache.cache_clear
  s\     {{<<'<IIcll+ # 	 ) s   3A**
A87A8c                    XR                   ;  a  g U R                   U   u  p4[        X25      nUS:X  a  g XES-
     nU(       d  g SS jnU" U5      $ )Nr   r,   c           	         Sn[         R                  " X5      n[        U5       VVVs/ s H  u  p4nU[        U5      US.PM     snnn$ s  snnnf )Nz"File "(.+)", line (\d+), in (.+)\n)r  ri  r   )r  findallreversedrK  )stack_traceregexmatchesrZ  lr$  s         rf   parse_stack_trace<PyCodeCache.stack_frames_for_code.<locals>.parse_stack_trace
  sR     :Ejj4G  (00GA! A:0  s   A	)r  r   r   zList[Dict[str, Any]])r  r   )r  r   linenor  r  r  r  r  s           rf   stack_frames_for_code!PyCodeCache.stack_frames_for_code
  sX    
 ||#||D)'6!e	 !''rh   rb   rr   )r  r   r(  r   r   Tuple[str, str])rr   NN)
r  r   r(  r   r  Optional[List[Tuple[int, str]]]r  r   r   r   )NN)
r   r   r   r   r  r  r  r   r   r   )F)r  r  r   r   )r   r   r  rK  r   zOptional[List[Dict[str, Any]]])r   r   r   r   r  rc  r  r	  r>  r   r  r  r   r   r  r   rb   rh   rf   r  r  
  s    !#G"13H.35 5  37*.?? ? 1	?
 (? 
? ? 
 48*.  1	
 ( 
 6   (( #(	'(  (rh   r  c                  (    \ rS rSr\SS j5       rSrg)TritonCodeCachei  c                @    [        [        R                  U5      U5      $ ra   )rC   r  r   )r  kernel_namer  s      rf   r   TritonCodeCache.load  s    '(8(8(E{SSrh   rb   N)r  r   r  r   r   r   )r   r   r   r   r	  r   r   rb   rh   rf   r  r    s    T Trh   r  c                    [         R                  " [        R                  R                  5      (       a  [        R                  R                  $ [        R
                  " 5       (       a/  [        R                  R                  [        R                  SS5      $ [         R                  " [        R                  " S5      5      (       a  [        R                  " SS5      $ [         R                  " [        R                  " S5      5      (       aR  [        R                  R                  [        R                  R                  [        R                  " SS5      S5      5      $ g)NbinnvccCUDACXXrr   	CUDA_HOMEzbin/nvcc)r%   
nvcc_existr"   ry   cuda_cxxr  r   r   r   rZ   sdk_homegetenvrealpathrb   rh   rf   _cuda_compilerr   
  s    6;;//00{{###ww||K00%@@299Y/00yyB''299[122wwRYY{B-G TUUrh   c            	     ~   [         R                  " 5       (       a  SSKJn   U R	                  S5      nO[         R
                  R                  n[        R                  R                  [        R                  R                  US5      5      [        R                  R                  [        R                  R                  US5      5      [        R                  R                  [        R                  R                  US5      5      [        R                  R                  [        R                  R                  US5      5      /$ )Nr   r  zcutlass-3-headersr  ztools/library/includeztools/library/srcztools/util/include)r"   r  r  r  get_dir_pathry   cutlass_dirr   r   r  r   )r  cutlass_paths     rf   _cutlass_include_pathsr    s    $++,?@{{.. 	lI>?
l4KLM
l4GHI
l4HIJ rh   c                 L   [        5         SSKJn   U R                  SS9[        R
                  " S5      /-   n/ n[        5       (       aR  [        U5        U H  nUR                  SU 3SSU 3/5        M     UR                  S	5        UR                  S
5        U$ [        S5      e)Nr   )cpp_extensionry   r  LIBDIRz-Lz-Xlinkerz-rpath=z-lcudaz-lcudartzMUnsupported env, failed to find cuda libs! Currently only Linux is supported.)r:   torch.utilsr  library_paths	sysconfigget_config_varrJ   r;   rb  r  NotImplementedError)r  lpathsextra_ldflagsr   s       rf   _cuda_lib_optionsr  &  s    )((V(<  *@ F  "Mzzf%D   Btf+zWTF;K!LM  	X&Z(
  "[
 	
rh   c                 
    / SQ$ )N)z-fPICz-fno-strict-aliasingz-fvisibility=hiddenz-Wconversionrb   rb   rh   rf   _nvcc_host_compiler_optionsr  =  s     rh   c                    [         R                  " 5       n U S:X  a  Sn SU  3SU  3/n[        R                  R                  (       a	  USU  3/-  nSSSS	S
SU  SSR                  U5       S3[        R                  R                  SSS/
n[        R                  " 5       (       a>  UR                  S[        R                  R                  [        R                  5      /5        [        R                  R                  (       a  UR                  / SQ5        [        R                  R                  (       a  UR                  / SQ5        [        R                  R                   (       a  UR                  SS/5        U$ )N9090asm_compute_lto_z-t=0z"-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1z+-DCUTLASS_ENABLE_SM90_EXTENDED_MMA_SHAPES=1z'-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLEDz-wz-gencode=arch=compute_z,code=[,]z
-std=c++17z--expt-relaxed-constexprz-DNDEBUGz-ccbin)z	-lineinfor  z-DCUTLASS_DEBUG_TRACE_LEVEL=1)z--keepz,--ptxas-options=--warn-on-local-memory-usagez --ptxas-options=--warn-on-spillsz--resource-usagez--source-in-ptxz--use_fast_mathz -DCUTLASS_USE_TANH_FOR_SIGMOID=1)r%   get_cuda_archr"   ry   enable_cuda_ltor   compile_opt_levelr  rb  r   r   r  rZ   gccenable_debug_infoenable_ptxas_infouse_fast_math)archr'  optionss      rf   _nvcc_compiler_optionsr$  F  s1   !!#Dt|$LHTF+,D{{""4v,51
 gchhtn-=Q?%%"G "''//+//"BCD{{$$KL{{$$	
 {{  !2	
 Nrh   c                   Uc  / n[        5       n[        5       n[        5       n[        5       nUU-   U Vs/ s H  nSU;   a  SU 3OSU 3PM     sn-   U V	s/ s H  n	SU	-   PM
     sn	-   U-   n
SR	                  U 5      nSnUS:X  a%  [        5        SSR	                  U
5       SU SU 3nOvUS	:X  a6  U
R                  S
5        [        5        SSR	                  U
5       SU SU 3nO:US:X  a%  [        5        SSR	                  U
5       SU SU 3nO[        SU S35      e[        R                  SU5        U$ s  snf s  sn	f )N=z-Xcompiler z-Xcompiler=z-Irl  rr   r  z -c -o r  z-sharedz -o exezUnsupported output file suffix !zCUDA command: %s)
r  r  r  r$  r   r   r  r  r  r@  )	src_filesdst_filedst_file_ext
extra_argsinclude_pathscuda_lib_optionsnvcc_host_compiler_optionsnvcc_compiler_optionsoptr   r#  src_fileress                rf   cuda_compile_commandr4  r  s    
*,M(*!<!>24
	 2
1 $'#:k#[3FF1
	
 $1
1=44$;=
1	2 	  xx	"H
Cs!"!CHHW$5#6ghZq
S		y!!"!CHHW$5#6d8*AhZP		!"!CHHW$5#6d8*AhZP!$CL>QR"STTII #&J'
 2s   D;E c                  f    \ rS rSrSr    SS jrSS jrSS jrSS jrSS jr	SS jr
SS	 jrS
rg)
DLLWrapperi  z A wrapper for a dynamic library.c                b    Xl         SU l        [        R                  " U5      U l        SU l        g )NFT)lib_pathis_openr   r  DLL)r   r8  s     rf   r   DLLWrapper.__init__  s)     !##H-rh   c                V    U R                   (       a  U R                  5         SU l         g g rl   )r9  _dlcloser   s    rf   r  DLLWrapper.close  s    <<MMO DL rh   c                X   S n[        5       (       aE  [        S 5      n[        US5      (       d  [        S5      n[        US5      (       a  UR                  nO;[	        5       (       a!  SS KnUR                  SSS9nUR                  nO[        S5      eUb  [        5       (       a)  [        /Ul	        U" U R                  R                  5        g [	        5       (       a9  SS KnSSKJn  UR                  /Ul	        U" U R                  R                  5        g g [        R                  S	5        g )
Ndlclosezlibc.sor   kernel32T)use_last_errorz&Unsupported env, failed to do dlclose!)wintypeszKdll unloading function was not found, library may not be unloaded properly!)rJ   r   rl  r@  rK   rY  FreeLibraryr  r   r-  r:  _handlerC  HMODULEr  r  )r   	f_dlclosesymsrY  rA  rC  s         rf   r=  DLLWrapper._dlclose  s    	:::D4++ItY'' LL	\\{{:d{CH ,,I%&NOO zz&.Z	"$((**++&.&6&6%7	"$((**+  KK]rh   c                   ^ U R                   (       d  [        SU R                   35      e[        U R                  U5      mSU4S jjnU$ )NzCannot use closed DLL library: c                 N   > T" U 6 nU(       a  [        STR                   35      eg )NzError in function: )r   r   )rd   errmethods     rf   _wrapped_func-DLLWrapper.__getattr__.<locals>._wrapped_func  s,    $-C"%88I#JKK rh   rd   r   r   r   )r9  r   r8  r  r:  )r   r   rN  rM  s      @rf   __getattr__DLLWrapper.__getattr__  s?    ||!@PQQ4(	L
 rh   c                    U $ ra   rb   r   s    rf   	__enter__DLLWrapper.__enter__  s    rh   c                $    U R                  5         g ra   r  )r   rd   s     rf   __exit__DLLWrapper.__exit__      

rh   c                $    U R                  5         g ra   rW  r   s    rf   __del__DLLWrapper.__del__  rZ  rh   )r:  r9  r8  N)r8  r   r   r   r   )r   r   r   zCallable[..., None])r   r6  rP  )r   r   r   r   rb  r   r  r=  rQ  rT  rX  r\  r   rb   rh   rf   r6  r6    s;    * 
!
!Frh   r6  c                      \ rS rSr% \R
                   " S S5      5       r0 rS\S'   \	" \R                  5      rSr\SS j5       r\ S       SS	 jj5       r\SS
 j5       rSrg)r  i  c                  *    \ rS rSr% S\S'   S\S'   Srg)CUDACodeCache.CacheEntryi  r   r  r  rb   Nr   r   r   r   rc  r   rb   rh   rf   
CacheEntryr`        rh   rb  Dict[str, CacheEntry]r   rp   c                b    [        [        S/SU5      5      n[        XR                  US9u  pEXE4$ z
Writes source code into a file with dst_file_ext as the file extension.
Returns the hash key of source code, and the path to the file.
dummy_inputdummy_outputr  )r7  r4  r>  _SOURCE_CODE_SUFFIXr  r  r+  cuda_commandr   r  s         rf   r>  CUDACodeCache.write  ?      -.,O
  00
 rh   Nc                   U R                  X5      u  pEX@R                  ;  GaZ  SSKJn  [	        5       nU" [
        R                  R                  XtS-   5      [        S9nU   US[        U R                  5      *  U-   n	[
        R                  R                  U	5      (       d  [        U/XU5      n
[        5       n[        R                  SU
5        U
R!                  S5      n ["        R$                  " U["        R&                  [
        R(                  S9  [        5       nS	X-
   S
U
 3n[        R3                  U5        O[        R                  SU5        [4        R7                  XY5      U R                  U'   SSS5        U R                  U   R8                  XE4$ ! ["        R*                   a&  n[,        R.                  " XR0                  5      UeSnAff = f! , (       d  f       Nf= f)z
Compiles CUDA source_code into a file with dst_file_ext extension.
Returns a tuple of dst_file_path, hash_key, source_code_path
r   rF  rH  rI  NzCUDA Compilation: %srl  )r  envzCUDA Compilation took  seconds. Compile command: z8CUDA Compilation skipped: %s since output already exists)r>  r   rq  rG  r  r   r   r   rr  r  ri  r  r4  r   r  r@  r  r  r  r  r  r  r#   CUDACompileErrorr  rN  r  rb  r  )r  r  r+  r,  r   r  rG  r  r  r  r  
start_time	cmd_partserrorend_timelog_duration_msgs                   rf   r  CUDACodeCache.compile  s    ))K>ii)#~HBGGLL=A<XD()HC0G0G,H+HILXww~~k22.#kC "&JII4c: #		#IW"//%j.?.?RZZ
  $vH)?@U?VVqruqv'w$HH-.IIR" "/!9!9*!R		#/ 2 		#**C<< &88 W!229llKQVVW s2   !B G"2FA"GG-!GGG
G$c                r    US:w  a  [        SU SU 35      eU R                  X5      u  p4n[        U5      XE4$ zr
Compiles source code and loads the generated .so file.
Returns a tuple of DLLWrapper, hash_key, source_code_path
r  zCOnly support loading a .so file for now. Requested file extension: z. Source code: r   r  r6  r  r  r+  dst_file_pathr   source_code_paths         rf   r   CUDACodeCache.load%  Z     4--9N/+X  58KK5
1!1 =)8FFrh   rb   r  r   r+  r   r   r  ra   r  r   r+  r   r,  Optional[List[str]]r   Tuple[str, str, str]r  r   r+  r   r   zTuple[DLLWrapper, str, str])r   r   r   r   rm  	dataclassrb  r   rc  r   r  r  ri  r	  r>  r  r   r   rb   rh   rf   r  r    s       $&E %u{{+K  TX&=&=-0&=>Q&=	&= &=P G Grh   r  c                      \ rS rSr% \R
                   " S S5      5       r0 rS\S'   \	" \R                  5      rSrSr\SS j5       r\ S       SS
 jj5       r\SS j5       rSrg	)r~  i7  c                  *    \ rS rSr% S\S'   S\S'   Srg)ROCmCodeCache.CacheEntryi9  r   r  r  rb   Nra  rb   rh   rf   rb  r  9  rc  rh   rb  rd  r   r  Fc                b    [        [        S/SU5      5      n[        XR                  US9u  pEXE4$ rf  )r7  r&   r>  ri  rj  s         rf   r>  ROCmCodeCache.writeC  rm  rh   Nc                `   U R                   (       d6  SU l         [        R                  [        [	        [        5       5      5      5        U R                  X5      u  pEX@R                  ;  Ga\  SSKJ	n  [        5       nU" [        R                  R                  XtS-   5      [        S9nU   US[        U R                   5      *  U-   n	[        R                  R#                  U	5      (       d  [%        U/XU5      n
['        5       nU
R)                  S5      n [*        R,                  " U[*        R.                  S[        R0                  S9n[        R                  S	U5        ['        5       nS
X-
   SU
 3n[        R;                  U5        O[        R                  SUU	5        [<        R?                  XY5      U R                  U'   SSS5        U R                  U   R@                  XE4$ ! [*        R2                   a&  n[4        R6                  " XR8                  5      UeSnAff = f! , (       d  f       Nf= f)z
Compiles source_code into a file with dst_file_ext extension,
using the compile command specific for the ROCm platform.
Returns a tuple of dst_file_path, hash_key, source_code_path
Tr   rF  rH  rI  Nrl  )r  rB  ro  zCompilation output: %szCompilation took rp  z+Skip compiling %s: output %s already exists)!_logged_compiler_versionr  r@  r?   r   r'   r>  r   rq  rG  r  r   r   r   rr  r  ri  r  r&   r   r  r  r  r  r  r  r#   rq  r  rN  r~  rb  r  )r  r  r+  r,  r   r  rG  r  r  r  r  rr  rs  r  rt  ru  rv  s                    rf   r  ROCmCodeCache.compileR  s    +++/C(II/MO0DEF))K>ii)#~HBGGLL=A<XD()HC0G0G,H+HILXww~~k22.#kC "&J #		#I	W!+!8!8%#-#4#4!% "

	" 		":FC  $vH):8;P:QQlmplq'r$HH-.IIE"#
 "/!9!9*!R		#7 : 		#**C<< &88 W!229llKQVVW! s3   (A*HA	G"A#H"H6!HHH
H-c                r    US:w  a  [        SU SU 35      eU R                  X5      u  p4n[        U5      XE4$ ry  rz  r{  s         rf   r   ROCmCodeCache.load  r  rh   rb   r  ra   r  r  )r   r   r   r   rm  r  rb  r   rc  r   r  r  ri  r  r	  r>  r  r   r   rb   rh   rf   r~  r~  7  s       $&E %u{{+K$  TX/=/=-0/=>Q/=	/= /=b G Grh   r~  c                      \ rS rSrSS jrSrg)CodeCacheFuturei  c                    [         era   )r  r   s    rf   r  CodeCacheFuture.result  s    !!rh   rb   Nr   )r   r   r   r   r  r   rb   rh   rf   r  r    s    "rh   r  c                  @    \ rS rSr% S\S'         SS jrS	S jrSrg)
TritonFuturei  r   r4  c                    Xl         X l        g ra   )r4  r  )r   r4  r  s      rf   r   TritonFuture.__init__  s    
 rh   c                    U R                   b@  U R                   R                  5       nUb   eS U l         U R                  R                  5         U R                  $ ra   )r  r  r4  
precompile)r   r  s     rf   r  TritonFuture.result  sI    ;;"[['')F>!>DKKK""${{rh   )r  r4  N)r4  r   r  zOptional[Future[Any]]r   r   )r   r   )r   r   r   r   rc  r   r  r   rb   rh   rf   r  r    s.     & 
	rh   r  c                  (    \ rS rSrSS jrSS jrSrg)LambdaFuturei  c                    Xl         g ra   	result_fn)r   r  s     rf   r   LambdaFuture.__init__  s    "rh   c                "    U R                  5       $ ra   r  r   s    rf   r  LambdaFuture.result  s    ~~rh   r  N)r  Callable[..., Any]r   r   )r   r  )r   r   r   r   r   r  r   rb   rh   rf   r  r    s    # rh   r  )rd   r   re   r   r   r   )r   r  r  r  )r   r   r   r   )r!  r&  r   r   r  )r'  Union[str, bytes]r(  r   r   r   )r.  r   r/  r   r0  r   r   r  )rr   r'  )r8  r  r(  r   r9  r   r   r   )rr   r'  rr   )r8  r  r/  r   r(  r   r9  r   r0  r   r   r  )rB  r   r   r   )FF)
rW  r   r8  r  r   r  r=  r  r   r   )rf  r1   r   r1   )rn  r   r   rO   )r  zList[str] | Noner  r   r  zhashlib._Hashr   r   )r   r&  )
r  r  r  r<  r  r3   r  r=  r   zTuple[str, List[str]])rR  rK  r   rK  )r  r   r   r   )r   r   r   r  )r  zUnion[str, List[str]]r  r   r  r  r   r   )r   r   rd   r   r   zUnion[list[c_void_p], c_void_p])
r  r   r  r<   r  r   r  r   r   r   )r  r   r  zList[partial[Any]]r   r   )r  r   )r   r  )r   r  ra   )
r)  r  r*  r   r+  r   r,  r  r   r   )
__future__r   r  ry  rm  r   r   r  rt  r  r   loggingr   rn  r  r  r  rT  r|  r  r{   r  r  r7  rO  r  bisectr   r   rY  r   r   r   datetimer	   r
   pathlibr   r   r   typesr   typingr   r   r   r   r   r   r   r   r   r   r   r   r   rw   torch.distributedrH  rO  r   r   torch._dynamo.utilsr   r   r    r!   torch._inductorr"   r#   r$   torch._inductor.codegen.cudar%   ,torch._inductor.codegen.rocm.compile_commandr&   r'   !torch._inductor.custom_graph_passr(   r)   torch._inductor.output_coder*   torch._utils_internalr+   torch.compilerr  rt  r-   runtimer.   runtime.autotune_cacher/   triton_bundlerr0   r1   collections.abcr2   r  r3   r4   rm   r5   r6   r7   r8   r"  r9   torch._inductor.cpp_builderr:   r;   r<   r=   r>   r?   r@   rA   torch._inductor.cpu_vec_isarB   %torch._inductor.runtime.compile_tasksrC   rD   rE   %torch._inductor.runtime.runtime_utilsrF   rG   torch._inductor.utilsrH   rI   rJ   rK   torch._loggingrL   torch._subclasses.fake_tensorrM   rN   rO   %torch.fx.experimental.symbolic_shapesrP   rQ   rR   concurrent.futuresrS   torch._inductor.graphrU   torch._inductor.irrV   torch._inductor.runtime.hintsrW   rX   r   r|  r  _HEREr  r  r   r  r2  rS  r  	triton.fbrZ   triton.fb.buildr[   torch._inductor.fb.utilsr\   r]   r^   r_   _logginggetArtifactLoggerr   r  rr  	getLoggerr  r   r   r   r   r   r   r   r  r"  r*  r2  r:  r>  rC  r   r  r]  rg  rp  Picklerrr  r  r  r  r  ro  r  r  rD  rU  rW  r  r  r  r  r  r  r.  r  r  rc  r  r  r  r  r:  r=  r  r  r  r  r   r  r  r  r$  r4  r6  r  r~  r  r  r  rb   rh   rf   <module>r     s   "       	    	   	     
        ' '                1 0 1 S 9 2 , & # 8 ) CL (=A5 	 	 	 5 
 O  , 
 O N )3/I 	!ggoobggooe45k+@Allg%	%2  ..228]Kllg% !'G T B
 B
J' '2Xi XvX
* 9;""!"25"" CI;;'*;<?;;   	
  $! 	  	
 
>   	t&.. tnVV%(V2?V	V  T X  XF%    n" n"b'   #	
 8!8i iX< T       6F Fb 
.  .(;%;47;>G;	;/6%/647/6>G/6	/6d  . *HZ u: u: u:p$$$ $ 	$
 
$0 l; l; l;^ >4 > >B [0 [ [|
#L  \( \( \(~T T	 .)` '+	""" " $	"
 	"JH HV QG QG QGh [G [G [G|" "
? * ?  rh   