
    a,j'                        d Z ddlZddlmZ ddlmZmZ ddlmZ ddl	Z	ddl
Z	ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZmZmZmZmZ ddlmZ ddl m!Z! de	j"        j#        de$e%         fdZ&de	j"        j'        de(e	j)        e	j"        j*        f         fdZ+de	j"        j'        de%de,dz  fdZ-de	j"        j'        de%de,dz  fdZ.de	j"        j'        de%fdZ/de	j"        j'        de0e,dz           fdZ1de	j"        j'        dee         defdZ2 G d d          Z3 e!d e3                       	 	 d'd!ed"ef         d#ee         d$e4d%e4ded"ee         f         f
d&Z5dS )(a  
This module implements CUDA graphs support for TorchDynamo backends.

CUDA graphs allow for capturing and replaying GPU operations, which can significantly
reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:

- CUDA graph creation and management for both forward and backward passes
- Input mutation detection and handling
- Device compatibility checking
- Stack trace management for debugging
- Integration with TorchInductor's cudagraph trees

The backend supports two main modes:
1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking

Key components:
- CudagraphsBackend: Main backend class for CUDA graph integration
- Mutation detection utilities to ensure graph safety
- Device mapping and compatibility checks
- Stack trace collection for debugging
    N)defaultdict)CallableSequence)Any)config)aot_autograd)	boxed_nop)BoxedDeviceIndex'check_multiple_devices_or_any_cpu_nodesformat_default_skip_messageget_mutation_stack_traceget_placeholder_info#log_cudagraph_skip_and_bump_counter)	BoxedBoolcount_tangents%get_first_incompatible_cudagraph_nodenum_fw_fixed_argumentsoutput_node)StorageWeakRef   )register_backendgreturnc           	      n   dt           t          t          f         dt          fd}t          t                    }d}t	                      }| j        D ]c}|j        dk    rvt           ||j                  t          j
                  rH|t           ||j                                                                                         |           |dz  }|j        dk    rt          |j        d          s|j        j        }t#          |j                  D ]\  }}|t'          |j                  k     r|j        |         }	n!|j        |j        vr:|j        |j                 }	d	}
|j        r|j        j        rd
}
|
r8||t           ||	j                                                                     z  }e|S )Nmetar   c                 *    d| v r| d         n| d         S )Nvalfake_result )r   s    a/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/torch/_dynamo/backends/cudagraphs.pymeta_fkz%find_input_mutations.<locals>.meta_fk7   s    #tmmtE{{m1DD    r   placeholderr   call_function_schemaFT)dictstrr   r   setnodesop
isinstancer   torchTensorr   _typed_storageaddhasattrtargetr%   	enumerate	argumentslenargsnamekwargs
alias_infois_write)r   r!   inputs	input_idxmutated_inputsnschemaiargargumentmut_args              r    find_input_mutationsrC   6   s   Ed38n E E E E E FIUUNW  4=  ''!&//5<88 X~ggafoo&D&D&F&FGGHLLYWWWNIIT_$$18Y// X%F#F$455  3s16{{?? vayHHxqx//  x1H> '~. '"&  #f&wwx}'='='L'L'N'NOO' N
 r"   gmc                     i }| j         j        D ]J}|j                            dd           }t	          |t
          j                  r|j        |vr
|||j        <   K|S )Nr   )graphr)   r   getr+   r,   r-   device)rD   device_node_mappingr=   ts       r    get_device_node_mappingrK   ]   sh     >@X^ . .FJJud##a&& 	.18;N+N+N,-)r"   	aot_model	num_fixedc                     t          | j                  t          t          |                    z
  }|sd S t	          | j                  }t          ||          S N)rC   rF   r(   ranger   r   )rL   rM   mutation_indicesplaceholderss       r    3check_for_mutation_ignore_cuda_graph_managed_tensorrS   h   sV     ,IO<<s5CSCS?T?TT t'	88L#L2BCCCr"   c                     t           j        st          | |          x}r|S t          t	          |                     x}r|S t          |           x}rt          d|j         d          S d S )Nzincompatible op ())r   (cudagraph_backend_support_input_mutationrS   r   rK   r   r   r6   )rL   rM   mut_skipskipnodes        r    check_for_skiprZ   s   s    : Jy
 
 
8 	 O6	**  t  4Y???t M*+Kty+K+K+KLLL4r"   c                 |    t          t          t          |                               }|j        dk    sJ |j        S )Ncuda)nextiterrK   typeindex)rD   rH   s     r    get_device_indexra      s;    $.r223344F;&    <r"   c                     t          |           }t          |j                  dk    sJ |j        d         }t          |d          sg S d |D             S )Nr   r   __iter__c                 h    g | ]/}t          |t          j        j        j                  r|j        nd 0S rO   )r+   r,   fxrY   Nodestack_trace).0r@   s     r    
<listcomp>z$get_stack_traces.<locals>.<listcomp>   sD        'sEHM,>??	IT  r"   )r   r4   r5   r0   )rD   outputr5   s      r    get_stack_tracesrk      sk    __Fv{q    ;q>D4$$ 	    r"   dynamo_modeldynamo_inputsc           	         ddl m t          d          t          d           	 ddt          j        j        dt          t                   dt          dt          ffd	}dt          j        j        dt          t                   dt          ffd
}t          ||t          j        |d          t          j        j        j                  } ||           S )Nr   )cudagraphify_implTFrL   
aot_inputsis_inferencer   c                    t          | |          }t          t          
          t          |                    }t          | |          x}r(t	          j        	           t          d|            |S                     t          |                       ||t          |          j
        d|t          |           t          | j                  t          | j                  	  	        }d|_        |S )Nskipping cudagraphs due to Fdevice_indexis_backwardrq   stack_tracesrR   mutated_input_idxsT)r	   r   r4   rZ   r   disabler   r(   ra   rP   valuerk   r   rF   rC   _boxed_call)rL   rp   rq   interpfixedskip_msgoutboxed_device_indexro   do_cudagraphsrm   s          r    forward_cudagraphsz&cudagraphs.<locals>.forward_cudagraphs   s    
 9j11&s='9'93z??KK%i7778 	m,,,/8h88   M/	::;;;%LL+1%))44-io>>3IODD

 

 

 
r"   c                 "    t           |          }s S t                     }t           |          x}rpt          d|            	j        }|d}t
          j        j                            |d          J dt          t                   dt          f fd}d|_        |S  
||t          |          t                     ddt                     t           j                  t#           j                  		  	        }d|_        |S )
Nrs   r   F)create_if_none_existsr:   r   c                 B                                       |           S rO   )set_to_running_backward)r:   rL   managers    r    fnz3cudagraphs.<locals>.backward_cudagraphs.<locals>.fn   s%    //111 y(((r"   Trt   )r	   r   rZ   r   rz   r,   	_inductorcudagraph_treesget_managerlistr   r{   rP   ra   rk   r   rF   rC   )rL   rp   r|   r}   r~   
device_idxr   r   r   r   ro   r   s   `       @r    backward_cudagraphsz'cudagraphs.<locals>.backward_cudagraphs   sZ    9j11 	y))%i7778 	/8h88  
 ,1J!
o5AA% B  G &&&)49 ) ) ) ) ) ) ) )
 "BNI%LL))44))44-io>>3IODD

 

 

 
r"   )rq   )fw_compilerbw_compilerinference_compilerkeep_inference_input_mutations)F)torch._inductor.cudagraph_treesro   r   r
   r,   re   GraphModuler   r   boolr   	functoolspartial_dynamor   %cudagraph_backend_keep_input_mutation)rl   rm   r   r   aot_cudagraphsr   ro   r   s    `   @@@r    
cudagraphsr      s.   AAAAAAdOOM)$//
 # 8'I  
	        :*8'*59#Y*	* * * * * * * *X "&'$,-?dSSS',}';'a	  N >,666r"   c                   n    e Zd ZdZedd            Zedej        j        de	e
         de
fd            ZdS )	CudagraphsBackendr   r   Nc                  &    ddl m}   |              d S )Nr   reset_cudagraph_trees)r   r   r   s    r    resetzCudagraphsBackend.reset   s)    IIIIIIr"   modelr:   c                 "    t          | |          S rO   )r   )r   r:   s     r    __call__zCudagraphsBackend.__call__   s    %(((r"   )r   N)__name__
__module____qualname__compiler_namestaticmethodr   r,   re   r   r   r   r   r   r"   r    r   r      su         M      \ 
 ), )hsm ) ) ) ) \) ) )r"   r   r   )r6   compiler_fnTr   .r:   copy_outputscopy_inputsc                    t          |t          t          f          sJ rd |D             nt          |          t          j                                         t          j                                        }|                    t          j                                                   t          j        	                    |          5   | |  ddd           n# 1 swxY w Y   |                                 t          j                                                            |           t          j                                         t          j        
                                t          j                            |          5   |  ddd           n# 1 swxY w Y   t          t          t          f          sfdt          dt          t                   ffd}|S )zBThis isn't registered as a backend, but is used in some benchmarksc                 6    g | ]}t          j        |          S r   )r,   
zeros_likerh   xs     r    ri   z$cudagraphs_inner.<locals>.<listcomp>  s#    ===)!,,===r"   N)stream
new_inputsr   c                      t                    t          |           k    sJ r+t          |           D ]\  }}|                    |                                            rd D             S S )Nc                 6    g | ]}|                                 S r   )cloner   s     r    ri   z1cudagraphs_inner.<locals>.run.<locals>.<listcomp>'  s     666!AGGII666r"   )r4   zipcopy_replay)r   dstsrcr   r   rF   static_inputsstatic_outputss      r    runzcudagraphs_inner.<locals>.run   s    =!!S__4444 	z::  S		# 	"66~6666!!r"   )r+   r   tupler,   r\   synchronizeStreamwait_streamcurrent_streamr   	CUDAGraphrF   r   r   )	r   r:   r   r   r   r   rF   r   r   s	     ``  @@@r    cudagraphs_innerr     sN    ftUm,,,,, %==f===V 
JZ  F
uz0022333			6	"	"  v              
	J++F333	J J  ""E			%		/	/ / /./ / / / / / / / / / / / / / /ntUm44 +(*	" 	"# 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" Js$   C  C$'C$FF#&F#)TT)6__doc__r   collectionsr   collections.abcr   r   typingr   r,   torch.fxtorch._dynamor   torch._dynamo.backends.commonr    torch._dynamo.backends.debuggingr	   torch._inductor.cudagraph_utilsr
   r   r   r   r   r   torch._inductor.utilsr   r   r   r   r    torch.multiprocessing.reductionsr   registryr   re   Graphr(   intrC   r   r&   rH   rf   rK   r'   rS   rZ   ra   r   rk   r   r   r   r   r   r"   r    <module>r      s   .     # # # # # # . . . . . . . .                     6 6 6 6 6 6 6 6 6 6 6 6                             < ; ; ; ; ; & & & & & &$EHN $s3x $ $ $ $N	%,
%&   Dx#D03D4ZD D D Deh2 s sTz    $- #    	- 	$sTz2B 	 	 	 	U7UX1 U7(3- U7TW U7 U7 U7 U7p) ) ) ) ) ) ) )   l0A0A0C0C D D D D 	) )CH)SM) ) 	)
 c8C= !) ) ) ) ) )r"   