
    ΑiD                     "   S SK r S SKrS SKJr  S SKJr  S SKrS SKJr  SSK	J
r
  \" \" \R                  " SS5      5      5      r\" \" \R                  " S	S5      5      5      r\R                   " \\ R$                  S
S9rS r     S#S jrS$S jrS rS rS rS rS rS r " S S5      r " S S\5      r " S S5      r " S S5      r S r!S r" " S S \RF                  RH                  5      r% " S! S"\RL                  RN                  5      r(g)%    N)deque)Enum)
log_helper   )	CUDAGraph+PADDLE_DEBUG_ENABLE_CUDAGRAPH_LAYER_LOGGING01PADDLE_DEBUG_CUDAGRAPHEDLAYER_FALLBACK_TO_DEFAULTz[%(levelname)s] %(message)s)fmtc                 F    [         (       d  g [        R                  U 5        g N)enable_debug_printloggerinfoxs    e/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/device/cuda/cuda_graphed_layer.pydebug_printr   &   s    
KKN    c                    / nU(       a  UR                  U5        Uc  S nU c  [        U S35        g [        U [        R                  5      (       a  U(       a<  UR                  SU R
                   35        UR                  SU R                   35        U(       a+  UR                  S[        U R                  5       5       35        U(       a  UR                  SU" U 5       35        [        SR                  U5      5        g g )Nc                 X    [        U R                  S5      S-  R                  5       5      $ )Nfloat32i  )floatastypesum)ts    r   <lambda>print_tensor.<locals>.<lambda>8   s      3d :??ABr   z is Nonezshape = zplace = zptr = zhash = z | )
appendr   
isinstancepaddleTensorshapeplacehexdata_ptrjoin)r   name
print_meta	print_ptr
print_hashhashoutputs          r   print_tensorr.   ,   s     Fd|BytfH%&	Av}}	%	%MMHQWWI./MMHQWWI./MMF3qzz|#4"567MMGDG9-.EJJv&' 
&r   c                 r    [         (       d  g [        UR                  SS5      5        [        [        U 5        g )Nd   -)r   r   centerrecursive_applyr.   )r   banners     r   printerr5   G   s(    c3'(L!$r   c           	      l  ^  [        U[        5      (       a  U Vs/ s H  n[        T U5      PM     sn$ [        U[        5      (       a  [        U 4S jU 5       5      $ [        U[        5      (       a0  UR                  5        VVs0 s H  u  p4U[        T U5      _M     snn$ T " U5      $ s  snf s  snnf )Nc              3   <   >#    U  H  n[        TU5      v   M     g 7fr   r3   ).0itemfunctions     r   	<genexpr>"recursive_apply.<locals>.<genexpr>T   s     K_Xt44s   )r    listr3   tupledictitems)r;   	input_varr:   keyvalues   `    r   r3   r3   P   s    )T""<EFID$/IFF	Iu	%	%KKKK	It	$	$ (oo/
/
 511/
 	

 	"" G
s   B+B0c                     [        U [        R                  5      (       a#  U R                  5       nU R                  Ul        U$ U $ r   )r    r!   r"   detachstop_gradient)tensordetached_tensors     r   detach_tensorrJ   ^   s7    &&--(( --/(.(<(<%Mr   c                 .   ^ / mU4S jn[        X5        T$ )Nc                 h   > [        U [        R                  5      (       a  TR                  U 5        g g r   )r    r!   r"   r   )argrets    r   r   !recursive_flatten.<locals>.appendl   s&    c6==)) JJsO *r   r8   )targetr   rN   s     @r   recursive_flattenrQ   i   s    
C F#Jr   c                 `    / [        U 5      Q[        [        UR                  5       5      5      Q$ r   )rQ   r?   values)argskwargss     r   recursive_flatten_args_kwargsrV   {   s1    	4	 	51	2 r   c                 "    [        [        U 5      $ r   )r3   rJ   r   s    r   r   r      s    ?=!4r   c                 ~    [        U [        R                  5      (       a  U R                  (       a  gU R                  $ g)zWReturns the gradient of a Paddle Tensor if it's a tensor; otherwise, returns the input.N)r    r!   r"   rG   gradr   s    r   get_grad_tensorrZ      s)    !V]]##??66Mr   c                   8    \ rS rSrS rS rS rS rS rS r	Sr
g	)
CUDAGraphWithStaticInputOutput   c                     Xl         [        5       U l        SU l        SU l        S U l        S U l        S U l        S U l        g )NF)	num_warmup_stepsr   graphhas_recordedhas_preserved_inputsargs_statickwargs_staticinputs_staticoutputs_staticselfr_   s     r   __init__'CUDAGraphWithStaticInputOutput.__init__   sC     0[
!$)!! ""r   c                    U R                   (       d9  SU l         Xl        X l        [        U R                  U R                  5      U l        g[        X5      n[        U R                  U5       H  u  pEUR                  US5        M     g)a  
For the CUDA Graph, it is crucial that the buffer remains address-stable,
meaning that the buffer addresses for any inputs to the CUDA Graph should not change.
One solution to achieve this is to preserve all input tensors.

This function attempts to recursively flatten the input arguments and keyword arguments
to identify all tensors passed to the layer (though it may still miss some due to other implicit
ways inputs can be passed to a layer). It then preserves references to these input tensors
as `self.inputs_static` so that the buffer pointers can be reused later.

When this method is called subsequently, it copies the values back to the preserved input tensors
to ensure the buffers are reused.
TN)rb   rc   rd   rV   re   zipcopy_)rh   rT   rU   inputsx_staticr   s         r   preserve_or_copy/CUDAGraphWithStaticInputOutput.preserve_or_copy   sr     (((,D%#!'!>  $"4"4"D 34@F"4#5#5v>q$'  ?r   c                 >   U R                  X#5        U R                  R                  5         U" U R                  0 U R                  D6U l        U R                  R                  5         [        S5        U R                  R                  5         SU l	        U R
                  $ )NzF[CUDAGraph] Record-Replay Start (Graph is replayed for the first time)T)
rp   r`   capture_beginrc   rd   rf   capture_endr   replayra   )rh   frT   rU   s       r   record%CUDAGraphWithStaticInputOutput.record   s    d+

  "!1!1HT5G5GH

 T	
 	

 """r   c                     Xl         g r   )rf   )rh   rf   s     r   set_output_static0CUDAGraphWithStaticInputOutput.set_output_static   s    ,r   c                     U R                   (       d  [        S5      eU R                  X5        [        S5        U R                  R                  5         U R                  $ )NzGraph should be recorded firstz[CUDAGraph] Replay Start)ra   RuntimeErrorrp   r   r`   ru   rf   )rh   rT   rU   s      r   ru   %CUDAGraphWithStaticInputOutput.replay   sJ      ?@@d+./

"""r   c                 l    [         R                  " SU 35        U R                  R                  U5        g )Nzsave graph to )loggingr   r`   print_to_dot_files)rh   r(   s     r   save#CUDAGraphWithStaticInputOutput.save   s'    ~dV,-

%%d+r   )rc   r`   rb   ra   re   rd   r_   rf   N)__name__
__module____qualname____firstlineno__ri   rp   rw   rz   ru   r   __static_attributes__ r   r   r\   r\      s     #(4#-#,r   r\   c                   $    \ rS rSrSrSrSrSrSrg)CUDAGraphLayerStatus   z3Enum to represent the status of a CUDA Graph Layer.r         r   N)	r   r   r   r   __doc__WARMUPRECORD	CUDAGRAPHr   r   r   r   r   r      s    =FFIr   r   c                   ,    \ rS rSrS rS rS rS rSrg)CUDAGraphForwardBackward   c                 n    [        U5      U l        [        U5      U l        [        R                  U l        g r   )r\   forward_graphbackward_graphr   r   statusrg   s     r   ri   !CUDAGraphForwardBackward.__init__   s+    ;<LM<=MN*11r   c                 .    [         R                  U l        g r   )r   r   r   rh   s    r   rt   $CUDAGraphForwardBackward.capture_end   s    *44r   c                 <    U R                   [        R                  :H  $ r   )r   r   r   r   s    r   is_record_step'CUDAGraphForwardBackward.is_record_step       {{29999r   c                 <    U R                   [        R                  :H  $ r   r   r   r   r   s    r   is_cuda_graph_step+CUDAGraphForwardBackward.is_cuda_graph_step       {{2<<<<r   )r   r   r   N)	r   r   r   r   ri   rt   r   r   r   r   r   r   r   r      s    2
5:=r   r   c                   H    \ rS rSrSrS rS rS rS rS r	S r
S	 rS
 rSrg)CUDAGraphContext   z
Manages the context for CUDA graph execution in layers. This includes handling
the state of CUDA graph layers, managing forward and backward graphs, and
tracking the execution steps.
c                     Xl         X l        SU l        [        R                  U l        [        5       U l        [        5       U l        g)z
Initializes the CUDA graph context.
:param layer: The layer to be used in the CUDA graph.
:param num_warmup_steps: Number of warmup steps before recording starts.
r   N)	layerr_   _stepr   r   r   r   
data_queuegraph_queue)rh   r   r_   s      r   ri   CUDAGraphContext.__init__   s<     
 0 
*11  ' !7r   c                     [        U R                  5      S:X  a  [        U R                  5      $ U R                  R	                  5       $ )Nr   )lenr   r   r_   popleftr   s    r   	get_graphCUDAGraphContext.get_graph  s;    t A%+D,A,ABB##++--r   c                 :    U R                   R                  U5        g r   )r   r   )rh   gs     r   reuse_graphCUDAGraphContext.reuse_graph  s    "r   c                 :    U R                   R                  U5        g r   )r   r   )rh   rT   s     r   	push_dataCUDAGraphContext.push_data  s    t$r   c                 6    U R                   R                  5       $ r   )r   r   r   s    r   pop_dataCUDAGraphContext.pop_data  s    &&((r   c                     U =R                   S-  sl         U R                   U R                  :X  a  [        R                  U l        g g )Nr   )r   r_   r   r   r   r   s    r   warmup_stepCUDAGraphContext.warmup_step   s3    

a
::....88DK /r   c                 <    U R                   [        R                  :H  $ r   )r   r   r   r   s    r   is_warmup_stepCUDAGraphContext.is_warmup_step%  r   r   c                 <    U R                   [        R                  :H  $ r   r   r   s    r   r   #CUDAGraphContext.is_cuda_graph_step(  r   r   )r   r   r   r   r_   r   N)r   r   r   r   r   ri   r   r   r   r   r   r   r   r   r   r   r   r   r      s/    #*.#%)9
:=r   r   c                    Su  p#[        U [        R                  5      (       a  XS   p2Oe[        U [        [        45      (       aJ  [        X5       H;  u  pE[        U[        R                  5      (       d  M&  UR                  (       a  M9  XEp2  O   [        U[        R                  5      (       a  [        U[        R                  5      (       d   eX#4$ )N)NNr   )r    r!   r"   r>   r?   rl   rG   )ysdysydyvdvs         r   select_y_with_gradr   ,  s    EA"fmm$$F2	Bu	&	&\EA!V]]++Q___2 " a''Jr6==,I,III5Lr   c                 F   U u  p/ n[        X5       H  u  pEUR                  (       d[  UR                  c1  UR                  [        R
                  " UR                  5      5        MT  UR                  UR                  5        Mq  UR                  S 5        M     [        U5      $ r   )rl   rG   rY   r   r!   zerosr#   r?   )rn   grad_inputsdetached_grad_inputs	args_gradr   
detached_xs         r   get_args_gradr   <  s    (.%KI[?&   j.>.>!?@   1T" @ r   c                   8    \ rS rSrSr\S 5       r\S 5       rSrg)_CUDAGraphedLayeriN  z
A custom layer that integrates CUDA Graph recording and execution into PaddlePaddle's autograd system.
It handles forward and backward operations differently based on the CUDA graph layer status.
c                   ^ Uu  pE[        U5      n[        U5      n[        XE5      nX64n[        US5        TR                  5       (       d  [        (       a_  [        S5        [        R                  " 5          TR                  " U0 UD6nSSS5        TR                  [        R                  SUW45        OTR                  5       n	U	R                  5       (       a`  [        S[        U	5       35        U4S jn
U	R                  R                   " U
/UQ70 UD6nTR                  [        R"                  XU45        OV[        S[        U	5       35        U	R                  R$                  " U0 UD6nTR                  [        R&                  U	SU45        [        S5        U R)                  T5        [        US5        [        U5      $ ! , (       d  f       GNA= f)	z
Handles the forward pass of the layer. It operates differently based on the
context's status: warmup, recording, or CUDA graph step.
zForward inputz"[CUDAGraph] Forward Step (Default)Nz%[CUDAGraph] Forward Step (Record) id c                     > [         R                  " 5          TR                  " U 0 UD6sS S S 5        $ ! , (       d  f       g = fr   )r!   enable_gradr   )rT   rU   contexts     r   forward*_CUDAGraphedLayer.forward.<locals>.forwards  s,    ++-&}}d=f= .--s	   3
Az$[CUDAGraph] Forward Step (Graph) id z[CUDAGraph] Forward Step EndzForward output)rF   rV   r5   r   *debug_cudagraphedlayer_fallback_to_defaultr   r!   r   r   r   r   r   r   r   idr   rw   r   ru   r   save_for_backward)ctxr   	arg_tupler   rT   rU   r   rn   r   r`   r   s    `         r   r   _CUDAGraphedLayer.forwardT  s    !d|<TJ4$o6""$$99<=##%MM4262 & 3::D&!LM%%'E##%%CBuI;OP> ''..wHHH!!)00%C B2e9+NO''..??!!)33UD!D 	23g&#$ay? &%s   6G
Gc                 4   U R                  5       u  nUR                  5       u  p4pV[        Xa5      u  px[        Xx4S5        U[        R
                  :X  a8  [        S5        UR                  U5        [        U5      n	UR                  5         OU[        R                  :X  a~  [        S[        U5       35        S n
UR                  R                  XU5        [        U5      n	UR                  R                  U	5        UR                  5         UR!                  U5        OcU[        R"                  :X  aD  [        S[        U5       35        UR                  R%                  Xx5      n	UR!                  U5        O['        S5      e[        S5        [        U	S5        U	$ )	z
Handles the backward pass of the layer. Similar to forward, it handles
backward based on the context's status: warmup, record, or CUDAGraph.
zBackward inputz#[CUDAGraph] Backward Step (Default)z&[CUDAGraph] Backward Step (Record) id c                 &    U R                  U5        g r   )backward)r   r   s     r   r   ,_CUDAGraphedLayer.backward.<locals>.backward  s    

2r   z%[CUDAGraph] Backward Step (Graph) id zUnknown cuda graph statusz[CUDAGraph] Backward Step EndzBackward output)saved_tensorr   r   r5   r   r   r   r   r   r   r   r   r   rw   rz   rt   r   r   ru   r}   )r   r   r   r   r`   rn   r   r   r   r   r   s              r   r   _CUDAGraphedLayer.backward  sd    %%'
&-&6&6&8#"2+)*)000=> JJrN%f-I!+222@ELM   ''R8 &f-I  229=&+555?5	{KL ,,33A:I&:;;34	,-r   r   N)	r   r   r   r   r   staticmethodr   r   r   r   r   r   r   r   N  s0    
 3 3j 0 0r   r   c                   r   ^  \ rS rSrSrS	S\R                  R                  4U 4S jjjrS r	S r
S rSrU =r$ )
CUDAGraphedLayeri  a  
CUDAGraphedLayer: A PaddlePaddle Layer to convert an eager mode model to utilize CUDA Graphs.

CUDA Graphs provide a way to capture kernel-level operations of a model and play
them back efficiently, allowing for potential speedups in repetitive computations,
such as those during training iterations. This layer is a wrapper that enables
the usage of CUDA Graphs with PaddlePaddle models.

Overview:
- The layer encapsulates another layer (the model to be converted).
- During the first few (num_warmup_steps) iterations, the layer operates in
  eager mode without any CUDA Graphs.
- After the warmup steps, the layer captures the forward and backward computations
  and replays them using CUDA Graphs in subsequent iterations.

Usage:
    model = Model()
    graphed_model = CUDAGraphedLayer(model)

Parameters:
- layer (paddle.nn.Layer): The PaddlePaddle model/layer to be converted.
- num_warmup_steps (int): The number of iterations before the CUDA Graph
  capture begins. Default is 3.

Notes:
- Restrictions:
    * CPU-GPU Synchronization: Operations that synchronize the CPU with the GPU, like device to host transfers, are not allowed.
    * CPU Work: Any operations on the CPU within the captured graph are not recorded.
    * Memory Address (Pointer) Consistency: Replays consistently read from and write to identical virtual memory addresses.
    * Dynamic Operations:
        - Control Flow: Dynamic control flows, especially those based on CPU data like if/else statements, are prohibited.
        - Tensor Shapes: Dynamic tensor shapes are not supported.

- Allowed Operations:
    * CUDA RNG Operations: CUDA-based Random Number Generation operations are allowed.
r   c                    > [         TU ]  5         [        X5      U l        U R	                  S[        U5      R                   3U5        g )NzGraphed )superri   r   r   add_sublayertyper   )rh   r   r_   	__class__s      r   ri   CUDAGraphedLayer.__init__  s<    '@HT%[%9%9$:;UCr   c                 ^    [        X5      n[        R                  " U R                  X4/UQ76 $ r   )rV   r   applyr   )rh   rT   rU   r   s       r   r   CUDAGraphedLayer.forward  s2    3DA &&LL4.
+6
 	
r   c                 6    U R                   R                  5       $ r   )r   r   r   s    r   r   CUDAGraphedLayer.is_warmup_step  s    ||**,,r   c                 6    U R                   R                  5       $ r   )r   r   r   s    r   r   #CUDAGraphedLayer.is_cuda_graph_step  s    ||..00r   )r   )r   )r   r   r   r   r   r!   nnLayerri   r   r   r   r   __classcell__)r   s   @r   r   r     s9    #JDfiioo D D

-1 1r   r   )UnnamedTFTN)r5   ))r   oscollectionsr   enumr   r!   paddle.baser   graphsr   boolintgetenvr   r   
get_loggerr   INFOr   r   r.   r5   r3   rJ   rQ   rV   rF   rZ   r\   r   r   r   r   r   autogradPyLayerr   r   r   r   r   r   r   <module>r
     s!    	    "  		?EF  .2		EsKL. * 
		gll =

 
	(6%#$ 
5G, G,V4 = = 6= 6=r $m// m`61vyy 61r   