
    Αif                         S SK r S SKrS SKJr  S SKJs  Jr  S r	S r
S rS r " S S\R                  R                  5      rg)    Nc                 T  ^ U R                   nX;   d   SU SU 35       eU R                  nUR                  U5      nU R                  nU4S jmT" XS5      n[	        [        U5      5       Vs/ s H  owU:w  d  M
  UPM     nnU Vs/ s H  n[	        X7   5      PM     n	n/ n
[        R                  " U	6  Hf  n/ n[	        X4   5       H?  n[        U5      nUR                  XG5        UnU H  nX   nM	     UR                  U5        MA     U
R                  U5        Mh     U
$ s  snf s  snf )Nzdim 'z' not in mesh.dim_names c           	         > U(       d  U S   $ [        [        U 5      US   -  5      n[        US   5       Vs/ s H  nT" XU-  US-   U-   USS  5      PM     sn$ s  snf )Nr      )intlenrange)flatshapestepinests       j/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/auto_parallel/ring_conv.pyr   $_get_comm_group_by_dim.<locals>.nest   ss    7N3t9a() 58_
$ $h!a%40%)<$
 	
 
s   !A)	dim_namesr
   indexprocess_idsr   r   	itertoolsproductlistinsertappend)meshdimr   r
   dim_idxidsmesh_ndr   
other_axesother_rangescomm_groupsr   groupidxvaljr   s                   @r   _get_comm_group_by_dimr$      s#   IMuSE)A)MMJJEooc"G


C
 3G"3u:.?.w,!.J?-78ZE%(OZL8K""L1u~&Au+CJJw"Cf LL ' 	5! 2  @8s   -	D :D D%c                 V   US:X  a  SnOSnS n[        U5       H0  u  pVU[        R                  " U5      :X  d  M!  U R                  U   n  O   U(       d  [	        SU SU S35      e[        X5      n[        R                  " 5       nU H  n	X;   d  M
  XI4s  $    [        SU SU  S35      e)	NNCHW      zInput tensor placements z" do not contain a Shard on W axis:.zRank z1 not found in any tensor parallel group for mesh )	enumeratedistShardr   
ValueErrorr$   get_rankRuntimeError)
x_meshx_placementsdata_format
shard_axis	axis_namer   	placement	tp_groupsrankr    s
             r   _get_conv_tp_groupr8   ;   s    f

I!,/

:..((+I 0
 &|n4VWaVbbcd
 	
 'v9I==?D=##  
vFvhaP     c                    [        U5      S:X  a  U $ [        U R                  5      S:X  d"  [        S[        U R                  5       S35      eUS:X  a  SnOUS:X  a  SnO[        S	U S
35      e[        S 5      /S-  n	[        U* S 5      X'   U [	        U	5         R                  5       n
[        S 5      /S-  n[        S U5      X'   U [	        U5         R                  5       n[        R                  " U5      n[        R                  " U
5      n[        R                  " [        R                  X5      n[        R                  " [        R                  X5      n[        R                  " [        R                  X5      n[        R                  " [        R                  X5      n[        R                  " UUUU/5      nU H  nUR                  5         M     XVS   :X  a&  [        R                  " X/US9nUR                  5       $ XVS   :X  a&  [        R                  " X/US9nUR                  5       $ [        R                  " UU U/US9nUR                  5       $ )Nr      zAInput tensor is expected to be 4D for NCHW/NHWC formats, but got zD.r&   r'   NHWCr(   Unsupported data_format: . Must be 'NCHW' or 'NHWC'.r   )axis)r   r
   r-   slicetuple
contiguouspaddle
zeros_liker+   P2POpisendirecvbatch_isend_irecvwaitconcat)local_input_tensorhalo_width_to_receive_from_left halo_width_to_receive_from_rightleft_neighbor_rankright_neighbor_rankcurrent_rankconv_tp_groupr2   width_dim_idxslices_for_send_rightsegment_to_send_rightslices_for_send_leftsegment_to_send_leftbuffer_for_halo_from_rightbuffer_for_halo_from_leftop_isend_to_rightop_isend_to_leftop_irecv_from_rightop_irecv_from_leftp2p_requestsreqreconstructed_tensors                         r   _ring_conv_halo_exchangera   X   s    =Q!! 	$$%*-3345R9
 	

 f		'}4OP
 	

 #4[MA-+0	(($,( /#$jl 
 "$K=1,*/.+' ."#jl  "(!2!23G!H & 1 12G H



) zz

( **

. 

- ))		
L 
  Q''%}}<= 
&  **,,! 
r*	*%}}&;- 
  **,,  &}})"*
  
  **,,r9   c                    [        U5      S:X  a  U $ US:X  a  SnOUS:X  a  SnO[        SU S35      e[        S 5      /S-  n	[        U* S 5      X'   U [        U	5         R	                  5       n
[        S 5      /S-  n[        S U5      X'   U [        U5         R	                  5       n[
        R                  " U
5      n[
        R                  " U5      n[        R                  " [        R                  X5      n[        R                  " [        R                  X5      n[        R                  " [        R                  X5      n[        R                  " [        R                  X5      n[        R                  " UUUU/5      nU H  nUR                  5         M     U nXVS	   :X  a{  [        S 5      /S-  n[        S U* 5      UU'   U[        U5         n[        S 5      /S-  n[        U* S 5      UU'   U[        U5         nUR                  U5        UR	                  5       $ XVS
   :X  ay  [        S 5      /S-  n[        US 5      UU'   U[        U5         n[        S 5      /S-  n[        S U5      UU'   U[        U5         nUR                  U5        UR	                  5       $ [        S 5      /S-  n[        X* 5      UU'   U[        U5         n[        S 5      /S-  n[        U* S 5      UU'   U[        U5         nUR                  U5        [        S 5      /S-  n[        S U5      UU'   U[        U5         nUR                  U5        UR	                  5       $ )Nr   r&   r'   r<   r(   r=   r>   r;   r   r@   )r   r-   rA   rB   rC   rD   rE   r+   rF   rG   rH   rI   rJ   add_)local_gradient_tensorhalo_width_send_lefthalo_width_send_rightrO   rP   current_process_rankrR   r2   rS   rT   rU   rV   rW   buffer_for_gradient_from_leftbuffer_for_gradient_from_rightrZ   r[   r\   r]   r^   r_   processed_gradient_tensorcrop_slices
agg_slicestarget_sliceagg_slices_right_edgetarget_slice_rightagg_slices_left_edgetarget_slice_lefts                                r   _ring_conv_halo_aggregaterr      s    =Q$$f		'}4OP
 	

 #4[MA-+0	,( 2#$jl  "$K=1,*/"+' 1"#jl 
 %+$5$56K$L!%+%6%67K%L"



) zz

( **

2 

1 ))		
L 
  !6Q//T{ma'%*42G1G%HM"$=+%
! Dk]Q&
$)+?*?$F
=!0z1BC89Z %//11W 
r!2	2T{ma'%*+?%FM"$=+%
!
 Dk]Q&
$)$0E$F
=!0z1BC78< %//115 T{ma'%* "8&
M" %>+%
!
 "'t 1/4!!40
m, 7'(
 	 >? !&d}q0.3D:O.P]+5&'
 	<=$//11r9   c                   ^    \ rS rSr\ SS j5       r\        SS j5       r\S 5       rSrg)	
RingConv2di@  c           	         SnSnUS:X  a  SnSnOUS:X  a  SnSnO[        SU S35      eUS   nUS   n	US   n
X   nX   nUS:w  a  [        S	U S
U S35      eU	S:X  a9  X-  S:w  a  [        SU SU
 SU S35      eX:w  a  [        SU
 SU SU S35      e gU
S:w  a  [        SU	 SU
 S
U S35      eUS-  U:  a  [        SUS-   SU SU S35      eg)Nr@   r&   r'   r<   r(   zUnsupported data_format 'z'. Expected 'NCHW' or 'NHWC'.r   z`Only dilation=1 on the W-dimension is supported for tensor-parallel convolution. Got dilation_w=z (data_format='z').r   zWhen padding_w=0, input_W=z must be divisible by stride_W=z/ for tensor-parallel convolution (data_format='zWhen padding_w=0, stride_W=z must equal kernel_W=z& to avoid halo exchange (data_format='zWhen padding_w=zC, stride_W must be 1 for tensor-parallel convolution. Got stride_W=zHalf of kernel_W (z) must not exceed input_W=z* to ensure halo region fits (data_format='T)r-   r/   )
input_sizekernel_sizestridepaddingdilationr2   idx_w_inputidx_w_kernel
dilation_w	padding_wstride_winput_wkernel_ws                r   _is_supportedRingConv2d._is_supportedA  s    &  KLF" KL+K=8UV  a[
AJ	!9),? "",_[MN 
 > !Q&"0	9XYaXb cEEPMQTV  #"1(;PQYPZ [<<G=M  $.  1}"%i[ 1$$,:_[MN  1}w&"(Q7QRYQZ [@@K}CQ 
 r9   Nc           
      	   [         R                  " 5       n[        R                  UR                  UR                  XEXy5      (       d   eUR                  5       (       d   S5       eUR                  5       (       du  [        [        UR                  5      5       Vs/ s H  n[         R                  " 5       PM     nn[         R                  R                  R                  X!R                  U5      nUb  UR                  5       (       du  [        [        UR                  5      5       Vs/ s H  n[         R                  " 5       PM     nn[         R                  R                  R                  X1R                  U5      nU R                  XU5        UR                  nUR                  n[         R                  R                  R                  XU5      n[         R                  R                  R                  X"R                  UR                  5      nUb>  [         R                  R                  R                  X3R                  UR                  5      nUUUUUU	4U l        [#        UUU	5      u  nnUS   S:X  d  [        U5      S::  a(  [$        R&                  R)                  UUUUUUUU	5      nGOU	S:X  a  SnSnOU	S:X  a  SnSnO[+        SU	 S	35      eUR                  U   nUS-
  nUS-  nUU-
  nUU-   U:X  d   eUU l        UR/                  U5      nUUS-   [        U5      -     nUUS-
  [        U5      -     n[1        UUUUUUUU	5      n[$        R&                  R)                  UUUUUUUU	5      nUS   nUR                  U   nU	S:X  aN  UUS   :X  a  US S 2S S 2S S 2S UU-
  24   nO}UUS
   :X  a  US S 2S S 2S S 2US 24   nObUS S 2S S 2S S 2UUU-
  24   nOMUUS   :X  a  US S 2S S 2S UU-
  2S S 24   nO/UUS
   :X  a  US S 2S S 2US 2S S 24   nOUS S 2S S 2UUU-
  2S S 24   nUU l        UU l        UU l        UU l        [         R                  R                  R                  UUU5      nUR;                  5       $ s  snf s  snf )Nz.Input tensor `x` must be a distributed tensor.r   r   r&   r'   r<   r(   r=   r>   r@   )r+   r.   rt   r   r
   is_distr   r   
placements	Replicateauto_parallelapidtensor_from_localprocess_meshsave_for_backwarddtensor_to_localattrsr8   rD   _C_opsconv2dr-   mesh_axis_namer   ra   left_halo_widthright_halo_widthoutput_halo_trim_widthoutput_width_dim_idxrC   ) ctxxweightbiasrx   ry   padding_algorithmrz   groupsr2   channel_dimr7   _weight_placementsbias_placementsr0   r1   r   rR   final_local_resultskernel_width_dim_idxr   kernel_widthkernel_total_halo_spanr   r   rank_idx	next_rank	prev_ranklocal_results_with_halor   width_before_trimmings                                    r   forwardRingConv2d.forward  s    }}''GGV\\6H
 
 	
 
 yy{{LLL{~~*/ALL0A*B!*BQ *B  ! ''++>>(9F DLLNN*/ALL0A*B*BQ *B   %%))<<nnoD 	a.||""33A|L##''88''):):
 %%))::''D
 
	 );L+)
% 1:?c-0A5"(--"6"6!	# f$'($'($&'($'($ /}<WX  "<<(<=L%1A%5"49O5G"%559OOOO!/C$**40H%x!|s=7I&IJI%x!|s=7I&IJI ) 	A '-mm&:&:!	'# &-QZ"$;$A$A$%! f$=++*AH/2HHHJ+' ]2..*A1a!7!88+' +B.1F021 12+' =++*AH/2HHH+' ]2..*A145q8+' +B.1F021 1	+' #2C#3C )?C&';C$"0044GG
 #--//!s   Q>'Rc           
      0
   [         R                  " 5       nU R                  5       u  p4nUR                  nUR                  nUb  UR                  OSnUR                  n	UR
                  n
[         R                  R                  R                  X9U
5      nUR                  nUR
                  n[         R                  R                  R                  XKU5      n[         R                  R                  R                  XR                  UR
                  5      nUbB  UR                  nUR
                  n[         R                  R                  R                  X]U5      nU R                  nUS   nUS   nS nS nS n[        XU5      u  nnUS   S:X  d  [        U5      S::  a'  [        R                  R                  " X4U/UQ76 u  nnGO$UR                  U5      nUUS-   [        U5      -     nUUS-
  [        U5      -     nU R                   nU R"                  nU R$                  nU R&                  n[)        UUUUUUUU5      nUS   nUS:X  a!  UUS   :X  a  SU/n O9UUS   :X  a  US/n O+UU/n O&UUS   :X  a  SUSS/n OUUS   :X  a  USSS/n OUUSS/n [*        R,                  " UU SSUS9n![        R                  R                  " UUU!/UQ76 u  n"nU(       d  [/        U"UUUUUUU5      nUb?  US:X  a  / S	QO/ S
Qn#[        R0                  " UU#SS9nUR3                  UR4                  5      nUb*  [         R                  R                  R7                  UX5      n[        XU5      u  n$n[9        UR:                  5       Hu  u  n%n&U&U$:X  d  M  [         R<                  " [         R>                  R@                  5      UU%'   Uc  MD  [         R<                  " [         R>                  R@                  5      WU%'   Mw     [         R                  R                  R7                  UX5      n[         RB                  " UU[E        [        U5      5       Vs/ s H  n[         RF                  " 5       PM     sn5      nUby  [         R                  R                  R7                  UWW5      n[         RB                  " UU[E        [        U5      5       Vs/ s H  n[         RF                  " 5       PM     sn5      nU(       a  S nU(       a  S nU(       a  S nUb  UUU4$ UU4$ s  snf s  snf )NTr@   r   r   r&   constantg        )modevaluer2   )r   r(   r'   )r   r   r(   )r?   keepdim)$r+   r.   saved_tensorstop_gradientr   r   r   r   r   r   r8   r   rD   r   conv2d_gradr   r   r   r   r   ra   Fpadrr   sumreshaper
   r   r*   r   Partial
ReduceTypekRedSumreshardr   r   )'r   grad_outrQ   r   r   r   x_stop_gradientweight_stop_gradientbias_stop_gradientr0   r1   weight_meshr   	bias_meshr   
conv_attrsr2   ry   grad_xgrad_weight	grad_biasr   rR   r   r   r   r   r   r   r   in_tensor_augmentedr~   padding_listgrad_out_paddedgrad_x_augmentedsum_axestp_axis_namer!   r4   s'                                          r   backwardRingConv2d.backward?  s   }}**,4//%33373CT//||""33A|L))"--##''88!2
 %%))::++X-@-@
 ))I"ooO%%))::D YY
 nQ-	-fKP=1:?c-0A5"(--";";8#&0#FK %**<8H%x!|s=7I&IJI%x!|s=7I&IJI!11O"33%(%?%?"#&#;#;  #; 	#  
If$=#33$%y>L!]2%66$-q>L$-y#9L=#33$%y!Q#7L!]2%66$-q!Q#7L$-y!Q#?Lee'O -3MM,E,E#V_-?I-)k
 #2$#$ !	 $/6$9yyH

8(DII!))$**5I''++>>F -V;Oa'(=(=>NCL()-doo6M6M)N!#&#+/<<8O8O+POC(	 ? ((,,??
 ll',S1B-C'DE'D!T^^'DE
 **..AA9oI +0_1E+FG+Fa!+FGI FKI;	11{""1 F Hs   TT )r&   )Nr   r   Nr   r   r&   r   )	__name__
__module____qualname____firstlineno__staticmethodr   r   r   __static_attributes__r   r9   r   rt   rt   @  sa    HND DL 
 t0 t0l d# d#r9   rt   )r   rD   paddle.distributeddistributedr+   paddle.nn.functionalnn
functionalr   r$   r8   ra   rr   autogradPyLayerrt   r   r9   r   <module>r      sG      !    "J:b-J@2Fd#(( d#r9   