
    x-j                        d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ dd	lmZ e G d
 de                      Ze G d de                      Z G d d          Z G d d          Z G d d          Z G d d          Z G d d          Z G d d          Z G d d          Z eej                  ZddZ dS )    N)IntEnumunique)get_all_custom_device_type)Node)KVClient)KVServer)SingleNodeTopology   )
get_loggerc                   &    e Zd ZdZdZdZdZdZdZdS )
DeviceTyper         r
         N)	__name__
__module____qualname__UNKNOWNCPUGPUXPUDCUNIC     o/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/distributed/auto_parallel/static/cluster.pyr   r       s,        G
C
C
C
C
CCCr   r   c                   2    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
S )LinkTyper   r   r   r
      r   r         N)r   r   r   r   LOCSYSPHBPIXPIBNVLNVBNETr   r   r   r   r   *   s;        G
C
C
C
C
C
C
C
CCCr   r   c                      e Zd Zd Zed             Zed             Zed             Zej        d             Zed             Z	e	j        d             Z	ed             Z
ed	             Ze
j        d
             Z
d Zd Zd Zd Zd Zd ZdS )Meshc                 Z    || _         || _        d | _        d | _        i | _        i | _        d S N)_id_name_type
_full_type	_machines_links)selfidnames      r   __init__zMesh.__init__8   s0    

r   c                     | j         S r.   r/   r5   s    r   r6   zMesh.id@   	    xr   c                     | j         S r.   )r0   r;   s    r   r7   z	Mesh.nameD   
    zr   c                     | j         S r.   r1   r;   s    r   typez	Mesh.typeH   r>   r   c                     || _         d S r.   r@   r5   values     r   rA   z	Mesh.typeL       


r   c                     | j         S r.   r2   r;   s    r   	full_typezMesh.full_typeP   
    r   c                     || _         d S r.   rG   rC   s     r   rH   zMesh.full_typeT       r   c                     | j         S r.   r3   r;   s    r   machineszMesh.machinesX   
    ~r   c                     | j         S r.   r4   r;   s    r   linksz
Mesh.links\   
    {r   c                     || _         d S r.   rM   rC   s     r   rN   zMesh.machines`       r   c                 $    || j         |j        <   d S r.   )r3   r6   )r5   machines     r   add_machinezMesh.add_machined   s    %,wz"""r   c                 8    | j                             |d           S r.   )r3   getr5   r6   s     r   get_machinezMesh.get_machineg   s    ~!!"d+++r   c                 *    t          | j                  S r.   )lenr3   r;   s    r   get_num_machineszMesh.get_num_machinesj   s    4>"""r   c                 2    || j         |j        |j        f<   d S r.   r4   sourcetargetr5   links     r   add_linkzMesh.add_linkm       26T[$+.///r   c                 <    | j                             ||fd           S r.   r4   rZ   r5   rb   rc   s      r   get_linkzMesh.get_linkp       {/666r   c                     | j         | j        d | j                                        D             d | j                                        D             dS )Nc                 6    g | ]}|                                 S r   to_json.0xs     r   
<listcomp>z Mesh.to_json.<locals>.<listcomp>w   s     EEEEEEr   c                 6    g | ]}|                                 S r   ro   rq   s     r   rt   z Mesh.to_json.<locals>.<listcomp>x        ???aaiikk???r   )r6   r7   rN   rR   )r6   r7   rN   valuesrR   r;   s    r   rp   zMesh.to_jsons   sY    'IEEdm.B.B.D.DEEE??4:+<+<+>+>???	
 
 	
r   N)r   r   r   r8   propertyr6   r7   rA   setterrH   rN   rR   rX   r\   r_   rf   rk   rp   r   r   r   r,   r,   7   sx            X   X   X 
[  [   X         X   X _  _- - -, , ,# # #7 7 77 7 7
 
 
 
 
r   r,   c                   d    e Zd Zd Zed             Zed             Zd Zd Zd Z	d Z
d Zd	 Zd
S )	MeshGroupc                 0    i | _         i | _        d| _        d S Nr   )_meshesr4   _global_device_numr;   s    r   r8   zMeshGroup.__init__}   s    "#r   c                     | j         S r.   )r~   r;   s    r   mesheszMeshGroup.meshes   
    |r   c                     | j         S r.   rQ   r;   s    r   rR   zMeshGroup.links   rS   r   c                 $    || j         |j        <   d S r.   )r~   r6   )r5   meshs     r   add_meshzMeshGroup.add_mesh   s     $TWr   c                 8    | j                             |d           S r.   )r~   rZ   r[   s     r   get_meshzMeshGroup.get_mesh   s    |D)))r   c                 2    || j         |j        |j        f<   d S r.   ra   rd   s     r   rf   zMeshGroup.add_link   rg   r   c                 <    | j                             ||fd           S r.   ri   rj   s      r   rk   zMeshGroup.get_link   rl   r   c                 4    | j         }| xj         dz  c_         |S Nr   )r   )r5   curr_device_ids     r   generate_global_device_idz#MeshGroup.generate_global_device_id   s$    01$r   c                     d | j                                         D             d | j                                        D             dS )Nc                 6    g | ]}|                                 S r   ro   rq   s     r   rt   z%MeshGroup.to_json.<locals>.<listcomp>   s     AAAqqyy{{AAAr   c                 6    g | ]}|                                 S r   ro   rq   s     r   rt   z%MeshGroup.to_json.<locals>.<listcomp>   rv   r   )r   rR   )r   rw   rR   r;   s    r   rp   zMeshGroup.to_json   sO    AADK,>,>,@,@AAA??4:+<+<+>+>???
 
 	
r   N)r   r   r   r8   rx   r   rR   r   r   rf   rk   r   rp   r   r   r   r{   r{   |   s        $ $ $
   X   X% % %* * *7 7 77 7 7  

 
 
 
 
r   r{   c                   <   e Zd Zej        ej        ej        gZddZe	d             Z
e
j        d             Z
e	d             Zej        d             Ze	d             Zej        d             Ze	d	             Zej        d
             Ze	d             Zej        d             Ze	d             Zej        d             Ze	d             Zej        d             Ze	d             Zej        d             Ze	d             Zej        d             Zd Zd Zd Zd ZdS )DeviceNc                     || _         || _        || _        || _        d | _        d | _        d | _        d | _        d | _        d | _	        i | _
        d S r.   )
_global_id	_local_id_machine_meshr1   _model
_dp_gflops
_sp_gflops
_hp_gflops_memoryr4   )r5   	global_idlocal_idrW   r   s        r   r8   zDevice.__init__   sU    #!

 r   c                     | j         S r.   r   r;   s    r   r   zDevice.global_id   rI   r   c                     || _         d S r.   r   rC   s     r   r   zDevice.global_id   rK   r   c                     | j         S r.   r   r;   s    r   r   zDevice.local_id   rO   r   c                     || _         d S r.   r   rC   s     r   r   zDevice.local_id   rU   r   c                     | j         S r.   r   r;   s    r   rW   zDevice.machine   
    }r   c                     || _         d S r.   r   rC   s     r   rW   zDevice.machine       r   c                     | j         S r.   r@   r;   s    r   rA   zDevice.type   r>   r   c                     || _         d S r.   r@   rC   s     r   rA   zDevice.type   rE   r   c                     | j         S r.   r   r;   s    r   modelzDevice.model   rS   r   c                     || _         d S r.   r   rC   s     r   r   zDevice.model   s    r   c                     | j         S r.   r   r;   s    r   	dp_gflopszDevice.dp_gflops   rI   r   c                     || _         d S r.   r   rC   s     r   r   zDevice.dp_gflops   rK   r   c                     | j         S r.   r   r;   s    r   	sp_gflopszDevice.sp_gflops   rI   r   c                     || _         d S r.   r   rC   s     r   r   zDevice.sp_gflops   rK   r   c                     | j         S r.   r   r;   s    r   	hp_gflopszDevice.hp_gflops   rI   r   c                     || _         d S r.   r   rC   s     r   r   zDevice.hp_gflops   rK   r   c                     | j         S r.   r   r;   s    r   memoryzDevice.memory   r   r   c                     || _         d S r.   r   rC   s     r   r   zDevice.memory       r   c                 2    || j         |j        |j        f<   d S r.   ra   rd   s     r   rf   zDevice.add_link   rg   r   c                 \    | j         | j        | j        | j        | j        | j        | j        dS )Nr   r   rA   r   r   r   r   r   r;   s    r   rp   zDevice.to_json  s4    IZk
 
 	
r   c                     d}|d| j          d| j         d| j        j         d| j        j         d| j         d| j         d| j         d	| j	         d
| j
         z  }|S )N zglobal_id: z, local_id: z, machine_id: , type: z	, model: z, dp_flops: z, sp_flops: z, hp_flops: z
, memory: )r   r   rW   r6   rA   r7   r   r   r   r   r   )r5   strs     r   __str__zDevice.__str__  s      @T^  @  @  @  @VZVbVe  @  @osoxo}  @  @  IM  IS  @  @  ae  ao  @  @  }A  }K  @  @  Y]  Yg  @  @  sw  s~  @  @  	@
r   c                 *    |                                  S r.   r   r;   s    r   __repr__zDevice.__repr__      ||~~r   r.   )r   r   r   r   r   r   r   NON_ACCELERATOR_TYPEr8   rx   r   ry   r   rW   rA   r   r   r   r   r   rf   rp   r   r   r   r   r   r   r      sx       &NJNJ<NO   &   X         X _  _   X ^  ^   X 
[  [   X \  \   X         X         X         X ]  ]7 7 7	
 	
 	
  
    r   r   c                      e Zd ZdZdZddZed             Zej        d             Zed             Z	e	j        d             Z	ed	             Z
e
j        d
             Z
ed             Zej        d             Zed             Zej        d             Zed             Zej        d             Zed             Zej        d             Zd Zd Zd ZdS )Linkr      Fc                 v    || _         || _        d | _        d | _        d | _        d | _        d | _        || _        d S r.   )_src_tgtr1   
_bandwidth_latency_link_level_hop_topo)r5   rb   rc   topos       r   r8   zLink.__init__  s?    		
	


r   c                     | j         S r.   )r   r;   s    r   rb   zLink.source(  
    yr   c                     || _         d S r.   )_sourcerC   s     r   rb   zLink.source,  r   r   c                     | j         S r.   )r   r;   s    r   rc   zLink.target0  r   r   c                     || _         d S r.   )_targetrC   s     r   rc   zLink.target4  r   r   c                     | j         S r.   r@   r;   s    r   rA   z	Link.type8  r>   r   c                     || _         d S r.   r@   rC   s     r   rA   z	Link.type<  rE   r   c                     | j         S r.   r   r;   s    r   	bandwidthzLink.bandwidth@  rI   r   c                     || _         d S r.   r   rC   s     r   r   zLink.bandwidthD  rK   r   c                     | j         S r.   r   r;   s    r   latencyzLink.latencyH  r   r   c                     || _         d S r.   r   rC   s     r   r   zLink.latencyL  r   r   c                     | j         S r.   r   r;   s    r   hopzLink.hopP  r   r   c                     || _         d S r.   r   rC   s     r   r   zLink.hopT  s    			r   c                     | j         S r.   r   r;   s    r   
link_levelzLink.link_levelX      r   c                     || _         d S r.   r   rC   s     r   r   zLink.link_level\       r   c                 D    | j         | j        | j        | j        | j        dS )N)	source_id	target_idrA   r   r   )rb   rc   rA   r   r   r;   s    r   rp   zLink.to_json`  s*    I|
 
 	
r   c                     d}| j         r| j        n| j        j        }| j         r| j        n| j        j        }|d| d| d| j         d| j         d| j         
z  }|S )Nr   zsource_global_id: z, target_global_id: r   z, bandwidth: z, latency: )r   rb   r   rc   rA   r   r   )r5   r   r   r   s       r   r   zLink.__str__i  s    #':HDKK4;3H	#':HDKK4;3H	  XI  X  X9  X  XVZV_  X  Xnrn|  X  X  JN  JV  X  X  	X
r   c                 *    |                                  S r.   r   r;   s    r   r   zLink.__repr__p  r   r   N)F)r   r   r   default_hopdefault_nic_bandwidthr8   rx   rb   ry   rc   rA   r   r   r   r   rp   r   r   r   r   r   r   r     s       K      X ]  ]   X ]  ]   X 
[  [   X         X ^  ^   X 	Z  Z     X  ! ! !
 
 
      r   r   c                      e Zd Zd"dZed             Zej        d             Zed             Zej        d             Zed             Zej        d	             Zed
             Z	e	j        d             Z	ed             Z
e
j        d             Z
ed             Zej        d             Zed             Zej        d             Zed             Zej        d             Zed             Zej        d             Zed             Zed             Zed             Zed             Zej        d             Zd Zd Zd Zd Zd Zd  Zd! ZdS )#MachineNFc                     || _         d | _        d | _        d | _        d | _        d | _        d | _        d | _        d | _        i | _	        i | _
        i | _        d| _        i | _        || _        || _        d S r}   )r/   	_hostname_addrr   r   r   r   r   _port_devicesr4   _accelerators!_non_accelerator_cumulative_count_topo_linksr   r   )r5   r6   r   r   s       r   r8   zMachine.__init__u  sy    

12.



r   c                     | j         S r.   r:   r;   s    r   r6   z
Machine.id  r<   r   c                     || _         d S r.   r:   rC   s     r   r6   z
Machine.id  s    r   c                     | j         S r.   r   r;   s    r   hostnamezMachine.hostname  rO   r   c                     || _         d S r.   r
  rC   s     r   r  zMachine.hostname  rU   r   c                     | j         S r.   r  r;   s    r   addrzMachine.addr  r>   r   c                     || _         d S r.   r  rC   s     r   r  zMachine.addr  rE   r   c                     | j         S r.   r   r;   s    r   r   zMachine.sp_gflops  rI   r   c                     || _         d S r.   r   rC   s     r   r   zMachine.sp_gflops  rK   r   c                     | j         S r.   r   r;   s    r   r   zMachine.dp_gflops  rI   r   c                     || _         d S r.   r   rC   s     r   r   zMachine.dp_gflops  rK   r   c                     | j         S r.   r   r;   s    r   r   zMachine.memory  r   r   c                     || _         d S r.   r   rC   s     r   r   zMachine.memory  r   r   c                     | j         S r.   r   r;   s    r   r   zMachine.bandwidth  rI   r   c                     || _         d S r.   r   rC   s     r   r   zMachine.bandwidth  rK   r   c                     | j         S r.   r   r;   s    r   r   zMachine.latency  r   r   c                     || _         d S r.   r   rC   s     r   r   zMachine.latency  r   r   c                     | j         S r.   r  r;   s    r   portzMachine.port  r>   r   c                     || _         d S r.   r  rC   s     r   r  zMachine.port  rE   r   c                     | j         S r.   )r  r;   s    r   deviceszMachine.devices  r   r   c                 ,    | j         r| j        S | j        S r.   )r   r  r4   r;   s    r   rR   zMachine.links  s    : 	$##{r   c                     | j         S r.   )r  r;   s    r   acceleratorszMachine.accelerators  s    !!r   c                     | j         S r.   r   r;   s    r   r   zMachine.mesh  r>   r   c                     || _         d S r.   r%  rC   s     r   r   zMachine.mesh  rE   r   c                 l    || j         |j        <   |j        t          j        vr|| j        |j        <   d S d S r.   )r  r   rA   r   r   r  r5   devices     r   
add_devicezMachine.add_device  s?    *0f&';f99939Dv/000 :9r   c                 8    | j                             |d           S r.   )r  rZ   r[   s     r   
get_devicezMachine.get_device  s    }  T***r   c                     | j         r|| j        |j        |j        f<   d S || j        |j        j        |j        j        f<   d S r.   )r   r  rb   rc   r4   r   rd   s     r   rf   zMachine.add_link  sJ    : 	O;?Ddk4;7888JNDK.0EFGGGr   c                     | j         r| j                            ||fd           S | j                            ||fd           S r.   )r   r  rZ   r4   )r5   source_global_idtarget_global_ids      r   rk   zMachine.get_link  sQ    : 	#''!#34d   { 02BCTJJJr   c                     | j         | j        | j        | j        | j        | j        | j        | j        d | j        	                                D             d | j
        	                                D             d
S )Nc                 6    g | ]}|                                 S r   ro   rq   s     r   rt   z#Machine.to_json.<locals>.<listcomp>	  s     CCC		CCCr   c                 6    g | ]}|                                 S r   ro   rq   s     r   rt   z#Machine.to_json.<locals>.<listcomp>
  rv   r   )
r6   r  r  r   r   r   r   r   r   rR   )r6   r  r  r   r   r   r   r   r   rw   rR   r;   s    r   rp   zMachine.to_json  sw    'Ik|CCT\-@-@-B-BCCC??4:+<+<+>+>???
 
 	
r   c                     d}| j                                         D ]
}|d| z  }| j                                        D ]
}|d| z  }|S )Nr   z
, device: z, link: )r   rw   rR   )r5   r   r)  re   s       r   r   zMachine.__str__  sp    l))++ 	) 	)F((((CCJ%%'' 	% 	%D$d$$$CC
r   c                 *    |                                  S r.   r   r;   s    r   r   zMachine.__repr__  r   r   )NF)r   r   r   r8   rx   r6   ry   r  r  r   r   r   r   r   r  r   rR   r#  r   r*  r,  rf   rk   rp   r   r   r   r   r   r   r   t  s          *   X Y  Y   X _  _   X 
[  [   X         X         X ]  ]   X         X ^  ^   X 
[  [   X   X
 " " X"   X 
[  [: : :+ + +O O OK K K
 
 
      r   r   c                       e Zd Zd Zed             Zed             Zed             Zed             Zed             Z	ed             Z
ed             Zd	S )
AlphaLatencyc                    t          |t                    sJ |                    dd           | _        |                    dd           | _        |                    dd           | _        |                    dd           | _        | j        /	 t          | j                  | _        n#  t          d          xY w| j        | j                            dd           nd | _	        | j        | j                            dd           nd | _
        | j        | j                            dd           nd | _        | j	        /	 t          | j	                  | _	        n#  t          d          xY w| j
        /	 t          | j
                  | _
        n#  t          d          xY w| j                            dd           | _        | j                            dd           | _        | j                            dd           | _        | j                            dd           | _        | j        lt          | j        t                     r#| j        d	v sJ t"          | j                 | _        n/	 t          | j                  | _        n#  t          d
          xY w| j        lt          | j        t                     r#| j        d	v sJ t"          | j                 | _        n/	 t          | j                  | _        n#  t          d          xY w| j        lt          | j        t                     r#| j        dv sJ t"          | j                 | _        n/	 t          | j                  | _        n#  t          d          xY w| j        nt          | j        t                     r$| j        dv sJ t"          | j                 | _        d S 	 t          | j                  | _        d S #  t          d          xY wd S )Nbaseinterintraswitchz The switch latency must be floatringtreez$The base ring latency must be float.)r*   z%The inter ring latency must be float.z%The inter tree latency must be float.)r(   r%   z%The intra ring latency must be float.z%The intra tree latency must be float.)
isinstancedictrZ   _base_inter_intra_switchfloat	TypeError
_base_ring
_base_tree_base_inter_inter_ring_inter_tree_intra_ring_intra_treer   r   )r5   alpha_latencys     r   r8   zAlphaLatency.__init__  s   -....."&&vt44
#''66#''66$((488<#D$T\22D BCCC,0J,BDJNN64((( 	 -1J,BDJNN64((( 	 .2Z-CDJNN7D))) 	 ?&H"'"8"8H FGGG?&H"'"8"8H FGGG;??6488;??6488;??6488;??6488'$*C00 M'72222#+D,<#=  M',T-='>'>D$$M#$KLLL'$*C00 M'72222#+D,<#=  M',T-='>'>D$$M#$KLLL'$*C00 M'>9999#+D,<#=  M',T-='>'>D$$M#$KLLL'$*C00 M'>9999#+D,<#=   M',T-='>'>D$$$M#$KLLL ('sT   B& &B7=E E(3F F&J   JK3 3LM& &M7 O O,c                     | j         S r.   )rG  r;   s    r   	base_ringzAlphaLatency.base_ringf  rI   r   c                     | j         S r.   )rH  r;   s    r   	base_treezAlphaLatency.base_treej  rI   r   c                     | j         S r.   )rD  r;   s    r   r<  zAlphaLatency.switchn  r   r   c                     | j         S r.   )rJ  r;   s    r   
inter_ringzAlphaLatency.inter_ringr  r   r   c                     | j         S r.   )rK  r;   s    r   
inter_treezAlphaLatency.inter_treev  r   r   c                     | j         S r.   )rL  r;   s    r   
intra_ringzAlphaLatency.intra_ringz  r   r   c                     | j         S r.   )rM  r;   s    r   
intra_treezAlphaLatency.intra_tree~  r   r   N)r   r   r   r8   rx   rP  rR  r<  rU  rW  rY  r[  r   r   r   r7  r7    s        JM JM JMX   X   X   X     X      X      X      X     r   r7  c                   v   e Zd ZdZd Zed             Zej        d             Z	 	 	 	 	 	 	 	 	 	 	 	 	 d-dZed             Z	ed             Z
ed             Zej        d             Zed             Zd Zed             Zd Zd Zd Zd Zd Zd Zd Zd  Zd! Zd" Zd# Zd$ Zd% Zd& Zd' Zd( Zd) Zd* Z d+ Z!d,S ).Clusterz
    The cluster is an abstract of the hardware resource for training, which contains the cluster topology and
    related hardware information. It will serve the task mapping, cost model and auto searching.
    c                     d| _         d| _        i | _        d | _        d | _        i | _        i | _        d | _        d | _        d| _	        d | _
        d| _        d| _        d S )Nr   F)_num_meshes_num_machinesr3   	_topology_alpha_latency_rank_to_device_id_device_id_to_rank_num_devices_per_machine
_gpu_model_initialized_mesh_groupr   _heteror;   s    r   r8   zCluster.__init__  sk    ""$"$ )-%!
r   c                     | j         S r.   rg  r;   s    r   initializedzCluster.initialized  s      r   c                     || _         d S r.   rk  rC   s     r   rl  zCluster.initialized  s    !r   V1006271Cr         r      x  T=  z  K      c                 0  ./0 g d/dg0dg./0z   .z   }|| _         || _        ./0fd}d }d }i }g |d<   d}i }i }t          |          D ]}i }d	t          |          z   |d
<   d|d<   d|d<   g |d<   g }d}t          |          D ]}i }|dk    r|dk    r|n|dz   }|dz  } ||          } |||          }|}||d<   ||d<   ||d<   ||d<   ||d<   |
|d<   |	|d<   ||d<   d|d<   |||<   |||<   |                    |           i } ||          \  } }!}|}"|}#|dz  }d}|}d}| |d<   |!|d<   ||d<   |"|d<   |#|d<   ||d<   ||d<   ||d<   ||d<   |||<   |||<   |                    |           i }$|dz  }d}d}%d}&d}||$d<   ||$d<   ||$d<   |||<   |||<   |                    |$           ||d<   |d                             |           t          d|dz             D ]}t          d|dz             D ]}||k    r	||         }'||         }(||         })||         }*i }+|},|}-|,|+d <   |-|+d!<   |'|(k    r|)|*k    rd"|+d<   ||+d#<   n
d$|+d<   ||+d#<   |d         |'         d                             |+           |                     |           d%S )&z#Generate cluster by default config.)rn  A100H100A2A10A16A30A40r   r   c                 B    d }| v rd}n| v rd}n	| v rd}nd}|J |S )Nr   r   r   r   )	gpu_modelrA   
dcu_models
gpu_models
xpu_modelss     r   _convert_to_typez<Cluster.gen_default_config_cluster.<locals>._convert_to_type  sT    DJ&&j((j((###Kr   c                     d }| dk    rdt          |          z   dz   }nM| dk    rdt          |          z   dz   }n1| dk    rdt          |          z   dz   }n| t          |          z   dz   }|J |S )Nrn  zTesla V100-SXM2-GBry  zTesla A100-SXM-r~  zTesla A30-SXM-)r   )r  
gpu_memoryr   s      r   _convert_to_modelz=Cluster.gen_default_config_cluster.<locals>._convert_to_model  s    EF""*S__<tCf$$)C
OO;dBe##(3z??:TA!C
OO3d:$$$Lr   c                 b    d\  }}}| dk    rd}d}d}n| dk    rd}d}d}|J |J |J |||fS )N)NNNro  x86_64GenuineIntelz'Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G6148z&Intel(R) Xeon(R) Gold 6148 CPU @ 2.40Gr   )	cpu_modelarchvendorr   s       r   _convert_to_cpu_infoz@Cluster.gen_default_config_cluster.<locals>._convert_to_cpu_info  su    "2D&%G##'Af$$'@###%%%$$$&&r   rN   r   host_r  z	127.0.0.1r  ii  r  rR   r   r   r   rA   r   r   r   r   r   r   r   r  r  r   g      )@r   r/  r0  r(   r   r%   N)re  rf  ranger   append_build_from_dict)1r5   r  r  
node_countdevice_countr  
cpu_memoryinter_bandwidthintra_bandwidthgpu_dp_gflopsgpu_sp_gflopsgpu_hp_gflopscpu_dp_gflopscpu_sp_gflopsall_gpu_modelsr  r  r  cluster_infor   global_id_to_device_typeglobal_id_to_nodeirW   r   r   jr)  rA   r   r   
cpu_devicer  r  r   r   
nic_devicewidthip	node_id_i	node_id_jdevice_type_idevice_type_jre   r/  r0  r  r  r  s1                                                 @@@r   gen_default_config_clusterz"Cluster.gen_default_config_cluster  sY   " POO
W
W
#j0:=(4%#	 	 	 	 	 	 		 	 		' 	' 	'  #%Z 	#% z"" I	5 I	5AG")CFF"2GJ)GFO#GFO!GGGH<(( ' ')*aAFFII	A	A''	22)))Z@@#&/{#%-z"!%v"'w#)x &3{#&3{#&3{#!&v6:(3/0!),v&&&& J"6"6y"A"AD&%%I%INIHFD!%Jv#)Jx "'Jw&/J{#&/J{#&/J{#%-Jz"#)Jx !%Jv+,i(26$Y/NN:&&&JNI DEBH!%Jv%)Jz"&/J{#26$Y/+,i(NN:&&&!(GI$++G4444 q)a-(( 	J 	JA1i!m,, J J66-a0	-a0	 8 ; 8 ;#$ #$ +;'(+;'(	))m}.L.L#(DL(7D%%#(DL(7D%Z(3G<CCDIIII'J* 	l+++++r   c                     | j         S r.   )rc  r;   s    r   rank_to_device_idzCluster.rank_to_device_idT      &&r   c                     | j         S r.   )rd  r;   s    r   device_id_to_rankzCluster.device_id_to_rankX  r  r   c                     | j         S r.   rh  r;   s    r   
mesh_groupzCluster.mesh_group\  r   r   c                     || _         d S r.   r  rC   s     r   r  zCluster.mesh_group`  r   r   c                     | j         S r.   rM   r;   s    r   rN   zCluster.machinesd  rO   r   c                    t          |t                    sJ || j        |j        <   |j        dk    r| j        |j        dz
           }|j        }|j        D ]9}|j        |         j        t          j        vr||z
  }|| j	        |<   || j
        |<   :t          |j                  t          |j                  z
  |j        z   |_        d S |j        D ]K}|j        |         j        t          j        vr+|}|| j	        |<   || j
        |<   |j        |         |j        |<   Lt          |j                  t          |j                  z
  |_        d S )Nr   r   )r?  r   r3   r6   r  r   rA   r   r   rc  rd  r^   r#  )r5   rW   prev_machineoffsetr   rank_ids         r   rX   zCluster.add_machineh  sv   '7+++++%,wz" :??>'*q.9L!CF$_ A A	OI.3!67 7 (&0G7@D+G49@D+I6GO$$g*++,@A 555 %_ Q Q	OI.3!67 7 (G7@D+G49@D+I66=oi6PG(38;9 9G())9*G555r   c                     | j         S r.   )rb  r;   s    r   rN  zCluster.alpha_latency  s    ""r   c                 h    t          |t                    sJ |j                            |           d S r.   )r?  r   rW   r*  r(  s     r   r*  zCluster.add_device  s4    &&)))))!!&)))))r   c                 r    t          |t                    sJ |j        j                            |           d S r.   )r?  r   rb   rW   rf   rd   s     r   rf   zCluster.add_link  s7    $%%%%%$$T*****r   c                 D   d }| j         rPg }| j        j                                        D ].}|                    |j                                                   /n| j                                        }|D ]*}||j                                        v r|j        |         }+|S r.   )r   r  r   rw   extendrN   r   keys)r5   device_global_idr)  target_machinesr   rW   s         r   r,  zCluster.get_device  s    : 	5 O.5577 ? ?&&t}';';'='=>>>>? #m2244O& 	; 	;G7?#7#7#9#999 )9:r   c                 X   |d         }|D ]}|                                  }t          |          }|                    d          |_        |                    d          |_        |                    d          |_        |                    dg           }|D ]G}|                    d          }|                    d          }	t          ||	|          }
|                    dd           }|t          |         }nt          j        }||
_	        |                    d	d           |
_
        t          |                    d
d                    |
_        t          |                    dd                    |
_        t          |                    dd                    |
_        t          |                    dd                    |
_        |                     |
           I|                     |           |D ]{}|                    dg           }|D ]^}|                    d          }|                    d          }|                     |          }|                     |          }t'          ||          }|                    dd           }|t(          |         }nt(          j        }||_	        t          |                    dd                    |_        t          |                    dd                    |_        |                    dd           |_        |j        7|j        }|j        }|j        |j        k    rd|_        nt&          j        |_        |                     |           `}d|v r)t9          |                    d                    | _        d S d | _        d S )NrN   r  r  r  r   r   r   rA   r   r   r   r   r   r   rR   r/  r0  r   r   r   rN  )_generate_machine_idr   rZ   r  r  r  r   r   r   rA   r   rE  r   r   r   r   r*  rX   r,  r   r   r   r   r   rW   r6   r   rf   r7  rb  )r5   r  machines_infomachine_info
machine_idrW   devices_infodevice_infor  device_local_idr)  device_type
links_info	link_infor/  r0  rb   rc   re   	link_typesource_machinetarget_machines                         r   r  zCluster._build_from_dict  sg   $Z0) 	& 	&L2244Jj))G+//
;;G'++F33GL'++F33GL'++Ir::L+ ( (#.??;#?#? "-//*"="= 0/7KK)oofd;;*",["9KK","4K)*w==#(a)H)H#I#I #(a)H)H#I#I #(a)H)H#I#I  %kooh&B&B C C''''W%%%%) 	$ 	$L%))'266J' $ $	#,==1C#D#D #,==1C#D#D )9::)9::FF++%MM&$77	( ( 3II ( 0I%	!&y}}[!'D'D!E!E$Y]]9a%@%@AA$==558#%+^N%+^N%(N,===#$#'#3d####/$2 l**".  11# #D #'Dr   c           	      
   t                      | _        |                                D ]\  }}|                                 }t	          ||          }|                    d          }|d         |_        d                    |dd                    |_        t          t          t          |                              }t          t          |                    D ]y}	||	         }
t          |	|d          }|
                    d          |_        |
                    d          |_        t!          |
                    d                    |_        t!          |
                    d	                    |_        t!          |
                    d
                    |_        t!          |
                    d                    |_        t!          |
                    d                    |_        t          |
                    d                    | _        |
                    d          D ]}t/          |                    d          |                    d          ||          }|                    d          |_        |                    d          |_        t!          |                    d                    |_        t!          |                    d	                    |_        t!          |                    d
                    |_        |                    |           |
                    d          D ]}|                    d          }|                    d          }t5          ||d          }|                    d          |_        t!          |                    d                    |_        t!          |                    d                    |_        d|_        |                    dd           |_        |j        4||k    rd|_        nt4          j        |_        |                    |           |                    |           {|j         D ]V}|j         D ]L}||k    r	t5          ||d          }d|_        d|_        d|_        d|_        |                    |           MW| j        !                    |           | j        j"        D ]`}| j        j"        D ]Q}||k    r	t5          ||d          }d|_        d|_        d|_        d|_        | j                            |           Rad| _#        d S )N-r   r   T)r6   r   r   r  r  r   r   r   r   r   r   r   r   rA   r   rR   r/  r0  )rb   rc   r   r)  r   )r   r*      g      ?rW   r   )$r{   r  items_generate_mesh_idr,   splitrA   joinrH   listr  r^   r   rZ   r  r  intr   r   r   r   r   re  r   r   r*  r   r   r   r   rf   rX   rN   r   r   r   )r5   	topo_info
local_sizemesh_keymesh_valmesh_idr   mesh_fieldsmachine_idsr  machine_valrW   
device_valr)  link_valsource_device_idtarget_device_iddevice_linkr  r  machine_link	mesh_links                         r   _build_from_topozCluster._build_from_topo  s   #++"+//"3"3 E	+ E	+Hh,,..G**D"..--K#ADI XXk!""o66DN uS]]3344K#CMM22 /* /*
&z2!ZdFFF#.??:#>#> *v66$'(D(D$E$E!$'(D(D$E$E!!$[__X%>%>!?!?$'(D(D$E$E!"%kooi&@&@"A"A 14KOOI4N4N0O0O-"-//)"<"< / /J#"{33"z22	 F #-.."8"8FK#->>'#:#:FL'*:>>++F+F'G'GF$'*:>>++F+F'G'GF$$'
x(@(@$A$AFM&&v.... + 8 8 6 6H'/||4F'G'G$'/||4F'G'G$"&//!# # #K
 (0||F';';K$,/[0I0I,J,JK)*-hll9.E.E*F*FK'-5K*&.ll5$&?&?KO".+/???./KOO.2.>KO((555  ))))] 	0 	0 0 0AAvv #'14#8#8#8L(-L%-/L*+.L(.7L+MM,////0 O$$T****' 		4 		4A_+ 4 466 AD111	!&	&(	#$'	!'-	$((33334 


r   c                     t          |          5 }t          j        |          }d d d            n# 1 swxY w Y   |                     |           d S r.   )openjsonloadr  )r5   json_file_path	json_filer  s       r   build_from_filezCluster.build_from_file6  s    .!! 	0Y9Y//L	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0l+++++s   155c                 4    | j         }| xj         dz  c_         |S r   )r_  )r5   cur_mesh_ids     r   r  zCluster._generate_mesh_id;  s$    &Ar   c                 4    | j         }| xj         dz  c_         |S r   )r`  )r5   cur_machine_ids     r   r  zCluster._generate_machine_id@  s$    +ar   c                 x   g }| j         rKg }| j                                        D ].}|                    |j                                                   /n| j                                        }|D ]I}|j                                        D ]-}|j        t          |         k    r|	                    |           .J|S r.   )
r   r  r   r  rN   rw   r   rA   r   r  )r5   r  r   r  r   rW   r)  s          r   get_all_deviceszCluster.get_all_devicesE  s    : 	5 O..00 ? ?&&t}';';'='=>>>>? #m2244O& 	+ 	+G!/0022 + +;*["999NN6***+ r   c                 D   d }d}|                      |          }|                      |          }|j        }|j        }|j        }	|j        }
|	j        |
j        k    r&| j                            |	j        |
j                  }n|j        |j        k    r@| j                            |	j                  }|                    |j        |j                  }nO| j                            |	j                  }|                    |j                  }|                    ||          }|S )N  )r,  rW   r   r6   r  rk   r   r\   )r5   r  r  betaconvert_base
src_device
tgt_devicesrc_machinetgt_machinesrc_meshtgt_meshre   r   rW   s                 r   get_beta_topozCluster.get_beta_topoT  s   __%566
__%566
 ( (##;(+%%?++HKEEDD^{~--?++HK88D==@@DD?++HK88D&&{~66G##$46FGGDr   c                    | j         r|                     ||          }n2|                     |          }|j        }|                    ||          }d }d}d }|t
          j        }n|j        }|dk    rd}nd||dz  dz  z  z  }|S )Nr  g        r   r   r
   i@B )r   r  r,  rW   rk   r   r   r   )	r5   r  r  re   r)  rW   r  r  r   s	            r   get_betazCluster.get_betak  s    : 	H%%&68HIIDD__%566FnG##$46FGGD	<2IIIDD	\1_u%<=>Dr   c                     d }d }|                      |          }|j        }|                    ||          }||j        }nt          j        }|S r.   )r,  rW   rk   r   r   r   )r5   r  r  r  r   r)  rW   re   s           r   get_hopzCluster.get_hop  sX    !122. 02BCC(CC"C
r   c                    t                      }t                      }|D ]e}|                     |          }|j        j        }|                    |           | j        r&|j        j        j        }|                    |           f| j        r*t          |          dk    rt          |          dk    rdS dS t          |          dk    rdS dS )Nr   FT)setr,  rW   r6   addr   r   r^   )r5   
device_idsr  mesh_ids	device_idr)  r  r  s           r   cross_machinezCluster.cross_machine  s    ee55# 	& 	&I__Y//F*JOOJ'''z & .-0W%%%: 	8}}!!c+&6&6!&;&;u4""54r   c                 f    | j         r|S g }|D ]"}|                    | j        |                    #|S r.   )r   r  r  )r5   group_ranksr  ranks       r   convert_rank_to_device_idz!Cluster.convert_rank_to_device_id  sM     : 	
 	< 	<Dd4T:;;;;r   c                     t                      }|D ]8}|                     |          }|j        j        }|                    |           9t          |          }|dk    sJ |S r}   )r	  r,  rW   r6   r
  r^   )r5   r  r  r  r)  r  counts          r   get_involved_machine_countz"Cluster.get_involved_machine_count  si    ee# 	( 	(I__Y//F*JOOJ''''K  qyyyyr   c                     | j         r<d}| j        j                                        D ]}||                                z  }|S t          | j                  S r}   )r   r  r   rw   r_   r^   r3   )r5   nr   s      r   r_   zCluster.get_num_machines  s^    : 	'A.5577 - -T**,,,Ht~&&&r   c                 "    | j         sJ | j         S r.   )re  r;   s    r   get_num_devices_per_machinez#Cluster.get_num_devices_per_machine  s     ,,,,,,r   c                 T    d}| j                                         D ]}|d| dz  }|S )Nr   z	machine: 
)rN   rw   )r5   r   rW   s      r   r   zCluster.__str__  sA    }++-- 	+ 	+G*w****CC
r   c                 *    |                                  S r.   r   r;   s    r   r   zCluster.__repr__  r   r   N)rn  ro  r   r   rp  rq  r   rr  rs  rt  ru  rv  rw  )"r   r   r   __doc__r8   rx   rl  ry   r  r  r  r  rN   rX   rN  r*  rf   r,  r  r  r  r  r  r  r  r  r  r  r  r  r_   r  r   r   r   r   r   r]  r]    s        
  * ! ! X! " " "
 l, l, l, l,\ ' ' X' ' ' X'     X  ! ! !   X!* !* !*F # # X#* * *+ + +
  :' :' :'xR R Rh, , ,
  
  
    .  0
 
 
  *	 	 	  ' ' '- - -      r   r]  c                    d }t                      }| rr ||           rgd| d         v r#|                    | d         d                    |S | d         d         }| d         d         }| d         d         }| d         d         }nC|rt          j        d          }t	                      }	|	                                 t          t          j        d	                    }
t          j        d
          }t          t          j        d                    }t          t          j        d                    }t          t          j        d                    }t          ||z
  |z            }|
dk    r^|[t                      }|                    d          \  }}d}| d| }|dk    rD||v r@t          |          }|
                                 t                              d|            t          |          }|	j        d         }|dk    r(d}|s$|                    d| d| |	j                  }|$d}|r|                    d          }|rt%          |          |
k    ri }|                                D ]m\  }}|                    d          \  }}}}}||vrg ||<   t%          ||                   } t)          j        |          }!||                             |!           n|                    ||           d}nH|rt%          |          nd}"t                              d|" d|
 d           t1          j        d           |d}|s<|                    d| d          }|st                              d | d!           |<|dk    rd}t          t          j        d"                    }"|r|                    d#          }|rDt%          |          |"k    r1|                                 d}t                              d$           n.t                              d%           t1          j        d           |t                              d&t)          j        |j                                        d'(                      d }#|j        j                                        D ]}$|#|$j         }#|#|$j         k    rd|_!        |S |	j        d         d|	j        ii}|                    ||           d|_!        t                              d&t)          j        |j                                        d'(                      |S t          j        d          }|d}nt          |          }t          j        d"          }%|%d}n,t          |%          }%|%|z  dk    sJ t          |%          |z  }t          j        d)d           d*k    rctE                      }&|&rtG          |&d                   nd }'|'}t          tH          j%        j&        j'        (                    |'                    d+z  }ntH          j)        j*        +                                }(|(s
J d,            |(j         }'	 tY          j        d-|'          })|)d         }t          |)d.         d d/                   }n #  t          |(j-                  d+z  }|'}Y nxY wt                              d0.                    ||||tH          j/        0                                t          j        d
d                                d1d2d3d4d5d6d7d4d8}*|d9k    r|*d9         n|*d:         }+|1                    |||||+d;         |+d<         |+d=         >           |S )?Nc                     | sdS d| vrdS d| d         vr2d| d         vrdS d| d         vrdS d| d         vrdS d| d         vrdS dS dS )	NFclusterpath	num_nodesnum_gpusr  r  Tr   )json_configs    r   is_by_json_configz.get_default_cluster.<locals>.is_by_json_config  s     	5K''5[333k)&<<< 5[%;;; 5k)&<<< 5{9'=== 5ttr   r!  r   r"  r#  r  r  PADDLE_MASTERPADDLE_NNODESPADDLE_CURRENT_ENDPOINTPADDLE_GLOBAL_RANKPADDLE_LOCAL_RANKPADDLE_LOCAL_SIZEr   :i:0  zserver start at: device_type_fullFz/topo/data//)keyrD   Tz
/topo/data)r/  z%get global_topo failed, actual size: z, expected size: z, retry later!r   z/topo/status/okzput ok status for rank z failed, retry later!PADDLE_GLOBAL_SIZEz/topo/statuszserver stopped successz"server stopped failed! retry laterzcluster_topo_info: r
   )indentPADDLE_DISTRI_BACKENDxccli ʚ;z#Auto parallel just runs on gpu now.z[ , -]zeNode Count: {}, Local Device Size: {}, GPU Model: {}, GPU Memory: {}GB, World size: {}, EndPoint: {}.rs  rt  iH )dpsphpi%  i,L  i	 )rn  ry  ry  rn  r7  r8  r9  )r  r  r  r  r  r  r  )2r]  r  osgetenvr	   detectr  r   r  r   startloggerinfor   rW   putjson_object
get_prefixr^   r  r  loadsr  r  timesleepstopdumpsr  rp   r   rw   r7   ri  r   r   paddler9  core	libpaddle_get_device_total_memoryr)  cudaget_device_propertiesretotal_memoryformatdistributedget_world_sizer  ),r$  auto_configr%  r   r  local_device_countr  r   master_endpoint
local_toponnodescurr_endpointglobal_rank
local_rankr  node_idnode	master_ip_	free_portserver_endpointserverclientr  respretryglobal_topo	topo_dictr/  rD   	mesh_typeidxmesh_idxglobal_topo_valueglobal_sizer7   r   global_device_countcustom_device_typesgpu_namegpu_info	re_resultgflops_infodefault_gflopss,                                               r   get_default_clusterrs    sc     & iiG R%((55 R%[+++##K	$:6$BCCCN$Y/<J!,Y!7
!C#I.{;I +L9FF	 H%)O44'))
RY//00	";<<")$899::#67788
#67788
{Z/:=>>A::/566D*0055LIqI!*88Y88OQ9#=#=!),,AAABBBo..F$,-?@KQ !::A+AAAA(4 &  D   E "$//L/AA "3{#3#3v#=#= "I&1&7&7&9&9 G G
U25))C../1aC$I5535Ii0#&y';#<#<,0Ju,=,=)!),334EFFFF,,Y
CCC!EE6A"H#k"2"2"2qKKKttt^dttt   JqMMM%  "( D zz&Ck&C&C4zPP KKT+TTT     a!"),@"A"ABB &!,,,@@D &D		[ 8 8 %$<====$HIII
1  & KKZdj1C1K1K1M1MVW&X&X&XZZ   D*188:: / /<9DDty((*.N "#56z)9I
 $$Y
;;;#GOKKZdj1C1K1K1M1MVW&X&X&XZZ   N  Y':;;%!"!$%7!8!8 i(<==&JJ"%&9":":&);;q@@@@0115GGJ9,d33v=="<">">/BL'*+++  !I *CCHMM FF })??AAHBBBBB8}H%HY99	%aL	Yr]3B3/00%X233@$			
KKovv--//I/66	
 	
	 	 	 577577 K
  )F22FF8K  &&'$T*$T*$T* '    Ns   .:Z) )[)NN)!r  loggingr:  rN  rD  enumr   r   rH  paddle.base.corer   &paddle.distributed.launch.context.noder   )paddle.distributed.launch.utils.kv_clientr   )paddle.distributed.launch.utils.kv_serverr   (paddle.distributed.launch.utils.topologyr	   utils.log_utilsr   r   r   r,   r{   r   r   r   r7  r]  INFOr>  rs  r   r   r   <module>r}     s     				 				                   7 7 7 7 7 7 7 7 7 7 7 7 > > > > > > > > > > > > G G G G G G ) ) ) ) ) )         	 	 	 	 	w 	 	 	B
 B
 B
 B
 B
 B
 B
 B
J#
 #
 #
 #
 #
 #
 #
 #
Lr r r r r r r rjZ Z Z Z Z Z Z Zzb b b b b b b bJg  g  g  g  g  g  g  g TL	 L	 L	 L	 L	 L	 L	 L	^ 
GL	!	!E E E E E Er   