
    Bj"?              
          d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlm	Z	 d dl
Z
d dlmc mc mc mZ d dlmZmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, g dZ- e*e.          Z/e G d d                      Z0 G d d          Z1dee2z  dz  de3e	         de2fdZ4de&de5e2dz  e6dz  f         fdZ7de0dee2z  dz  de3e	         de8e6e	f         fdZ9dS )    N)Callable)	dataclassfield)Any)get_default_numa_optionsjustknobs_check)eventsmetrics)
WorkerSpec)create_healthcheck_server)_AliveCallbackProxyLocalElasticAgentTORCHELASTIC_HEALTH_CHECK_PORT)DefaultLogsSpecs	LogsSpecsSignalException)ChildFailedError)RendezvousParameters)parse_rendezvous_endpoint)
get_logger)NumaOptions)LaunchConfigelastic_launchlaunch_agentc                      e Zd ZU dZeed<   eed<   eed<   dZedz  ed<   dZe	ed<   d	Z
e	ed
<   dZe	ed<   dZe	ed<    ee          Zee	ef         ed<   dZeed<   dZeed<   dZeed<   dZe	ed<   dZe	dz  ed<    ee          Zee	e	f         ed<   dZe	dz  ed<   dZe	ed<   dZedz  ed<   dZe	ed<   dZee	         dz  ed <   dZee	         dz  ed!<   d"Z e!ed#<   dZ"edz  ed$<   d% Z#dS )&r   a  
    Creates a rendezvous config.

    Args:
        min_nodes: Minimum amount of nodes that the user function will
                        be launched on. Elastic agent ensures that the user
                        function start only when the min_nodes amount enters
                        the rendezvous.
        max_nodes: Maximum amount of nodes that the user function
                        will be launched on.
        nproc_per_node: On each node the elastic agent will launch
                            this amount of workers that will execute user
                            defined function.
        rdzv_backend: rdzv_backend to use in the rendezvous (zeus-adapter, etcd).
        rdzv_endpoint: The endpoint of the rdzv sync. storage.
        rdzv_configs: Key, value pair that specifies rendezvous specific configuration.
        rdzv_timeout: Legacy argument that specifies timeout for the rendezvous. It is going
            to be removed in future versions, see the note below. The default timeout is 900 seconds.
        run_id: The unique run id of the job (if not passed a unique one will be
                deduced from run environment - flow workflow id in flow - or auto generated).
        role: User defined role of the worker (defaults to "trainer").
        max_restarts: The maximum amount of restarts that elastic agent will conduct
                    on workers before failure.
        monitor_interval: The interval in seconds that is used by the elastic_agent
                        as a period of monitoring workers.
        start_method: The method is used by the elastic agent to start the
                    workers (spawn, fork, forkserver).
        metrics_cfg: configuration to initialize metrics.
        local_addr: address of the local node if any. If not set, a lookup on the local
                machine's FQDN will be performed.
        local_ranks_filter: ranks for which to show logs in console. If not set, show from all.
        event_log_handler: name of the event logging handler as registered in
          `elastic/events/handlers.py <https://docs.pytorch.org/docs/stable/elastic/events.html>`_.
        duplicate_stdout_filters: If non-empty, duplicates stdout to a file containing only lines
                                that match _any_ of the filter strings.
        duplicate_stderr_filters: If non-empty, duplicates stderr to a file containing only lines
                                that match _any_ of the filter strings.
        virtual_local_rank: Enable virtual local rank mode for workers (defaults to False).
                           When enabled, LOCAL_RANK is set to 0 for all workers and
                           CUDA_VISIBLE_DEVICES is adjusted so each worker accesses its
                           assigned GPU at device index 0.
        shutdown_timeout: Time in seconds to wait for graceful shutdown of workers before
                        sending SIGKILL. Can also be set via TORCH_ELASTIC_SHUTDOWN_TIMEOUT
                        environment variable. Defaults to 30 seconds.


    .. note::
        `rdzv_timeout` is a legacy argument that will be removed in future.
        Set the timeout via `rdzv_configs['timeout']`

    	min_nodes	max_nodesnproc_per_nodeN
logs_specs run_iddefault_rolerolerdzv_endpointetcdrdzv_backend)default_factoryrdzv_configsrdzv_timeout   max_restartsg?monitor_intervalspawnstart_methodlog_line_prefix_templatemetrics_cfg
local_addrnullevent_log_handlernuma_optionszSIGTERM,SIGINT,SIGHUP,SIGQUITsignals_to_handleduplicate_stdout_filtersduplicate_stderr_filtersFvirtual_local_rankshutdown_timeoutc                 R   d}| j         dk    r| j         | j        d<   nd| j        vr
|| j        d<   | j        t                      | _        | j        xt
          j                                        rZt
          j                                        | j	        k    r3t                      | _        t                              d| j                   | j        4t          t          j                            dd                    | _        d S | j        dk     rt%          d| j                   d S )	Ni  r)   timeoutzUsing default numa options = %rTORCH_ELASTIC_SHUTDOWN_TIMEOUT30r   z+shutdown_timeout must be non-negative, got )r*   r(   r   r   r5   torchcudais_availabledevice_countr   r   loggerinfor:   intosenvironget
ValueError)selfdefault_timeouts     a/var/www/html/Carbon-Document/venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py__post_init__zLaunchConfig.__post_init__|   s.   ""+/+<Di((d///+:Di( ?".00DO %
'')) & 
''))T-@@@ 8 : :DKK94;LMMM  ($'
?FF% %D!!! "Q&&Ud>SUU   '&    )$__name__
__module____qualname____doc__rE   __annotations__r   r   r!   strr#   r$   r&   r   dictr(   r   r*   r,   r-   floatr/   r0   r1   r2   r4   r5   r   r6   r7   listr8   r9   boolr:   rM    rN   rL   r   r   .   s        2 2h NNNNNN#'J	D '''FCD#M3L##(5#>#>#>L$sCx.>>>L#L#!e!!!L#+/cDj///"'%"="="=Kc3h===!Jd
!!!#s###'+L+$+++<s<<<15d3i$.55515d3i$.555$$$$#'cDj'''    rN   r   c                   4    e Zd ZdZdedeez  dz  fdZd ZdS )r   a  
    Launches an torchelastic agent on the container that invoked the entrypoint.

        1. Pass the ``entrypoint`` arguments as non ``kwargs`` (e.g. no named parameters)/
           ``entrypoint`` can be a function or a command.
        2. The return value is a map of each worker's output mapped
           by their respective global rank.

    Usage

    ::

    def worker_fn(foo):
        # ...

    def main():
        # entrypoint is a function.
        outputs = elastic_launch(LaunchConfig, worker_fn)(foo)
        # return rank 0's output
        return outputs[0]

        # entrypoint is a command and ``script.py`` is the python module.
        outputs = elastic_launch(LaunchConfig, "script.py")(args)
        outputs = elastic_launch(LaunchConfig, "python")("script.py")
    config
entrypointNc                 "    || _         || _        d S N)_config_entrypoint)rJ   r[   r\   s      rL   __init__zelastic_launch.__init__   s    
 %rN   c                 R    t          | j        | j        t          |                    S r^   )r   r_   r`   rW   )rJ   argss     rL   __call__zelastic_launch.__call__   s    DL$*:DJJGGGrN   )	rO   rP   rQ   rR   r   r   rT   ra   rd   rY   rN   rL   r   r      sa         4&& sNT)& & & &H H H H HrN   r   r\   rc   returnc                     t          | t                    r| j        S t          | t                    r,| t          j        k    rt          d |D             d          S | S dS )a  Retrieve entrypoint name with the rule:
    1. If entrypoint is a function, use ``entrypoint.__qualname__``.
    2. If entrypoint is a string, check its value:
        2.1 if entrypoint equals to ``sys.executable`` (like "python"), use the first element from ``args``
            which does not start with hifen letter (for example, "-u" will be skipped).
        2.2 otherwise, use ``entrypoint`` value.
    3. Otherwise, return empty string.
    c              3   2   K   | ]}|d          dk    |V  dS )r   -NrY   ).0args     rL   	<genexpr>z'_get_entrypoint_name.<locals>.<genexpr>   s*      >>A#>>rN   r    )
isinstancer   rO   rT   sys
executablenext)r\   rc   s     rL   _get_entrypoint_namerp      sh     *h'' ""	J	$	$ ''>>>>>CCCrrN   rdzv_parametersc                     | j         dk    rdS | j        }|                                }|st          d          t	          |d          \  }}|dk    rt          d| d          ||fS )Nstatic)NNzKEndpoint is missing in endpoint. Try to add --master-addr and --master-portr)   )default_portzport is missing in endpoint: z. Try to specify --master-port)backendendpointstriprI   r   )rq   rv   master_addrmaster_ports       rL   _get_addr_and_portrz      s     (**|'H~~H 
Y
 
 	
  9PRSSSKbTHTTT
 
 	
 %%rN   r[   c                 R   | j         sGt          t          j                    j                  }t
                              d|           || _         t          ||          }t
                              di d|d| j	        d| j
        d| j        d| j         d| j        d	| j        d
| j        d| j        d| j        d| j        j        d| j        d| j        d| j        d| j        d| j        d| j                   t3          d"| j        | j        | j         | j	        | j
        | j        d| j        }t7          |          \  }}| j        t8          j        d<   d }d }	t9          j        t>                    }
|
tA          dd          r	 tC                      }	tE          |	t	          |
          d          }|#                                 t
                              d|
           n0# tH          $ r# t
                              dd           d }d }	Y nw xY wtK          | j&        | j        |tO          |          tQ          j)        |          | j        | j        ||| j        | j        | j        | j        | j        | j*                  }tW          || j        | j,        | j-        | j.        |           }|	|	/                    |j0                   d}	 tc          j2        tc          j3        | j                             |4                                }tk          j6        |7                                | j                   |8                                rts          ||j:        !          |j;        |r|j<        =                                 S S # tr          $ r  t|          $ r0 d}tk          j6        |?                                | j                    tH          $ r. tk          j6        |?                                | j                    w xY w# |r|j<        =                                 w w xY w)#Nz3config has no run_id, generated a random run_id: %saR  Starting elastic_operator with launch configs:
  entrypoint               : %(entrypoint)s
  min_nodes                : %(min_nodes)s
  max_nodes                : %(max_nodes)s
  nproc_per_node           : %(nproc_per_node)s
  run_id                   : %(run_id)s
  rdzv_backend             : %(rdzv_backend)s
  rdzv_endpoint            : %(rdzv_endpoint)s
  rdzv_configs             : %(rdzv_configs)s
  max_restarts             : %(max_restarts)s
  monitor_interval         : %(monitor_interval)s
  log_dir                  : %(log_dir)s
  metrics_cfg              : %(metrics_cfg)s
  event_log_handler        : %(event_log_handler)s
  numa_options             : %(numa_options)s
  signals_to_handle        : %(signals_to_handle)s
  duplicate_stdout_filters : %(duplicate_stdout_filters)s
  duplicate_stderr_filters : %(duplicate_stderr_filters)s
r\   r   r   r   r!   r&   r$   r(   r,   r-   log_dirr1   r4   r5   r6   r7   r8   )ru   rv   r!   r   r   r2   TORCHELASTIC_SIGNALS_TO_HANDLEzNai_infra/pytorch_distributed:torchelastic_enable_healthcheck_before_rendezvousF)default<   )alive_callbackportr<   z>Started early health check server on port %s before rendezvousz)Failed to start early health check serverT)exc_info)r#   local_world_sizer\   rc   rdzv_handlerr,   r-   rx   ry   r2   r4   r5   r7   r8   r9   )specr   r/   r0   r:   health_check_server)namefailuresrY   )@r!   rT   uuiduuid4rE   rC   warningrp   rD   r   r   r   r&   r$   r(   r,   r-   r   root_log_dirr1   r4   r5   r6   r7   r8   r   r2   rz   rF   rG   getenvr   r   r   r   start	Exceptionr   r#   tuplerdzv_registryget_rendezvous_handlerr9   r   r/   r0   r:   set_delegate_get_alive_timer
   initialize_metricsMetricsConfigrunr	   recordget_event_succeeded	is_failedr   r   return_valuesr   shutdownr   get_event_failed)r[   r\   rc   r!   entrypoint_namerq   rx   ry   r   alive_callback_proxyhealthcheck_portr   agentshutdown_rdzvresults                  rL   r   r      s   
 = TZ\\%&&LfUUU*:t<<O
KK	F$	
/	
)	
 )	
 f3		

 fm	
 F/	
 V1	
 F/	
 F/	
  7	
 v(5	
 6-	
  !9	
 F/	
  !9	
  '(G!	
" '(G#	
'& & &P + #%}""$  
 O  2/BBK 4:3KBJ/0 y!?@@#X) ) )#	(#6#8#8 ";3)**# # #
  %%'''KKP      	( 	( 	(NNFQUNVVV"&#'   	(
 [.4[["9/JJ(0$ 2(!'!@!'!@!4  D$ $(!'!@0/  E '))%*?@@@M )"7#89K#L#LMMMe//1163KLLL 	
 #$   
 #  	)&&((((	)        e,,..0HIII   e,,..0HIII  	)&&((((	)s-   AG, ,*HHBN A:PP P&):rF   rm   r   collections.abcr   dataclassesr   r   typingr   r?   -torch.distributed.elastic.rendezvous.registrydistributedelastic
rendezvousregistryr   torch._utils_internalr   r   torch.distributed.elasticr	   r
   *torch.distributed.elastic.agent.server.apir   :torch.distributed.elastic.agent.server.health_check_serverr   :torch.distributed.elastic.agent.server.local_elastic_agentr   r   r   )torch.distributed.elastic.multiprocessingr   r   r   0torch.distributed.elastic.multiprocessing.errorsr   $torch.distributed.elastic.rendezvousr   *torch.distributed.elastic.rendezvous.utilsr   'torch.distributed.elastic.utils.loggingr   torch.numa.bindingr   __all__rO   rC   r   r   rT   rW   rp   r   rE   rz   rU   r   rY   rN   rL   <module>r      s   
			 



  $ $ $ $ $ $ ( ( ( ( ( ( ( (        E E E E E E E E E E E E E E E K K K K K K K K 5 5 5 5 5 5 5 5 A A A A A A              
         
 N M M M M M E E E E E E P P P P P P > > > > > > * * * * * * =
<
<	H		 i i i i i i i iX$H $H $H $H $H $H $H $HNX^d%: $s) PS    (&)&
3:sTz!"& & & &&^)^)3%^) s)^) 
#s(^	^) ^) ^) ^) ^) ^)rN   