
    IЦi*              
          S SK r S SKrS SKJrJr  S SKJrJrJrJ	r	J
r
JrJr  S SKJs  Js  Js  Jr  S SKJrJr  S SKJr  S SKJr  S SKJrJrJr  S SKJr  S S	K J!r!  S S
K"J#r#  S SK$J%r%  / SQr&\%" \'5      r(\ " S S5      5       r) " S S5      r*S\\\+S4   S\	\   S\+4S jr,S\!S\\
\+   \
\-   4   4S jr.S\)S\\\+S4   S\	\   S\\-\4   4S jr/g)    N)	dataclassfield)AnyCallableDictListOptionalTupleUnion)eventsmetrics)
WorkerSpec)LocalElasticAgent)DefaultLogsSpecs	LogsSpecsSignalException)ChildFailedError)RendezvousParameters)parse_rendezvous_endpoint)
get_logger)LaunchConfigelastic_launchlaunch_agentc                   *   \ rS rSr% Sr\\S'   \\S'   \\S'   Sr\\	   \S'   Sr
\\S	'   S
r\\S'   Sr\\S'   Sr\\S'   \" \S9r\\\4   \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\   \S'   \" \S9r\\\4   \S'   Sr\\   \S'   S rSrg)r   "   a  
Creates a rendezvous config.

Args:
    min_nodes: Minimum amount of nodes that the user function will
                    be launched on. Elastic agent ensures that the user
                    function start only when the min_nodes amount enters
                    the rendezvous.
    max_nodes: Maximum amount of nodes that the user function
                    will be launched on.
    nproc_per_node: On each node the elastic agent will launch
                        this amount of workers that will execute user
                        defined function.
    rdzv_backend: rdzv_backend to use in the rendezvous (zeus-adapter, etcd).
    rdzv_endpoint: The endpoint of the rdzv sync. storage.
    rdzv_configs: Key, value pair that specifies rendezvous specific configuration.
    rdzv_timeout: Legacy argument that specifies timeout for the rendezvous. It is going
        to be removed in future versions, see the note below. The default timeout is 900 seconds.
    run_id: The unique run id of the job (if not passed a unique one will be
            deduced from run environment - flow workflow id in flow - or auto generated).
    role: User defined role of the worker (defaults to "trainer").
    max_restarts: The maximum amount of restarts that elastic agent will conduct
                on workers before failure.
    monitor_interval: The interval in seconds that is used by the elastic_agent
                    as a period of monitoring workers.
    start_method: The method is used by the elastic agent to start the
                workers (spawn, fork, forkserver).
    metrics_cfg: configuration to initialize metrics.
    local_addr: address of the local node if any. If not set, a lookup on the local
            machine's FQDN will be performed.
    local_ranks_filter: ranks for which to show logs in console. If not set, show from all.
..note:
    `rdzv_timeout` is a legacy argument that will be removed in future.
    Set the timeout via `rdzv_configs['timeout']`

	min_nodes	max_nodesnproc_per_nodeN
logs_specs run_iddefault_rolerolerdzv_endpointetcdrdzv_backend)default_factoryrdzv_configsrdzv_timeout   max_restartsg?monitor_intervalspawnstart_methodlog_line_prefix_templatemetrics_cfg
local_addrc                     SnU R                   S:w  a  U R                   U R                  S'   OSU R                  ;  a  XR                  S'   U R                  c  [        5       U l        g g )Ni  r)   timeout)r*   r(   r   r   )selfdefault_timeouts     ]/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torch/distributed/launcher/api.py__post_init__LaunchConfig.__post_init__Z   sb    "+/+<+<Di(d///+:i( ??".0DO #    )r   )__name__
__module____qualname____firstlineno____doc__int__annotations__r   r	   r   r!   strr#   r$   r&   r   dictr(   r   r   r*   r,   r-   floatr/   r0   r1   r2   r8   __static_attributes__ r:   r7   r   r   "   s    #J NN&*J#*FCD#M3L##(#>L$sCx.>L#L#!e!L#.2hsm2"'"=Kc3h= $J$	1r:   r   c                   <    \ rS rSrSrS\S\\\S4   4S jr	S r
Srg)	r   f   a  
Launches an torchelastic agent on the container that invoked the entrypoint.

    1. Pass the ``entrypoint`` arguments as non ``kwargs`` (e.g. no named parameters)/
       ``entrypoint`` can be a function or a command.
    2. The return value is a map of each worker's output mapped
       by their respective global rank.

Usage

::

def worker_fn(foo):
    # ...

def main():
    # entrypoint is a function.
    outputs = elastic_launch(LaunchConfig, worker_fn)(foo)
    # return rank 0's output
    return outputs[0]

    # entrypoint is a command and ``script.py`` is the python module.
    outputs = elastic_launch(LaunchConfig, "script.py")(args)
    outputs = elastic_launch(LaunchConfig, "python")("script.py")
config
entrypointNc                     Xl         X l        g N_config_entrypoint)r5   rI   rJ   s      r7   __init__elastic_launch.__init__   s    
 %r:   c                 V    [        U R                  U R                  [        U5      5      $ rL   )r   rN   rO   list)r5   argss     r7   __call__elastic_launch.__call__   s    DLL$*:*:DJGGr:   rM   )r;   r<   r=   r>   r?   r   r   r   rB   rP   rU   rE   rF   r:   r7   r   r   f   s0    4&& (C-.&Hr:   r   rJ   rT   returnc                     [        U [        5      (       a  U R                  $ [        U [        5      (       a)  U [        R
                  :X  a  [        S U 5       S5      $ U $ g)a  Retrieve entrypoint name with the rule:
1. If entrypoint is a function, use ``entrypoint.__qualname__``.
2. If entrypoint is a string, check its value:
    2.1 if entrypoint equals to ``sys.executable`` (like "python"), use the first element from ``args``
        which does not start with hifen letter (for example, "-u" will be skipped).
    2.2 otherwise, use ``entrypoint`` value.
3. Otherwise, return empty string.
c              3   :   #    U  H  oS    S:w  d  M  Uv   M     g7f)r   -NrF   ).0args     r7   	<genexpr>'_get_entrypoint_name.<locals>.<genexpr>   s     >A#s   	r    )
isinstancer   r;   rB   sys
executablenext)rJ   rT   s     r7   _get_entrypoint_namerc      sR     *h''"""	J	$	$'>>CCr:   rdzv_parametersc                     U R                   S:w  a  gU R                  nUR                  5       nU(       d  [        S5      e[	        USS9u  p#US:X  a  [        SU S35      eX#4$ )Nstatic)NNzKEndpoint is missing in endpoint. Try to add --master-addr and --master-portr)   )default_portzport is missing in endpoint: z. Try to specify --master-port)backendendpointstrip
ValueErrorr   )rd   ri   master_addrmaster_ports       r7   _get_addr_and_portrn      s~     (*''H~~HY
 	
  9PRSKb+H:5ST
 	
 %%r:   rI   c                    U R                   (       dD  [        [        R                  " 5       R                  5      n[
        R                  SU5        X0l         [        X5      n[
        R                  SUU R                  U R                  U R                  U R                   U R                  U R                  U R                  U R                  U R                   U R"                  R$                  U R&                  S.5        [)        S
U R                  U R                  U R                   U R                  U R                  U R*                  S.U R                  D6n[-        U5      u  pg[/        U R0                  U R                  U[3        U5      [4        R6                  " U5      U R                  U R                   UUU R*                  S9
n[9        UU R"                  U R:                  U R<                  S9n	Sn
 [>        R@                  " [>        RB                  " U R&                  5      5        U	RE                  5       n[F        RH                  " U	RK                  5       5        URM                  5       (       a  [O        UURP                  S9eURR                  U
(       a  URT                  RW                  5         $ $ ! [N         a    e [X         a(    S	n
[F        RH                  " U	R[                  5       5        e [\         a&    [F        RH                  " U	R[                  5       5        e f = f! U
(       a  URT                  RW                  5         f f = f)Nz3config has no run_id, generated a random run_id: %sa  Starting elastic_operator with launch configs:
  entrypoint       : %(entrypoint)s
  min_nodes        : %(min_nodes)s
  max_nodes        : %(max_nodes)s
  nproc_per_node   : %(nproc_per_node)s
  run_id           : %(run_id)s
  rdzv_backend     : %(rdzv_backend)s
  rdzv_endpoint    : %(rdzv_endpoint)s
  rdzv_configs     : %(rdzv_configs)s
  max_restarts     : %(max_restarts)s
  monitor_interval : %(monitor_interval)s
  log_dir          : %(log_dir)s
  metrics_cfg      : %(metrics_cfg)s
)rJ   r   r   r   r!   r&   r$   r(   r,   r-   log_dirr1   )rh   ri   r!   r   r   r2   )
r#   local_world_sizerJ   rT   rdzv_handlerr,   r-   rl   rm   r2   )specr   r/   r0   T)namefailuresFrF   )/r!   rB   uuiduuid4r@   loggerwarningrc   infor   r   r   r&   r$   r(   r,   r-   r   root_log_dirr1   r   r2   rn   r   r#   tuplerdzv_registryget_rendezvous_handlerr   r/   r0   r   initialize_metricsMetricsConfigrunr   recordget_event_succeeded	is_failedr   ru   return_valuesrr   shutdownr   get_event_failed	Exception)rI   rJ   rT   r!   entrypoint_namerd   rl   rm   rs   agentshutdown_rdzvresults               r7   r   r      s   
 ==TZZ\%%&LfU*:<O
KK	1 *))))$33mm"//#11"//"// & 7 7((55!--	
< + ##%%}}""""$$ 

O  2/BK[[..4["99/J((00$$D $$((!'!@!@	E M )""7#8#89K9K#LMe//12
 #$ 
 ## &&(     e,,./ e,,./ &&( s   /BJ. .A*LL $L?)0r`   rv   dataclassesr   r   typingr   r   r   r   r	   r
   r   -torch.distributed.elastic.rendezvous.registrydistributedelastic
rendezvousregistryr}   torch.distributed.elasticr   r   *torch.distributed.elastic.agent.server.apir   :torch.distributed.elastic.agent.server.local_elastic_agentr   )torch.distributed.elastic.multiprocessingr   r   r   0torch.distributed.elastic.multiprocessing.errorsr   $torch.distributed.elastic.rendezvousr   *torch.distributed.elastic.rendezvous.utilsr   'torch.distributed.elastic.utils.loggingr   __all__r;   rx   r   r   rB   rc   r@   rn   r   rF   r:   r7   <module>r      s     ( D D D E E 5 A X 
 N E P > =	H	 @1 @1 @1F$H $HNhT)*26s),&)&
8C=(3-'(&&k)k)hT)*k) s)k) 
#s(^	k)r:   