
    Αi                    N    S SK Jr  S SKrS SKJr  SqSS jr\S:X  a  \" 5         gg)    )annotationsN)Contextc                 tX   [        5       q[        R                  5       (       a  SSKJn   U R	                  5         g[        R                  5       (       G+a  SSKnSSKnSSKnSSK	nSSK
nSSKnSSKJn  SSKJn  SSKJn	Jn
JnJnJnJnJn  SSKJn  UR                  5       n[        R4                  R6                  R9                  S5      (       d  [;        S	5      e [=        [        R4                  R6                  S
5       nUR?                  U5      nSSS5        URA                  S5      nURC                  URD                  5        URF                  RI                  URF                  RK                  [        R4                  R6                  5      URF                  RM                  [        R4                  R6                  5      RO                  S5      S    S35      nURQ                  USS9nURC                  URD                  5        URS                  S5      nURU                  U5        URW                  U5        [        R4                  RX                  R9                  S5      (       at  URZ                  R]                  S5      S:X  a-  UR^                  SSSSSS[        R4                  RX                  /nOUR^                  S[        R4                  RX                  /nOp[        R4                  RX                  R9                  S5      (       a'  UR^                  [        R4                  RX                  /nO[        R4                  RX                  /nURa                  [        R4                  Rb                  5        URe                  [        R4                  Rb                  5      n[        R4                  Rf                  (       d  SnO2[i        [        R4                  Rf                  RO                  S5      5      n[        R4                  Rj                  n[m        U[n        5      (       a  [q        URO                  S5      S   5      nO[q        U5      nUWS'   UUS '   UUS   -  US!'   UR]                  S"S5      (       d  S#S$0US"'   UR]                  S%S5      n
URF                  RI                  URF                  RK                  [        R4                  R6                  5      URF                  RM                  [        R4                  R6                  5      RO                  S5      S    S&35      n/ nSnUS':  GaB  SS(K9J:n   [        R4                  Rv                  Ry                  S)5      (       d   e[        R4                  Rv                  R{                  S)5      RO                  S5      u  n!n"U " U!U"S*9n#U#R}                  S+5        U#R                  S5        SSK@n$ U$R                  5       n%U$R                  U$R                  U%5      5      nUS,:w  d   eUS"   R]                  S-S5      (       GaP  UR                  S.5        S/U 3n&U#R                  U&U R                  S05      5      (       d:  UR                  S'5        U#R                  U&U R                  S05      5      (       d  M:  [        U#R                  S/5      5      n'[i        U'5      n(U(U:w  a`  UR                  S'5        U#R                  U&U R                  S05      5        [        U#R                  S/5      5      n'[i        U'5      n(U(U:w  a  M`  [        U' V)s/ s H  n)U)S   R                  5       PM     sn)5      nUR                  S1[i        U5       S2U S35        UR]                  S3S45      n*U*US3'   U*[        lL        S5U;  a  U*OUR]                  S55      n+UR]                  S6S5      n,UR]                  S7S5      n-UR]                  S8S5      n.S9n/U" U5      n0Sn1Sn2S:[        R4                  lM        URe                  [        5      n3UR]                  S;0 5      R]                  S<S=5      S=:X  Ga  URe                  U5      n4S>U4S"'   U" U45      n5U5R                  5       n6Sn7U1S'-
  n1S?[o        U15      -   n8U8[        R4                  lO        U6(       Ga  URe                  U35      qS@R                  U1U6S<   U6SA   U6SB   U6SC   U6SD   U6SE   U6SF   U6SG   U6SH   5
      n9U9[        R4                  lQ        U" UU6U45      n:U:[        R4                  l1        [        R                  R                  SIU8 SJU9 SKU6 35        UR                  SIU8 SJU9 SKU6 35        UR                  [        5      n;U;R                  5         U" [        R4                  R                  SLUSM   S#   [        R4                  R                   SN3SO9u  n<n=n>U>S'-  (       aK  [        R                  R                  SPU9 35        UR                  SPU9 35        S:U6SQ'   SU6USM   S#   '   U=U6S8'   U>SR-  (       aK  [        R                  R                  SSU9 35        UR                  SSU9 35        S:U6SQ'   SU6USM   S#   '   STU6S8'   U>SU-  (       aE  U>SR-  (       d;  [        R                  R                  SVU9 35        UR                  SVU9 35        SU6S8'   U>(       d  U<U6SQ'   U<U6USM   S#   '   U=U6S8'   U>S'-  (       d
  U>SR-  (       a  OU1U6SW'   U6S<   n7U0R                  " S0 U6D6  U;R                  SXSY9  U0R                  SZ5        U5R                  5       n?URe                  U?5      n6U5R                  U65        UR                  S[5        U6(       a  GM  U7c  [;        S\U4 35      eU7US;   S<'   U0R                  SZ5        U0R                  5         UR                  5       n@[        R                  R                  S]U@U-
   S^35        UR                  S]U@U-
   S^35        U" U5      nAUR                  S_[i        UAR                  R                  5       S`35        UR]                  SaU5      nBUAR                  UB5        UAR                  5       nCUAR                  UC5        SbnDUCc   WD5       eWC(       Ga  UR                  5       nEURe                  U35      qU/(       a  U+[        lL        SXn/S<WC;   a  WCS<   OUS;   S<   nFUFWCSA   -  UCSD   -  UCSF   -  nGUGUCSc'   UFUCS<'   U1S'-  n1Sd[o        U15      -   n8U8[        R4                  lO        SeR                  U1UFUCSA   UCSB   UCSC   UCSf   UCSD   UCSE   UCSF   UCSG   UCSH   UCSc   5      n9SgUC;   a  U9ShWCSg    3-   n9SiU;   aL  USi    HC  nHSjRI                  Sk UHRO                  Sl5       5       5      nIUI[o        WCUH   5      -  nIU9Sl-   UI-   n9ME     SmU;   aL  USm    HC  nHSjRI                  Sn UHRO                  Sl5       5       5      nIUI[o        WCUH   5      -  nIU9Sl-   UI-   n9ME     URF                  RI                  URF                  RK                  [        R4                  R6                  5      U95      [        R4                  lQ        U9WCSo'   U" UUCU5      nJUJ[        R4                  l1        UCR                  So5        [        R                  R                  SpU8 SJU9 35        UR                  SpU8 SJU9 35        WAR                  UC5      nKUK(       Ga?  WKnCU1UCSW'   WAR                  R                  S:5        UAR                  UC5        U0R                  c  SqWC;   a
  WCSq   U0l`        U0R                  " S0 WCD6  U0R                  USM   S#   USM   Sr   U-U.Ss9u  nLn>U>(       dH  UR                  WL5      nM[        R                  R                  StUM 35        UR                  StUM 35        O0[        R                  R                  Su5        UR                  Su5        WCR]                  Sg5      (       a  U	" WCUU0R                  5        WCSv   (       a  U2S'-  n2WCSv   nN[i        WAR                  R                  5      nOUAR                  R                  nP[        R                  R                  SwR                  UPUOUPU1-
  U2UN[        UOUP-
  U*-  Sx-  SR5      5      5        UR                  SwR                  UPUOUPU1-
  U2UN[        UOUP-
  U*-  Sx-  SR5      5      5        U0R                  U5        UAR                  5       nQURe                  UQ5      nCUAR                  UC5        GM  U" [        WCU5      q[m        [        R4                  Rj                  [p        5      (       d5  [q        [        R4                  Rj                  RO                  S5      S   5      O[        R4                  Rj                  nRU(       Ga*  USWR nSUUS;  Ga  W#R]                  SyU9 35      S   nCUR                  5       nTUC(       d_  UR                  5       nUUUWT-
  US3   Sz-   :  a  [;        S{U9 S|35      eUR                  S[5        U#R]                  SyU9 35      S   nCUC(       d  M_  UR                  S}U9 S~35        [        R                  R                  S}U9 S~35        UR                  WCR                  5       5      nCWAR                  R                  S:5        UAR                  UC5        U0R                  c  SqWC;   a
  WCSq   U0l`        U0R                  " S0 WCD6  U0R                  USM   S#   USM   Sr   U-U.Ss9u  nLn>U>(       dH  UR                  WL5      nM[        R                  R                  StUM 35        UR                  StUM 35        O0[        R                  R                  Su5        UR                  Su5        WCR]                  Sg5      (       a  U	" WCUU0R                  5        WCS   nVUV(       a  U2S'-  n2WCSv   nN[i        WAR                  R                  5      nOUAR                  R                  nP[        R                  R                  SwR                  UPUOUPU1-
  U2UN[        UOUP-
  U*-  Sx-  SR5      5      5        UR                  SwR                  UPUOUPU1-
  U2UN[        UOUP-
  U*-  Sx-  SR5      5      5        U0R                  U5        UAR                  5       nQURe                  UQ5      nCUAR                  UC5        GMj  US"   S#   S$:w  a-  SnWS-US"   ;   a  WCSD   S':X  a  SnW[        R                  SWW05        UR                  [        5      n;U;R                  5         UR                  5       nX[        UXWE-
  SR5      WCS'   [        R                  R                  SR                  U8U9UCS   5      5        UR                  SR                  U8U9UCS   5      5        U" [        R4                  R                  SLUSM   S#   [        R4                  R                   SN3SO9u  n<n=n>S9nYU>SR-  nZWRS':  GaU  SyU1 SU 3n&U" [        R4                  R                  5      n[WZ(       a  W#R                  U&STR                  S05      5      (       d9  UR                  S'5        U#R                  U&STR                  S05      5      (       d  M9  [        R                  R                  SU& 35        UR                  SU& 35        GOW[(       a  W#R                  U&SR                  S05      5      (       d9  UR                  S'5        U#R                  U&SR                  S05      5      (       d  M9  [        R                  R                  SU& 35        UR                  SU& 35        GO[        U;S5      (       a  U;R                  S:X  a  W#R                  U&SR                  S05      5      (       d9  UR                  S'5        U#R                  U&SR                  S05      5      (       d  M9  [        R                  R                  SU& 35        UR                  SU& 35        GOV[        U;S5      (       d  U;R                  R                  S:X  a  W#R                  U&SR                  S05      5      (       d9  UR                  S'5        U#R                  U&SR                  S05      5      (       d  M9  [        R                  R                  SU& 35        UR                  SU& 35        OW#R                  U&SR                  S05      5      (       d9  UR                  S'5        U#R                  U&SR                  S05      5      (       d  M9  [        R                  R                  SU& 35       
 UR                  SU& 35        [        U#R                  SyU1 S35      5      n\[i        U\5      n(U(WR:w  aB  UR                  S'5        [        U#R                  SyU1 S35      5      n\[i        U\5      n(U(WR:w  a  MB  W\ V)s/ s H  n)U)S   R                  5       PM     n]n)[        R                  R                  SU1 SU] 35        UR                  SU1 SU] 35        STU];   a  SXnYS9nZOSW];  a  SXnYSXnVU>S'-  (       aX  [        R                  R                  SU9 S35        UR                  SU9 S35        S:WCSQ'   SUCUSM   S#   '   WZ(       d  U=OSTWCS8'   S9nVU>SR-  (       aM  [        R                  R                  U9 S35        UR                  U9 S35        S:WCSQ'   SUCUSM   S#   '   STUCS8'   S9nVU>SU-  (       aP  U>SR-  (       dF  [        R                  R                  SU9 S35        UR                  SU9 S35        WZ(       d  SOSTWCS8'   WV(       d%  WY(       a  U<WCSQ'   U<UCUSM   S#   '   WZ(       d  U=OSTWCS8'   WV(       d%  WY(       d  S:WCSQ'   SUCUSM   S#   '   WZ(       d  SOSTWCS8'   USM   S#   WC;  a  SWCUSM   S#   '   SU1 SU 3n&US':  Gaa  W#R                  U&[o        WCS8   5      R                  S05      5      (       dE  UR                  S'5        U#R                  U&[o        WCS8   5      R                  S05      5      (       d  ME  [        U#R                  SU1 35      5      n\[i        U\5      n(U(U:w  aB  UR                  S'5        [        U#R                  SU1 S35      5      n\[i        U\5      n(U(U:w  a  MB  W\ V)s/ s H  n)U)S   R                  5       PM     n^n)U^ HQ  n=U=b  WCS8   c  M  U=ST:X  a  U=WCS8'     O:[        [q        [        U=5      5      [q        [        WCS8   5      5      5      UCS8'   MS    
 URZ                  R]                  SSX5      (       a  Sn_SSKJnn`  UR                  [        R4                  R                  5       Ho  u  nanbncUc Hb  ndUdRy                  S5      (       d  M  W`" [        R4                  R                  Wd5      neUeb  W_c  Wen_MF  We(       d  MO  W_(       d  MX  WeW_:  d  M`  Wen_Md     Mq     W_WCS'   U1WCSW'   SUS"   ;   Ga  SUS"   S   ;   Ga  SgWC;  Ga  WCUSM   S#      nfUS"   S   S   ngU" [        R4                  R                  SLUgS9nhUS"   S   R]                  SS/5      ni[q        US"   S   R]                  SS5      5      njUS"   S   R]                  SSX5      nk[q        UCS!   5      nl[q        US;   R]                  SS5      5      nmUf(       a,  US"   S   R]                  SSX5      (       a  [        WfWl-  SR5      OWfWCSUSM   S#    3'   Wi H  nnWk(       a	  WjS-  Wn-  noOWjSU-  Wn-  noWf(       a  Wh(       a  [        WhUhWo-   -  Wf-  S.5      OSnpUpWCSWn SlUSM   S#    3'   Up(       a,  US"   S   R]                  SSX5      (       a  [        WpWl-  SR5      OWpWCSWn SlUSM   S#    3'   U0R                  b  M  SWn SlUSM   S#    3U0l`        U0R                  WCSq'   M     SnNWVWCS'   UV(       Ga  / nNU2S'-  n2WZ(       a  WNR                  S5        GOWRS':  Gaj  SU1 SU 3n&U
" [        R4                  R                  5      nq[i        Uq5      S:  a  W#R                  U&WqR                  S0S5      5      (       d:  UR                  S'5        U#R                  U&WqR                  S0S5      5      (       d  M:  [        R                  R                  SWq SU& 35        UR                  SUq SU& 35        OW#R                  U&SR                  S05      5      (       d9  UR                  S'5        U#R                  U&SR                  S05      5      (       d  M9  [        R                  R                  SU& 35        UR                  SU& 35        [        U#R                  SU1 S35      5      n\[i        U\5      n(U(WR:w  aB  UR                  S'5        [        U#R                  SU1 S35      5      n\[i        U\5      n(U(WR:w  a  MB  W\ V)s/ s H1  n)SU)S   R                  SS5      ;  d  M  U)S   R                  5       PM3     n]n)[        [        U]5      5      nN[        R                  R                  SU1 SUN 35        UR                  SU1 SUN 35        O/WNR                  U
" [        R4                  R                  5      5        WNWCSv'   [i        WAR                  R                  5      nOUAR                  R                  nP[        R                  R                  SwR                  UPUOUPU1-
  U2UN[        UOUP-
  U*-  Sx-  SR5      5      5        UR                  SwR                  UPUOUPU1-
  U2UN[        UOUP-
  U*-  Sx-  SR5      5      5        U(       a  US   n!UU!:X  a  W#R                  SyU9 3UR                  WC5      R                  S05      5      (       dK  UR                  S'5        U#R                  SyU9 3UR                  WC5      R                  S05      5      (       d  MK  UR                  U SU9 S35        U0R                  " S0 WCD6  U0R                  USM   S#   USM   Sr   U-U.Ss9u  nLn>U>(       dH  UR                  WL5      nM[        R                  R                  StUM 35        UR                  StUM 35        O0[        R                  R                  S5        UR                  S5        WCR]                  Sg5      (       a  U	" WCUU0R                  5        U0R                  U5        U;R                  SXSY9  WAR                  5       nQURe                  UQ5      nCUAR                  UC5        [o        UR                  5       5      nr[        R                  R                  S5      (       a   UR                  S5      R                  5       nsOY[        R                  " 5       (       a   UR                  S5      R                  5       nsOUR                  S5      R                  5       nsWs H8  nt[o        UtR                  5       5      nuUuWr:w  d  M$  UR                  SWu-   5        M:     UR                  S[5        UR                  5       n@SU1 SU 3n&U,(       az  W@U-
  [q        U,5      :  ah  US':  a`  W#R                  U&SR                  S05      5      (       d9  UR                  S'5        U#R                  U&SR                  S05      5      (       d  M9  OgGOUS':  a_  W#R                  U&SR                  S05      5      (       d9  UR                  S'5        U#R                  U&SR                  S05      5      (       d  M9  US':  a  [        W#R                  SU1 35      5      n\[i        U\5      n(U(U:w  aB  UR                  S'5        [        U#R                  SU1 S35      5      n\[i        U\5      n(U(U:w  a  MB  W\ V)s/ s H  n)U)S   R                  5       PM     n]n)SU];   a  O
WC(       a  GM  U0R                  U5        SnvURe                  U35      qUS':  a  URZ                  R]                  SS5      nwUwc   eUWw:X  ao  U0R                  USM   S#   USM   Sr   U-U.Ss9u  nvn>U>(       a  [;        S5      eUR                  Wv5      nxW#R                  S+Wx5      (       d  UR                  S'5        M*  O[        S5       H@  n) W#R]                  S+5      S   R                  5       nxUR                  Ux5      nvWv(       d  M@    O   Wv(       d   eO3U0R                  USM   S#   USM   Sr   U-U.Ss9u  nvn>U>(       a  [;        S5      eWv(       a	  WvSQ   S::w  d   eUR                  5       n@[        R                  R                  SU@U-
   S^35        UR                  SU@U-
   S^35        UR]                  SS95      (       a  US"   R]                  S-S5      (       a  UR                  5         U" UWvUS9S9nJS9[        l~        UJ[        R4                  l1        S+[        R4                  lO        UR                  Uv5      nM[        R                  R                  SUM 35        UR                  SUM 35        UR]                  SS5      (       a  US   [        R4                  lQ        O`URF                  RI                  URF                  RK                  [        R4                  R6                  5      S+5      [        R4                  lQ        UR                  [        5      n;U;R                  5         U;R                  S9SY9  gSSKJn  UR                  [        5      n;U;R                  5         U;R                  5         g! , (       d  f       G*N= f!   [;        S5      e= f!   S,n G%N= fs  sn)f s  sn)f s  sn)f s  sn)f s  sn)f ! [         aL  ny[        R                  R                  Uy5        UR                  Uy5        UR                  SR5         SnyAyGNSnyAyff = f)a1  
Paddle distributed training entry ``python -m paddle.distributed.launch``.

Usage:
    .. code-block:: bash
        :name: code-block-bash1

        python -m paddle.distributed.launch [-h] [--master MASTER] [--rank RANK]
               [--log_level LOG_LEVEL] [--nnodes NNODES]
               [--nproc_per_node NPROC_PER_NODE] [--log_dir LOG_DIR]
               [--run_mode RUN_MODE] [--job_id JOB_ID] [--devices DEVICES]
               [--host HOST] [--servers SERVERS] [--trainers TRAINERS]
               [--trainer_num TRAINER_NUM] [--server_num SERVER_NUM]
               [--gloo_port GLOO_PORT] [--with_gloo WITH_GLOO]
               [--max_restart MAX_RESTART] [--elastic_level ELASTIC_LEVEL]
               [--elastic_timeout ELASTIC_TIMEOUT]
               training_script ...


Base Parameters:
    - ``--master``: The master/rendezvous server; supports ``http://`` and ``etcd://``, defaulting to ``http://``. e.g., ``--master=127.0.0.1:8080``. Default ``--master=None``.

    - ``--rank``: The rank of the node, can be auto assigned by master. Default ``--rank=-1``.

    - ``--log_level``: The log level to set for logging.setLevel which can be CRITICAL/ERROR/WARNING/INFO/DEBUG/NOTSET, case insensitive. Default ``--log_level=INFO``.

    - ``--nnodes``: The number of nodes for a distributed job, it can be a range in elastic mode, e.g., ``--nnodes=2:3``. Default ``--nnodes=1``.

    - ``--nproc_per_node``: The number of processes to launch on a node. In GPU training, it should be less than or equal to the number of GPUs on your system. e.g., ``--nproc_per_node=8``

    - ``--log_dir``: The path for each process's log. e.g., ``--log_dir=output_dir``. Default ``--log_dir=log``.

    - ``--run_mode``: The run mode of the job, can be: collective/ps/ps-heter/rpc. e.g., ``--run_mode=ps``. Default ``--run_mode=collective``.

    - ``--job_id``: The job unique id, it affects the log files' name. e.g., ``--job_id=job1``. Default ``--job_id=default``.

    - ``--devices``: The selected accelerator devices on nodes, can be gpu/xpu etc. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device.

    - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py``

    - ``training_script_args``: The args of training_script. e.g., ``--lr=0.1``

Collective Parameters:
    - ``--ips``: [DEPRECATED] Paddle cluster nodes ips, e.g., ``--ips=192.168.0.16,192.168.0.17``. Default ``--ips=127.0.0.1``.

Parameter-Server Parameters:
    - ``--servers``: User defined servers ip:port, e.g., ``--servers="192.168.0.16:6170,192.168.0.17:6170"``

    - ``--trainers``: User defined trainers ip:port, e.g., ``--trainers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172"``

    - ``--workers``: [DEPRECATED] The same as trainers.

    - ``--trainer_num``: Number of trainers on each node, can be 0.

    - ``--worker_num``: [DEPRECATED] The same as trainer_num.

    - ``--server_num``: Number of servers on each node, can be 0.

    - ``--heter_workers``: User defined heter workers ip1:port1;ip2:port2, e.g., ``--heter_workers="192.168.0.16:6172;192.168.0.17:6172"``

    - ``--heter_worker_num``: Number of heter_workers in each stage (recommended to set when emulating a distributed environment on a single node).

    - ``--heter_devices``: Type of heter_device in each stage

    - ``--gloo_port``: Gloo http Port. Default ``--gloo_port=6767``.

    - ``--with_gloo``: Using gloo or not. Default ``--with_gloo=0``.

Elastic Parameters:
    - ``--max_restart``: The maximum restart times for an elastic job. Default ``--max_restart=3``.

    - ``--elastic_level``: The elastic level: -1: disable, 0: failed exit, peers hold, 1: internal restart. Default ``--elastic_level=-1``.

    - ``--elastic_timeout``: Seconds to wait before elastic job begin to train. Default ``--elastic_timeout=30``.

IPU Parameters:
    IPU distributed launch only requires and allows three arguments ``--devices``, ``training_script`` and ``training_script_args``.
    The ``--devices`` is the number of IPU devices. e.g., ``--devices=4`` will launch the training program with four IPU devices.
    The ``training_script`` is only allowed to be set to ``ipu``.
    The ``training_script_args`` include the arguments required by IPU distributed launch, as illustrated below.
    ``Examples 10`` provides an example of paddle.distributed.launch with IPUs.

    - ``--hosts``: The hosts for IPU distributed training. Each host is able to include multiple processes.

    - ``--nproc_per_host``: The number of processes launched per host. Each process is able to include multiple replicas.

    - ``--ipus_per_replica``: The number of IPUs requested per replica. Each replica is able to include multiple IPUs.

    - ``--ipu_partition``: The partition name of IPU devices.

    - ``--vipu_server``: The ip of the IPU device manager.

    - ``training_script``: The full path to the IPU distributed training program/script to be launched in parallel. e.g., ``training.py``.

    - ``training_script_args``: The args of the IPU distributed training program/script. e.g., ``--lr=0.1``.

Returns:
    - ``None``

Examples 0 (master, ip/port auto detection):
    .. code-block:: bash
        :name: code-block-example-bash0

        # For training on multi node, run the following command in one of the nodes

        python -m paddle.distributed.launch --nnodes 2 train.py

        # Then the following info will be printed

        # Copy the following command to other nodes to run.
        # --------------------------------------------------------------------------------
        # python -m paddle.distributed.launch --master 10.0.0.1:38714 --nnodes 2 train.py
        # --------------------------------------------------------------------------------

        # Following the instructions above and pasting the command on the other nodes launches a multi-node training job.

        # There are two ways to launch a job with the same command for multi-node training
        # 1) run the following command on every node; make sure the ip belongs to one of the training nodes and the port is available on that node
        # python -m paddle.distributed.launch --master 10.0.0.1:38714 --nnodes 2 train.py
        # 2) run the following command on every node with an independent etcd service
        # python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2 train.py

        # This functionality works well for both collective and ps modes, even with other arguments.


Examples 1 (collective, single node):
    .. code-block:: bash
        :name: code-block-example-bash1

        # For training on single node using 4 gpus.

        python -m paddle.distributed.launch --devices=0,1,2,3 train.py --lr=0.01

Examples 2 (collective, multi node):
    .. code-block:: bash
        :name: code-block-example-bash2

        # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17

        # On 192.168.0.16:

        python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01

        # On 192.168.0.17:
        python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01

Examples 3 (ps, cpu, single node):
    .. code-block:: bash
        :name: code-block-example-bash3

        # To simulate distributed environment using single node, e.g., 2 servers and 4 workers.

        python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01

Examples 4 (ps, cpu, multi node):
    .. code-block:: bash
        :name: code-block-example-bash4

        # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17, where each node has 1 server and 2 workers.

        # On 192.168.0.16:

        python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01

        # On 192.168.0.17:

        python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01

        # Or with master, the following command runs 2 servers and 2 trainers on each node.

        python -m paddle.distributed.launch --master 192.168.0.16:9090 --server_num=2 --trainer_num=2 --nnodes 2 train.py


Examples 5 (ps, gpu, single node):
    .. code-block:: bash
        :name: code-block-example-bash5

        # To simulate a distributed environment using a single node, e.g., 2 servers and 4 workers, where each worker uses a single gpu.

        export CUDA_VISIBLE_DEVICES=0,1,2,3
        python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01

Examples 6 (ps, gpu, multi node):
    .. code-block:: bash
        :name: code-block-example-bash6

        # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17, where each node has 1 server and 2 workers.

        # On 192.168.0.16:

        export CUDA_VISIBLE_DEVICES=0,1
        python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01

        # On 192.168.0.17:

        export CUDA_VISIBLE_DEVICES=0,1
        python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01

Examples 7 (ps-heter, cpu + gpu, single node):
    .. code-block:: bash
        :name: code-block-example-bash7

        # To simulate distributed environment using single node, e.g., 2 servers and 4 workers, two workers use gpu, two workers use cpu.

        export CUDA_VISIBLE_DEVICES=0,1
        python -m paddle.distributed.launch --server_num=2 --worker_num=2 --heter_worker_num=2 train.py --lr=0.01

Examples 8 (ps-heter, cpu + gpu, multi node):
    .. code-block:: bash
        :name: code-block-example-bash8

        # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node with 1 server, 1 gpu worker, 1 cpu worker.

        # On 192.168.0.16:

        export CUDA_VISIBLE_DEVICES=0
        python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.17:6171" --heter_workers="192.168.0.16:6172,192.168.0.17:6172" train.py --lr=0.01

        # On 192.168.0.17:

        export CUDA_VISIBLE_DEVICES=0
        python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.17:6171" --heter_workers="192.168.0.16:6172,192.168.0.17:6172" train.py --lr=0.01

Examples 9 (elastic):
    .. code-block:: bash
        :name: code-block-example-bash9

        # With the following command, the job will begin to run immediately if 4 nodes are ready,
        # or it will run after elastic_timeout if only 2 or 3 nodes ready
        python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:4 train.py

        # once the number of nodes changes between 2:4 during training, the strategy holds

Examples 10 (ipu):
    .. code-block:: bash
        :name: code-block-example-bash10

        # With the following command, the job will begin to run the distributhed program with IPUs
        # Require `devices` as the number of IPUs
        # Require `training_script` to be set as `ipu`
        # Require `training_script_args` as the arguments of IPU distributed training instead of the arguments of the training program/script
        # Please Check the `IPU Parameters` for details
        python -m paddle.distributed.launch --devices 4 ipu --hosts=localhost --nproc_per_host=2 --ipus_per_replica=1 --ipu_partition=pod16 --vipu_server=127.0.0.1 train.py

Examples 11 (rpc, cpu, single node):
    .. code-block:: bash
        :name: code-block-example-bash11

        # Training on single node with two local servers
        python -m paddle.distributed.launch --master 127.0.0.1:8765 --nnodes 1 --nproc_per_node 2 --rank 0 --run_mode rpc train.py

Examples 12 (rpc, cpu, multi node):
    .. code-block:: bash
        :name: code-block-example-bash12

        # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node with 2 servers.

        # On 192.168.0.16

        python -m paddle.distributed.launch --master 192.168.0.16:8765 --nnodes 2 --nproc_per_node 2 --rank 0 --run_mode rpc train.py

        # On 192.168.0.17

        python -m paddle.distributed.launch --master 192.168.0.16:8765 --nnodes 2 --nproc_per_node 2 --rank 1 --run_mode rpc train.py

r   )launchN)HistoryRecorder)	AutoTuner)add_overlap_performancefind_error_from_loggen_new_argsgen_new_ctxread_completedread_logread_step_time_log)controllersz.jsonz+Please use '.json' as the file name suffix.rz0Please check your auto tuner json whether valid.
auto_tuner.z_auto_tuner.logw)modez4%(asctime)s - %(name)s - %(levelname)s - %(message)sz.pyWITH_COVERAGEONz-uz-mcoveragerunz--branchz-pz.pyxes   ,:nodesgpus_per_nodenum_gpussearch_algonamegridr   z_history.csv   )
ETCDClientzetcd://)hostportbest_cfgz	127.0.0.1estimated_num_gpus   zauto_tuner/ip/zlatin-1zThe total count of nodes is z and sorted ips are max_time_per_taski  warmup_timemax_search_timebuffermax_mem_usageT	model_cfgglobal_batch_sizeautogbs
gbs_tuner_zbJob{}_GBSSearch/GBS{}_DP{}_MP{}_PP{}_Sharding_degree_{}_stage_{}_MBS{}_Recompute_{}_granularity_{}	dp_degree	mp_degree	pp_degreesharding_degreesharding_stagemicro_batch_sizeuse_recomputerecompute_granularityz$Launch task from auto tuner: job_id z
, log_dir z	, config zworkerlog.0
metric_cfgz.gpu.log)pathmetric_filetarget_metricmemory_filez#Read metric failed for parameters: time   zOut of memory for parameters: OOM   z)Read memory usage failed for parameters: job_idF)exitz./tuner_gbs_history.csv   zRNo valid global batch size found, check memory or valid search time. cur_tuner_cfgz!AutoTuner for GBS search ends in zs.zLaunch z tasks by auto tuner: resume_csv_file_patha  No config can search. Please check if there are any situations where GBS is unable to divide dp degree or shading degree, or if there are related configurations of the model such as hidden_size cannot be evenly divided by mp degree, num_ Layers cannot divide pp degree.	acc_stepsauto_tuner_z_Job{}_GBS{}_DP{}_MP{}_PP{}_VPP{}_Sharding{}_Stage{}_MBS{}_Recompute_{}_Granularity_{}_AccStep{}
vpp_degreesharding_overlap	_Overlap_refined_recompute c              3  @   #    U  H  oR                  5       v   M     g 7fN
capitalize.0is     ^/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/launch/main.py	<genexpr>launch.<locals>.<genexpr>       &N~!||~~~   _custom_search_dimc              3  @   #    U  H  oR                  5       v   M     g 7frR   rS   rU   s     rX   rY   rZ     r[   r\   log_dir_namezLaunch task: job_id additional_metric_keyOptimizationDirection)metric	directionr-   r.   zCurrent best config: z7Get best config failed. Currently no config can be run.
error_infozaAuto Tuner Schedule: [{}/{}], Pruned nums {}, Error nums {}, Error info {}, Remaining time {} min<   zauto_tuner/   zWait z failedzReceive that task z has ended by etcd.	has_error01$FLAGS_shard_bypass_dygraph_optimizer	exec_timez(Task: job_id {}, log_dir {} ended in {}s/zPut OOM to OKz
Put OK to sigint   ErrorzPut Error to zStatus of auto_tuner/z/: zRead metric of z failed.z OOM.zRead memory usage of zauto_tuner/mem/FLAGS_log_memory_stats)read_allocated_memory_log	workerlogmax_peak_memory
conversion	step_time)r>   filer@   comm_bwd   model_size_b   ampmax_seq_lengthi   
need_unifyunified_   bw_unified_bw_zOut of memoryzauto_tuner/error/ignorezPut Error info: z to zutf-8zStatus of auto_tuner/error/z put auto_tuner/z successfully.z-Get best config failed, no config can be run.npuz<fuser -v /dev/davinci* |awk '{for(i=1;i<=NF;i++) print $i;}'z8fuser -v /dev/xpu* |awk '{for(i=1;i<=NF;i++) print $i;}'z;fuser -v /dev/nvidia* |awk '{for(i=1;i<=NF;i++) print $i;}'zkill -9 zauto_tuner/exit/errorokCOLLECTIVE_MASTER_IPzCGet best config failed. Currently there are no appropriate configs.
   zAutoTuner ended in run_best)r   zLaunch best cfg: best_cfg_dir )r   ctxis_legacy_modepaddle.distributed.fleetr   is_auto_tuner_modecopyjsonloggingossysrB   &paddle.distributed.auto_tuner.recorderr   #paddle.distributed.auto_tuner.tunerr   #paddle.distributed.auto_tuner.utilsr	   r
   r   r   r   r   r   paddle.distributed.launchr   argsauto_tuner_jsonendswith
ValueErroropenload	getLoggersetLevelINFOr>   joindirnamebasenamesplitFileHandler	FormattersetFormatter
addHandlertraining_scriptenvironget
executableextendtraining_script_argsdeepcopydeviceslennnodes
isinstancestrint+paddle.distributed.launch.utils.etcd_clientr$   master
startswithremoveprefixdeletedelete_prefixsocketgethostnamegethostbynamegetfqdnsleepputencodelist
get_prefixsorteddecodeinfor*   max_restartsearch_oncerF   formatlog_dirloggerinitr   warningadd_cfgfinalizestore_historyclean_historyalgo	all_tasksresume_form_historypopget_cfg_from_resumehistory_cfgsra   get_bestdumpshistoryidxroundloadsset_envshasattrro   pod	exit_codemaxfloatrs   walkappendsetgetpidpaddledeviceis_compiled_with_custom_devicepopen	readlinesis_compiled_with_xpustripsystemrange	ExceptionrG   r   )zr   r   r   r   r   r   rB   r   r   r	   r
   r   r   r   r   r   r   
start_timef	tuner_cfgr   auto_tuner_log_pathhandler	formatter
entrypointraw_argsr   r   r   history_file_path
sorted_ipsipr$   	master_ipr&   clientr   hostnamer>   ipssizerW   r*   r+   r,   r-   r.   is_first_taskrecorderrF   error_task_numsraw_ctxgbs_tuner_cfg	gbs_tunergbs_cur_cfgbest_gbstask_job_idr   gbs_new_argscrc   memerrgbs_new_cfgend_timer   rI   cur_cfg	error_msgtask_start_timer1   rJ   keydir_namenew_argscur_resume_cfgcur_best_cfgsto_json_strre   	task_numscur_task_idnew_cfgactual_nnodesactual_exec_ipswait_start_timewait_end_timerh   bypass_optimizer_flagtask_end_timetimeout_flagOOM_flag	completedresultstatusmem_allnodesru   rs   rootdirsfilesrx   peak_memorysingle_dp_performancestep_time_metricrw   ry   r{   r}   r   
seq_lengthbw	comm_timemulti_dp_performancesingle_error_infoself_pid	processesprocesspidr'   collective_master_ipdataesz                                                                                                                             rX   r   r      s*   \ )C
3				!	!JA	
 	
 	
 	:YY[
xx''0099JKK	Qchh..4 IIaL	 5
 ""<0% ggllGGOOCHH445ww 8 89??DQGHX
 %%&9%D&%%B
	 	Y''" 88##,,U33zz~~o.$6NNHH,,	
 "nndCHH4L4LM
XX%%..x88..#((*B*BCJ((223J#((778==!>!>? xxM 0 0 6 6s ;<Mfc""c*1-.F[F#	'%2	/" -	'0B B	*}}]D11(.'7Im$}}VT*GGLLGGOOCHH445ww 8 89??DQGHU
 
A:N88??--i8888!hhoo::9EKKCPOItYT:FMM*%  .!!--/))&..*BC $$$'++,@$GG 

1't, **TbT>>)+DEEJJqM !**TbT>>)+DEE 6,,-=>?3xfnJJqMJJtt^^I%>?v001ABCCs8D	 fn
 $C$@CqQqT[[]C$@A
23z?2CCWXbWccde
 &MM*=tD):	%& 1
 I- }- 	 $--(94@ x.!ot<"9-!--$ MM+r*../BFK !MM)4M+0M-(!-0I#//1KH aKF&V4K)CHHOmmG,~  F  F 34,,, 12 01 230 78 $+   ,k=  1=- 

:;-zRYQZZcdocpq :;-zRYQZZcdocpq  $$S)
 $,)) -"+L"9&"A#&88??"38 <	$ S &>JJ&&=gYG NN=gYG +-K'CGK	, 7 ?@36K0&>JJ&&8	B NN%CG9#MN*,K'CGK	, 7 ?@38K0 &>#.JJ&&CG9M NNCG9M 48K0*0K'CIK	, 7 ?@36K0&>SF^ )/H%&':;  /;/


&&&'@A (335"mmK8!!+. 

1C +F  hivhwx  ;CIk"#67""#<=""$yy{HJJOO3Hz4I3J"M KK3Hz4I3J"M
 y)
c*//33455KL	
  )}}"$5 
 	&&';<((*7#5 	 "-I-""iikO--(C(3%!M ''1 +,{+,?@  ";'(,-. -./  $-GK +<G'( aKF'#f+5K)CHHOw~~!$$$%)*()*+(/0$G "W,!i8J0K/L$MM"i/$%89C!ww&Nsyy~&NNHGCL 11H%mh6G :
 #i/$%89C!ww&Nsyy~&NNHGCL 11H%mh6G :
  "ww|| 8 897 CHH
 '.GN##HgyAH,4CHH)KK' JJOO&{m:gYG KK.{m:gYOP';;GDN($*!''++B/""7+22:/7:5</6H2   +7+%-%6%6$\26:'56MN!"/	 &7 &"s "&**]";KJJOO&;K=$IJKK"7} EFJJOOQ KKQ ;;122+H,<,< <(#q(O$\2

 9 9:	(oo11

w~~#!#f,'"&48IIBN
 w~~#!#f,'"&48IIBN
 &&'89$002--0""7+ c7I6C "#((//377 CHHOO))#.q12XX__ 
 ",^m"<_,$jj;wi)@A!DG&*iikO%(,		)O;'(;<rAB #-uWIW-E"FF

1"(**{7)-D"Ea"H &g KK,WI5HI JJOO,WI5HI #jj)9:G++//3&&w/ 66>3w>9@3:6 $$/w/)1):):(6v>"+L"93#  &&3 *; *&M3 &*jj&?

*?}(MN&;K=$IJ

U U {{#566/#Y0@0@ !( 4I '1,!(!6J #JOO$=$= >I",//"5"5KJJOO{  C  C'%'&0+&!!*[!8"3!4"$!% !"	 KK{  C  C'%'&0+&!!*[!8"3!4"$!% !"	 **+<=(446G"mmG4G&&w/ '/69(+%(Im,DD 12a7,/)>@U
   %AEEG IIKM#()H!#LGK JJOO:AA'+*>
 KK:AA'+*>  (XX%%)'5f="xx/x8	 FC  Lf~Hq $VHAbT2*388+;+;<	$jju||I/FGG

1 %jju||I/FGGJJOOk$$89KK+dV 45$jjt{{9/EFF

1 %jjt{{9/EFFJJOOj$78KK*TF 34Q))ahh"n$jjt{{9/EFF

1 %jjt{{9/EFFJJOOj$78KK*TF 34 H--!%%//Q2F$jjt{{9/EFF

1 %jjt{{9/EFFJJOOj$78KK*TF 34$jjw~~i/HII

1 %jjw~~i/HIIJJOOmD6$:;KK-v 67f//+fXQ0GHI6{m+JJqM!&"3"3k&4K"LMFv;D m+
 288A!A$++-8

"7xs6( KL3F83vhGHF?#(L#H'#(LIf~

""_WIX#FG	BC"$;?	,/786>3E( 	f~

""gYe#45'%01"$;?	,/78+0( 	 f~cVn

""%:7)8#LM!6wixHI7?4U("(;A	,/786>3E(\"$;?	,/787?4U(&v.g=;?	,/78$VHAbT2Dz **#go67>>yI  JJqM !**#go67>>yI  f///&0JKL6{fnJJqM!))OF81*EFF v;D fn 8>>v!!v>'C{go&>&F e|360/2E#JU7?3K-L)M0GO, ( zz~~6>>"& *,1A1A)B%D$ %#{;;$&?HH,,d' '27N.9O([__*_<2= !& *C .=)* &GH 	- 889]#;L#II&g5(/	,0G0O(P%#,]#;L#I$  /))&"2	 $M2<@DDu  #m,\:>>& 
  .|<@@Nwz23 k*../?F

 -!-0>BB$e  /(:A>
 / (9\#:6#B"CDE "B$0E$:R$?	$01$4r$9	 1Y %(94634 	 " ) - c"Qy'>v'F&GHI 0%m4\BFF(%  2X=qA
 2 %bT9\+B6+J*KL  55=)"Qy/Fv/N.OP !6 %::   78A "H J#,GK 
1$%%o6$q(!26(!B4@,?HH,,-) 01A5&,jj $ 1 8 8H M' ' !%

1	 '-jj $ 1 8 8H M' '
  JJOO"23D2ET$ P #KK"23D2ET$ P '-jjt{{97M&N&N $

1 '-jjt{{97M&N&NJJOOj,?@"KK*TF(;<!%"--0A&.KL"  #6{"m3 JJqM%) & 1 14EfXQ2O P&F $'v;D #m3 &,"%+#1Q4;;w+II *AaDKKM%+  "
 &*#f+%6


9&ZLQ 9&ZLQ #))*=chh>N>N*OP$.GL!JOO556I$//--KJJOOszz&(#"[04EEJ
 KKszz&(#"[04EEJ
 &qM	?$jj%gY/

7+229=  

1	 %jj%gY/

7+229= 
 KK2$&6wi~ NO'w'!)!2!2 .v6#L12IJ+	 "3 "M3 "jj7

"7} EF3K=AB

 OPKL {{-..'H<L<LM""#45JJEJ" !,,.GmmG,Gw' 299;'H}};;EBBHHR)+  ,,..HHN)+  HHQ)+  %'--/*(?IIj3./ % JJqMyy{H &fXQrd3DHz$9S> $ A:$jjw~~i/HII

1 %jjw~~i/HII A:$jjt{{9/EFF

1 %jjt{{9/EFF zf//2B6(0KLM6{fnJJqM!)),<VHA*FGF v;D fn 288A!A$++-8f$u gx 	01 mmG$A:#%::>>2H$#O '333)) ( 1 1$\26:'56MN!"/	 !2 !# $]  zz(+ **Z66JJqM 7 rA&%zz*5a8??A#'::d#3
  x #  x$-- .v6#L12IJ+	 . MHc  Y  HV,22299;

-h.C-DBGH)(Z*?)@CD }}Z..)M2J2N2N $3
 3
 HHJ(IM(0%$jj*

+K=9:'}56==..(8CHH!ww|| 8 89: CHH S!		


 	: S! 	
 	


]  54	QOPPV! " %A~ 9~  ?n"| 9B % &

**1-q)

1&s   $Bn. ?BnBn. 0Bn> &BohBos0BoH*BoIBo_9Boc83Bo!n
Bn+n&Bn. n+Bn. n.Bn;n>Boo!
Bp7o+ABp2p2Bp7__main__)returnNone)
__future__r   r   !paddle.distributed.launch.contextr   r   r   __name__r       rX   <module>rE     s0    #  5
~B* z
H rD  