
    Αi˒                   6   S SK Jr  S SKrS SKJrJrJr  S SKrS SK	rS SK
r
S SKJr  S SKJr  S SKJr  S SKJr  \(       a  S SKJr   " S	 S
\SS9r " S S\SS9r " S S\SS9r " S S\SS9r " S S\SS9r " S S\SS9r " S S\SS9r " S S\SS9r " S S\SS9r " S S\SS9r " S S \SS9r " S! S"\SS9r  " S# S$\SS9r! " S% S&\SS9r" " S' S(\SS9r# " S) S*\SS9r$/ r%S+q&S, r'\" \'5      r(S- r)S. r*S/ r+S0 r,S1 r- " S2 S35      r.\/r0 " S4 S55      r1g)6    )annotationsN)TYPE_CHECKINGAny	TypedDict)_global_flags)wrap_decorator)distributed_strategy_pb2)logger)BuildStrategyc                  \    \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S'   S\S	'   S\S
'   Srg)	_SyncConf!   intk_stepmax_merge_var_numsend_queue_sizeboolindependent_recv_threadthread_pool_sizesend_wait_timesruntime_split_send_recv N__name__
__module____qualname____firstlineno____annotations____static_attributes__r       r/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/fleet/base/distributed_strategy.pyr   r   !   s+    !%%!%%r    r   F)totalc                  >    \ rS rSr% S\S'   S\S'   S\S'   S\S'   Srg	)
_TrainerDescConf*   strdump_fields_path	list[str]
dump_field
dump_paramstat_var_namesr   Nr   r   r    r!   r$   r$   *   s    !!r    r$   c                  >    \ rS rSr% S\S'   S\S'   S\S'   S\S'   Srg)	_FsClientParam0   r&   uriuserpasswd
hadoop_binr   Nr   r   r    r!   r-   r-   0   s    	r    r-   c                      \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S	'   S\S
'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   Srg)_AmpConf6   floatinit_loss_scalingr   use_dynamic_loss_scalingr   incr_every_n_stepsdecr_every_n_nan_or_inf
incr_ratio
decr_ratior(   custom_white_listcustom_black_listcustom_black_varnamesuse_pure_fp16use_pure_bf16use_fp16_guardr   Nr   r   r    r!   r4   r4   6   sJ      "&&!$$$$$$((r    r4   c                  H    \ rS rSr% S\S'   S\S'   S\S'   S\S'   S	\S
'   Srg)
_QATConfigD   r   channel_wise_abs_maxr   weight_bitsactivation_bitsr(   not_quant_patternr&   algor   Nr   r   r    r!   rD   rD   D   s    ""$$	r    rD   c                  4    \ rS rSr% S\S'   S\S'   S\S'   Srg	)
_RecomputeConfigK   r(   checkpointsr   enable_offloadz	list[int]checkpoint_shaper   Nr   r   r    r!   rL   rL   K   s    ##r    rL   c                      \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S\S
'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   Srg)_ShardingConfigP   r&   sharding_segment_strategyr6   segment_broadcast_MBr(   segment_anchorsr   sharding_degreegradient_merge_acc_stepr   optimize_offload	dp_degree	mp_degree	pp_degreepp_allreduce_in_optimizeoptimize_castr   Nr   r   r    r!   rR   rR   P   sA    #&&##""!$$"&&r    rR   c                       \ rS rSr% S\S'   Srg)_PipelineConfig]   r   micro_batch_sizer   Nr   r   r    r!   r`   r`   ]   s    r    r`   c                  *    \ rS rSr% S\S'   S\S'   Srg)_TensorParallelConfig`   r   tensor_parallel_degreetensor_init_seedr   Nr   r   r    r!   rd   rd   `   s     ##r    rd   c                  \    \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   S	\S
'   Srg)_HybridConfigd   r   rZ   r[   r\   
sep_degree	cp_degreerW   r(   orderr   Nr   r   r    r!   ri   ri   d   s'    r    ri   c                  *    \ rS rSr% S\S'   S\S'   Srg)_LocalSGDConfigm   r   k_steps
begin_stepr   Nr   r   r    r!   ro   ro   m   s    r    ro   c                  *    \ rS rSr% S\S'   S\S'   Srg)_AdaptiveLocalSGDConfigq   r   init_k_stepsrr   r   Nr   r   r    r!   rt   rt   q   s    r    rt   c                  4    \ rS rSr% S\S'   S\S'   S\S'   Srg)	
_DGCConfigu   r   rampup_begin_steprampup_stepzlist[float]sparsityr   Nr   r   r    r!   rx   rx   u   s    r    rx   c                  *    \ rS rSr% S\S'   S\S'   Srg)_GradientMergeConfigz   r   rq   r   avgr   Nr   r   r    r!   r~   r~   z   s    	r    r~   c                  >    \ rS rSr% S\S'   S\S'   S\S'   S\S'   Srg	)
_LarsConfig~   r6   
lars_coefflars_weight_decayepsilonr(   exclude_from_weight_decayr   Nr   r   r    r!   r   r   ~   s      #,,r    r   c                  *    \ rS rSr% S\S'   S\S'   Srg)_LambConfig   r6   lamb_weight_decayr(   r   r   Nr   r   r    r!   r   r      s      #,,r    r   Tc                   ^  U 4S jnU$ )Nc                    > Sq T" U 0 UD6$ )NF)non_auto_func_called)argskwargsfuncs     r!   __impl__*__non_auto_func_called__.<locals>.__impl__   s    $T$V$$r    r   )r   r   s   ` r!   __non_auto_func_called__r      s    %
 Or    c                T   / nU  H  nUR                   R                  n0 nU Hm  n[        X%R                  5      nUR                  [
        R                  R                  R                  R                  :X  a  [        U5      nXdUR                  '   Mo     UR                  U5        M     U$ N)
DESCRIPTORfieldsgetattrnamelabelgoogleprotobuf
descriptorFieldDescriptorLABEL_REPEATEDlistappend)msgres_listitemr   res_dictfvs          r!   get_repeated_msg_dictr      s    H''Aff%A??--==LLM G QVV  	!  Or    c                   0 nU R                   R                  nU H  n[        XR                  5      nUR                  [
        R                  R                  R                  R                  :X  aS  UR                  [
        R                  R                  R                  R                  :w  a  [        U5      nO[        U5      nXAUR                  '   M     U$ r   )r   r   r   r   r   r   r   r   r   r   typeTYPE_MESSAGEr   r   )r   r   r   r   r   s        r!   get_msg_dictr      s    H^^""FC  77foo00@@OOO??--==JJK G)!,   Or    c                |   U GH5  nU R                  5       nUR                  R                  nU GH  nX%R                  :X  d  M  UR                  S:X  a  XR                     b  [        XR                  5      nUR                  [        R                  R                  R                  R                  :w  a  UR                  XR                     5        M  [        X1UR                     5        M  M  UR                  S:X  d  UR                  S:X  d  M  [        XR                  XR                     5        GM     GM8     g N         )addr   r   r   r   r   r   r   r   r   r   r   extendassign_configs_valuesetattr)r   configkeynew_itemr   r   s         r!   assign_repeated_msgr      s    779$$++Aff} 77a<ff~1#*3#7FF%99IIVVW %OOF66N;0!&&>J 2 WW\QWW\C8!  r    c                \   U R                   R                  nU GH  nU GH  nX4R                  :X  d  M  UR                  S:X  a  XR                     b  [	        XR                  5      nUR
                  [        R                  R                  R                  R                  :w  a  UR                  XR                     5        M  [        XQUR                     5        M  M  UR                  S:X  d  UR                  S:X  d  M  [        XR                  XR                     5        GM     GM     g r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   s         r!   r   r      s    ^^""FAff} 77a<ff~1#*3#7 FF%99IIVVW %OOF66N;/.I 2 WW\QWW\C8#  r    c                    U R                   R                  R                  5       nU H  nXC;   a  M
   SU SU 35       e   g )Nzkey:z not in )r   fields_by_namekeys)r   r   
field_namekey_listr   s        r!   check_configs_keyr      s@    ~~,,113H@$se8J< @@ r    c                  T    \ rS rSrSrS rS rS rS rS r	S r
S	 rS
 rS rS rSrg)DistributedJobInfo   z~
DistributedJobInfo will serialize all distributed training information
Just for inner use: 1) debug 2) replicate experiments
c                8    [         R                  " 5       U l        g r   )r	   r   job_infoselfs    r!   __init__DistributedJobInfo.__init__   s    0CCEr    c                $    XR                   l        g r   )r   
worker_num)r   r   s     r!   _set_worker_num"DistributedJobInfo._set_worker_num       #- r    c                $    XR                   l        g r   )r   
server_num)r   r   s     r!   _set_server_num"DistributedJobInfo._set_server_num  r   r    c                N    U R                   R                  R                  U5        g r   )r   
worker_ipsr   )r   r   s     r!   _set_worker_ips"DistributedJobInfo._set_worker_ips  s      ''
3r    c                N    U R                   R                  R                  U5        g r   )r   server_endpointsr   )r   r   s     r!   _set_server_endpoints(DistributedJobInfo._set_server_endpoints  s    &&--.>?r    c                8    [        U5      U R                  l        g r   )r&   r   origin_startup)r   origin_startup_progs     r!   _set_origin_startup&DistributedJobInfo._set_origin_startup
  s    '*+>'?$r    c                8    [        U5      U R                  l        g r   )r&   r   origin_main)r   origin_main_progs     r!   _set_origin_main#DistributedJobInfo._set_origin_main  s    $'(8$9!r    c                8    [        U5      U R                  l        g r   )r&   r   distributed_main)r   distributed_main_progs     r!   _distributed_main$DistributedJobInfo._distributed_main  s    ),-B)C&r    c                $    XR                   l        g r   )r   optimizer_name)r   r   s     r!   _optimizer_name"DistributedJobInfo._optimizer_name  s    '5$r    c                $    XR                   l        g r   )r   strategy)r   dist_strategys     r!   _set_distributed_strategy,DistributedJobInfo._set_distributed_strategy  s    !.r    )r   N)r   r   r   r   __doc__r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r   r      s=    
F..4@@:D6/r    r   c                  (   \ rS rSrSrSS jrSS jrSS jrSS jr\	SS j5       r
\
R                  \SS j5       5       r
\	SS	 j5       r\R                  \SS
 j5       5       r\	SS j5       r\R                  \SS j5       5       r\	SS j5       r\R                  \SS j5       5       r\	SS j5       r\	SS j5       r\R                  \SS j5       5       r\R                  \SS j5       5       r\	SS j5       r\R                  \SS j5       5       r\	SS j5       r\R                  \SS j5       5       r\R                  SS j5       r\	SS j5       r\R                  \SS j5       5       r\	SS j5       r\R                  \SS j5       5       r\	SS j5       r\R                  \SS j5       5       r\	SS j5       r\R                  \SS j5       5       r\	SS  j5       r\R                  SS! j5       r\	SS" j5       r\R                  \SS# j5       5       r\	SS$ j5       r\R                  \SS% j5       5       r\	SS& j5       r\R                  \SS' j5       5       r\	SS( j5       r\R                  \SS) j5       5       r\	SS* j5       r\R                  \SS+ j5       5       r\	SS, j5       r\R                  \SS- j5       5       r\	SS. j5       r \ R                  \SS/ j5       5       r \	SS0 j5       r!\!R                  \SS1 j5       5       r!\	SS2 j5       r"\"R                  \SS3 j5       5       r"\	SS4 j5       r#\#R                  \SS5 j5       5       r#\	SS6 j5       r$\$R                  \SS7 j5       5       r$\	SS8 j5       r%\%R                  \SS9 j5       5       r%\	SS: j5       r&\&R                  \SS; j5       5       r&\	SS< j5       r'\'R                  \SS= j5       5       r'\	SS> j5       r(\(R                  \SS? j5       5       r(\	SS@ j5       r)\)R                  \SSA j5       5       r)\	SSB j5       r*\*R                  \SSC j5       5       r*\	SSD j5       r+\+R                  \SSE j5       5       r+\	SSF j5       r,\	SSG j5       r-\-R                  \SSH j5       5       r-\	SSI j5       r.\.R                  \SSJ j5       5       r.\,R                  \SSK j5       5       r,\	SSL j5       r/\/R                  \SSM j5       5       r/\	SSN j5       r0\0R                  \SSO j5       5       r0\	SSP j5       r1\1R                  \SSQ j5       5       r1\	SSR j5       r2\2R                  SSS j5       r2\	SST j5       r3\3R                  \SSU j5       5       r3\	SSV j5       r4\4R                  \SSW j5       5       r4\	SSX j5       r5\5R                  \SSY j5       5       r5\	SSZ j5       r6\6R                  \    SS[ j5       5       r6\	SS\ j5       r7\7R                  \SS] j5       5       r7\	SS^ j5       r8\8R                  \SS_ j5       5       r8\	SS` j5       r9\9R                  \SSa j5       5       r9\	SSb j5       r:\:R                  \SSc j5       5       r:\	SSd j5       r;\;R                  \SSe j5       5       r;\	SSf j5       r<\<R                  \SSg j5       5       r<\	SSh j5       r=\=R                  \SSi j5       5       r=\	SSj j5       r>\>R                  \SSk j5       5       r>\	SSl j5       r?\?R                  \SSm j5       5       r?\	SSn j5       r@\@R                  \SSo j5       5       r@\	SSp j5       rA\AR                  SSq j5       rA\	SSr j5       rB\BR                  SSs j5       rB\	SSt j5       rC\CR                  SSu j5       rC\	SSv j5       rD\DR                  SSw j5       rD\	SSx j5       r\R                  SSy j5       r\	SSz j5       r\R                  SS{ j5       r\	SS| j5       rE\ER                  SS} j5       rE\	SS~ j5       rF\FR                  \SS j5       5       rF\	SS j5       rG\GR                  \SS j5       5       rG\	SS j5       rH\HR                  \SS j5       5       rHSS jrISS jrJSS jrKSrLg)DistributedStrategyi  Fc                   [         R                  " 5       U l        Sn[        5       R	                  U5      (       a%  [        [        5       U   5      U R                  l        Sn[        5       R	                  U5      (       a%  [        [        5       U   5      U R                  l        Sn[        5       R	                  U5      (       a%  [        [        5       U   5      U R                  l	        Sn[        5       R	                  U5      (       a%  [        [        5       U   5      U R                  l
        / SQU l        / SQU l        SU l        [        R                  " S5        g	)
a  

DistributedStrategy is the main configuration entry for distributed training of Paddle.
All of the distributed training configurations can be configured in DistributedStrategy,
such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS),
asynchronous update parameter server(ASGD), etc.

DistributedStrategy can be serialized into protobuf file or deserialized from protobuf file

Users who run local training usually configure BuildStrategy, and
DistributedStrategy supports configurations from BuildStrategy.

(FLAGS_cudnn_batchnorm_spatial_persistentFLAGS_conv_workspace_size_limitFLAGS_cudnn_exhaustive_searchFLAGS_sync_nccl_allreduce)dpppshardingsepcpmp)	embedding
layer_normz.b_Tz distributed strategy initializedN)r	   r   r   r   	is_publicr   "cudnn_batchnorm_spatial_persistentr   conv_workspace_size_limitcudnn_exhaustive_searchsync_nccl_allreducehybrid_parallel_ordersync_param_name_DistributedStrategy__lock_attrr
   info)r   r   s     r!   r   DistributedStrategy.__init__   s    6IIK 9?$$S))EI$FDMM< 0?$$S));>$<DMM3 .?$$S)):>$;DMM1 *?$$S))6:=?3;O6PDMM-1
" +M67r    c                    U R                   (       a4  [        X5      (       d$  [        U SU R                  R                   35      e[
        R                  XU5        g )Nz is not a attribute of )r  hasattr	TypeError	__class__r   object__setattr__)r   r   values      r!   r  DistributedStrategy.__setattr__Q  sN    GD$6$6%.t~~/F/F.GH  	4e,r    c                    [        US5       nUR                  [        U R                  5      5        SSS5        g! , (       d  f       g= f)a  

Serialize current DistributedStrategy to string and save to output file

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.dgc = True
        >>> strategy.recompute = True
        >>> strategy.recompute_configs = {"checkpoints": ["x"]}
        >>> strategy.save_to_prototxt("dist_strategy.prototxt")

wN)openwriter&   r   )r   outputfouts      r!   save_to_prototxt$DistributedStrategy.save_to_prototxtX  s1      &#$JJs4==)* s	   %;
A	c                    [        US5       n[        R                  R                  R	                  [        UR                  5       5      U R                  5      U l        SSS5        g! , (       d  f       g= f)a  

Load from prototxt file for DistributedStrategy initialization

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.dgc = True
        >>> strategy.recompute = True
        >>> strategy.recompute_configs = {"checkpoints": ["x"]}
        >>> strategy.save_to_prototxt("dist_strategy.prototxt")

        >>> strategy.load_from_prototxt("dist_strategy.prototxt")

rN)r  r   r   text_formatMerger&   readr   )r   pb_filer   s      r!   load_from_prototxt&DistributedStrategy.load_from_prototxtk  sK    $ '31"OO77==AFFHt}}DM  s   AA''
A5c                   [         R                  R                  5       nU R                  R                  R
                  R                  nU H|  n[        U R                  R                  UR                  5      nUR                  S:X  a)  [         R                  R                  R                  U5      n[        XR                  U5        M~     U$ )a$  

Configure BuildStrategy for DistributedStrategy
Note that the properties of BuildStrategy are valid in DistributedStrategy
only if the property is non-distributed strategy.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> build_strategy = paddle.static.BuildStrategy()
        >>> build_strategy.fuse_elewise_add_act_ops = True
        >>> build_strategy.fuse_bn_act_ops = True
        >>> build_strategy.enable_auto_fusion = True
        >>> build_strategy.fuse_relu_depthwise_conv = True
        >>> build_strategy.fuse_broadcast_ops = True
        >>> build_strategy.fuse_all_optimizer_ops = True
        >>> build_strategy.enable_inplace = True

        >>> strategy = paddle.distributed.fleet.DistributedStrategy()
        >>> strategy.build_strategy = build_strategy

reduce_strategy)paddlestaticr   r   build_strategyr   r   r   r   ReduceStrategyr   )r   r*  r   r   r  s        r!   r*  "DistributedStrategy.build_strategy  s    4  446--88??ADMM88!&&AEvv**33BB5INFFE2	 
 r    c                    U R                   R                  R                  R                  nU H  nUR                  S:X  d  UR                  S:X  a]  [        XR                  5      nUR                  S:X  a  [        U5      n[        U R                   R                  UR                  U5        M  UR                  S:X  d  M  [        U R                   R                  UR                  5      R                  [        XR                  5      5        M     g )Nr   r   r'  r   )
r   r*  r   r   r   r   r   ReduceStrategyFleetr   r   )r   r   r   r   r  s        r!   r*  r,    s     --88??Aww!|qww!|&&166../6E44affeDA44aff=DDHff- r    c                @    [        U R                  R                  5      $ )a:  

Set the strategy of gradient scale

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.gradient_scale_configs = {'scale_strategy': 'avg'}

Note that, strategy must be in 'avg', 'sum' or 'customized'

)r   r   gradient_scale_configsr   s    r!   r0  *DistributedStrategy.gradient_scale_configs  s      DMM@@AAr    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )Nr0  )r   r   r0  r   )r   r   s     r!   r0  r1    s5     	MM00$	

 	T]]AA6Jr    c                .    U R                   R                  $ )a  

Indicating whether we are using asynchronous stochastic gradient descent updates
for training. This property is valid when we are using parameter server training,
which is implied by setting appropriate RoleMaker
Default value: True

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> role_maker = fleet.PaddleCloudRoleMaker()
        >>> fleet.init(role_maker)

        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.a_sync = True  # by default this is True

        >>> # code block for defining loss and local optimizer
        >>> # sgd = fleet.distributed_optimizer(optimizer, strategy)

)r   a_syncr   s    r!   r4  DistributedStrategy.a_sync  s    . }}###r    c                    [        U[        5      (       a  XR                  l        SS0U l        g [        S[        U5       35      e)Nrq   r   CThe type of `flag` is invalid, expected type is bool, but received )
isinstancer   r   r4  a_sync_configs
ValueErrorr   r   flags     r!   r4  r5    sF     dD!!#'MM #,a.DUVZ[_V`Uab r    c                @    [        U R                  R                  5      $ )a  

Set a_sync update configurations. In general, asynchronous parameter server
training has several configurable settings that can be configured through
a dict.

**Notes**:
    k_step(int): number of local optimization updates before communication

    max_merge_var_num(int): maximum number of merged gradients before communication

    send_queue_size(int): a buffer size of worker communication

    independent_recv_thread(bool): if we are using independent recv thread for communication

    thread_pool_size(int): number of thread pool

    send_wait_times(int): waiting time for sending gradients

    runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> role_maker = fleet.PaddleCloudRoleMaker()
        >>> fleet.init(role_maker)

        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.a_sync = True  # by default this is True
        >>> configs = {"k_steps": 1024, "send_queue_size": 32}
        >>> strategy.a_sync_configs = configs

        >>> # code block for defining loss and local optimizer
        >>> # sgd = fleet.distributed_optimizer(optimizer, strategy)

)r   r   r9  r   s    r!   r9  "DistributedStrategy.a_sync_configs  s    N DMM8899r    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )Nr9  )r   r   r9  r   r   configss     r!   r9  r>    s4     	MM(('3C	
 	T]]997Cr    c                @    [        U R                  R                  5      $ )a  

Set trainer desc configurations.

**Notes**:
    dump_fields_path(str): the path of dump fields

    dump_fields(list(str)): the fields that you want to dump

    dump_param(list(str)): the param that you want to dump

    stat_var_names(list(str)):

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> role_maker = fleet.PaddleCloudRoleMaker()
        >>> fleet.init(role_maker)

        >>> strategy = fleet.DistributedStrategy()
        >>> configs = {"dump_fields_path": "./dump_data", "dump_fields": ["xxx", "yyy"]}
        >>> strategy.trainer_desc_configs = configs

        >>> # code block for defining loss and local optimizer
        >>> # sgd = fleet.distributed_optimizer(optimizer, strategy)

)r   r   trainer_desc_configsr   s    r!   rC  (DistributedStrategy.trainer_desc_configs%  s    < DMM>>??r    c                .    U R                   R                  $ )a  

set adam_d2sum
Default value: False

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> role_maker = fleet.PaddleCloudRoleMaker()
        >>> fleet.init(role_maker)

        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.adam_d2sum = True  # by default this is False

        >>> # code block for defining loss and local optimizer
        >>> # sgd = fleet.distributed_optimizer(optimizer, strategy)

)r   
adam_d2sumr   s    r!   rF  DistributedStrategy.adam_d2sumE  s    * }}'''r    c                |    [        U[        5      (       a  XR                  l        g [	        S[        U5       35      e)Nr7  )r8  r   r   rF  r:  r   r;  s     r!   rF  rG  \  s:     dD!!'+MM$UVZ[_V`Uab r    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )NrC  )r   r   rC  r   r@  s     r!   rC  rD  f  s4     	MM..9O	
 	T]]??Ir    c                .    U R                   R                  $ )a  

Set fs client configurations.

Note:
    uri(str): the uri of fs client

    user(str): the user_name of fs client

    passwd(str): the passwd of fs client

    hadoop_bin(str):

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> role_maker = fleet.PaddleCloudRoleMaker()
        >>> fleet.init(role_maker)
        >>> strategy = fleet.DistributedStrategy()
        >>> configs = {"uri": "xxx", "user": "xxx", "passwd": "xxx"}
        >>> strategy.fs_client_param = configs
        >>> # code block for defining loss and local optimizer
        >>> # sgd = fleet.distributed_optimizer(optimizer, strategy)

)r   fs_client_paramr   s    r!   rK  #DistributedStrategy.fs_client_paramn  s    8 }},,,r    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )NrK  )r   r   rK  r   r@  s     r!   rK  rL    s4     	MM))74E	
 	T]]::GDr    c                .    U R                   R                  $ r   )r   downpour_table_paramr   s    r!   sparse_table_configs(DistributedStrategy.sparse_table_configs  s    }}111r    c                  ^^ SSK Jm  U R                  R                  n S         SUU4S jjjmU(       d  [        R
                  " S5        g U H(  nUR                  5       nX4l        T" USU-   X   5        M*     g )Nr   )r   c                t  > U R                   R                   GH  nUS-   UR                  -   nUR                  T	R                  :X  a  [
        R                  " SU 35        UR                  T	R                  :X  an  US-   U;  a  Mn  X%S-      n[
        R                  " SU SU 35        [        U5       H/  n[        XR                  5      R                  5       nT
" XX'5        M1     M  T
" [        XR                  5      XR5        M  [
        R                  " SU5        XR;  a  GM  UR                  T	R                  :X  a)  [        XR                  5      R                  X%   5        GMQ  [        X%   5      [        :X  a  [        XR                  X%   U   5        GM  [        XR                  X%   5        GM     g )N.z	message: z.numzmessage num:  znot message: %s)r   r   r   r   r   r
   debugr   r   ranger   r   r   r   r   )r   config_namerA  indexfieldr   numidatar   set_table_configs            r!   r^  BDistributedStrategy.sparse_table_configs.<locals>.set_table_config  sW    .."S(5::5::!=!==LL9TF!34{{o&D&DD&=7$%Vm4}TF!C5%AB!&sA#*3

#;#?#?#AD,TD ", )#C4d LL!2D9* {{o&D&DDZZ077F.$6#CW]55IJ#CW]C5 /r    ztable configs is emptyztable_parameters.)r   )
r   r&   rX  r&   rA  dict[str, Any]rY  r   returnNone)google.protobuf.descriptorr   r   rO  r
   r  r   
table_name)r   rA  table_paramrd  
table_datar   r^  s        @@r!   rP  rQ    s     	?mm88 OP	D	D#&	D1?	DHK	D	D 	D> KK01%
(__.
(2% '*4' &r    c                T  ^^^^^	^
 / SQm	SS/m
/ SQmU R                   R                  nS mS mUUUUU	U
4S jnT(       d  [        R                  " S5        g T HK  nUS	:X  d  US
:X  a  M  [	        TU   5      [
        :w  a  M)  UR                  5       nXEl        U" UTU   5        MM     g )N)1sparse_table_classsparse_compress_in_savesparse_shard_numsparse_accessor_classsparse_learning_ratesparse_initial_g2sumsparse_initial_rangesparse_weight_boundssparse_fea_dimsparse_embedx_dimsparse_embedx_thresholdsparse_nonclk_coeffsparse_click_coeffsparse_base_thresholdsparse_delta_thresholdsparse_delta_keep_dayssparse_delete_after_unseen_dayssparse_show_click_decay_ratesparse_delete_thresholdsparse_convertersparse_deconvertersparse_enable_cachesparse_cache_ratesparse_cache_file_numsparse_beta1_decay_ratesparse_beta2_decay_ratesparse_ada_epsilonsparse_optimizersparse_ssd_unseenday_thresholdembed_sparse_optimizerembed_sparse_learning_rateembed_sparse_weight_boundsembed_sparse_initial_rangeembed_sparse_initial_g2sumembed_sparse_beta1_decay_rateembed_sparse_beta2_decay_rateembedx_sparse_optimizerembedx_sparse_learning_rateembedx_sparse_weight_boundsembedx_sparse_initial_rangeembedx_sparse_initial_g2sumembedx_sparse_beta1_decay_rateembedx_sparse_beta2_decay_ratefeature_learning_ratenodeid_slotsparse_load_filter_slotssparse_save_filter_slotssparse_zero_inituse_gpu_graphDownpourSparseTableDownpourSparseSSDTable)DownpourSparseValueAccessorDownpourCtrAccessorDownpourCtrDoubleAccessorDownpourUnitAccessorDownpourDoubleUnitAccessorDownpourCtrDymfAccessorc                `    UR                  SS5      U l        UR                  SS5      U l        g )Nr  皙?r  i0#  )getr  r  )graphr   s     r!   add_graph_config@DistributedStrategy.fleet_desc_configs.<locals>.add_graph_config  s-    *2,,'+E' !)]D AEr    c                $
   UR                  US-   S5      nX0l        US:X  a  SU l        UR                  US-   S5      U R                  l        UR                  US-   S5      U R                  l        UR                  US	-   S
S/5      nU R                  R
                  R                  U5        g US:X  a  SU l        UR                  US-   S5      U R                  l        UR                  US-   S5      U R                  l        US:X  a  SU R                  l        UR                  US-   S5      U R                  l        UR                  US	-   S
S/5      nU R                  R
                  R                  U5        g US:X  a  SU l        UR                  US-   S5      U R                  l        UR                  US-   S5      U R                  l        US:X  a  SU R                  l        UR                  US-   S5      U R                  l        UR                  US	-   S
S/5      nU R                  R
                  R                  U5        g US:X  a  SU l        UR                  US-   S5      U R                  l        UR                  US-   S5      U R                  l        US:X  a  SU R                  l        UR                  US-   S5      U R                  l        UR                  US	-   S
S/5      nU R                  R
                  R                  U5        g US:X  a  SU l        UR                  US-   S5      U R                  l        UR                  US-   S5      U R                  l        UR                  US-   S5      U R                  l
        UR                  US-   S5      U R                  l        UR                  US-   S5      U R                  l        UR                  US	-   S
S/5      nU R                  R
                  R                  U5        g US:X  a  SU l        UR                  US-   S5      U R                  l        UR                  US-   S5      U R                  l        UR                  US-   S5      U R                  l
        UR                  US-   S5      U R                  l        UR                  US-   S5      U R                  l        UR                  US	-   S
S/5      nU R                  R
                  R                  U5        g g ) Nr  adagradnaiveSparseNaiveSGDRulerl  r  rn  g-C6?ro  i
   SparseAdaGradSGDRuleembed_r   rm  r   
adagrad_v2SparseAdaGradV2SGDRulestd_adagradStdAdaGradSGDRuleadamSparseAdamSGDRulegMbP?r  g?r  g+?r  g:0yE>shared_adamSparseSharedAdamSGDRule)r  r   r  learning_rateinitial_rangeweight_boundsr   r  initial_g2sumr  beta1_decay_ratebeta2_decay_rateada_epsilon)sgdr   prefixr   boundss        r!   sparse_optimizer_configGDistributedStrategy.fleet_desc_configs.<locals>.sparse_optimizer_config  s   %\\++YN &H(/*2,,33T+		' +3,,33T+		' "33c2Y 		''..v69,1,4LL33T-) -5LL33T-) X%01CKK-,4LL33Q-) "33c2Y ))008</3,4LL33T-) -5LL33T-) X%01CKK-,4LL33Q-) "33c2Y ))008=0.,4LL33T-) -5LL33T-) X%01CKK-,4LL33Q-) "33c2Y ))0086).)133U*& *233T*& -5LL66-) -5LL66-) (0||114($ "33c2Y &&--f5=04)133U*& *233T*& -5LL66-) -5LL66-) (0||114($ "33c2Y &&--f5) 1r    c                b  > U H  nUT;  d  M  [        SU S35      e   UR                  SS5      nUT;  a  [        SU 35      eUS:X  a  SU l        OSU l        UR                  S	S
5      U l        UR                  SS5      U l        UR                  SS5      U l        UR                  SS5      U l        UR                  SS5      U l        UR                  SS5      nUT;  a  [        SU 35      eUR                  S5      S:  a  SU R                  l
        O8UR                  S5      S:  a  SU R                  l
        OSU R                  l
        TR                  SS5      (       d  SU R                  l
        UR                  SS5      U R                  l        U R                  R                  S -   U R                  l        UR                  S!S"5      U R                  l        US#:X  a  SU R                  R                  l        OSU R                  R                  l        UR                  S$S%5      U R                  R                  l        UR                  S&S'5      U R                  R                  l        UR                  S(S)5      U R                  R                  l        UR                  S*S+5      U R                  R                  l        UR                  S,S5      U R                  R                  l        UR                  S-S.5      U R                  R                  l        UR                  S/S05      U R                  R                  l        UR                  S1S25      U R                  R                  l        UR                  S3S'5      U R                  R                  l        UR                  S4/ 5      nU R                  R                  R2                  R5                  U5        UR                  S5/ 5      nU R                  R                  R6                  R5                  U5        UR                  S6S5      U R                  R                  l        U R                  SL a  SU R                  R                  l        UR                  S7S85      nUR                  S9S85      nU R                  R:                  R=                  5       n	S'U	l        Xyl         Xl!        U R                  R:                  R=                  5       n
S:U
l        Xzl         Xl!        US:X  d  US;:X  a=  T" U R                  RD                  US85        T" U R                  RF                  US85        O<T" U R                  RD                  US<5        T" U R                  RF                  US=5        T" U R                  RH                  U5        g )>Nzstrategy key 'z' not supportrh  r  zXsupport sparse_table_class: ['DownpourSparseTable, DownpourSparseSSDTable'], but actual r  SSDSparseTableMemorySparseTablerj  i  r}  Tr~  g/nB?r     r  Frk  r  zsupport sparse_accessor_class: ['DownpourSparseValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', 'DownpourDoubleUnitAccessor', 'DownpourCtrDymfAccessor'], but actual Doubler   CtrDoubleAccessorDymfCtrDymfAccessorCtrCommonAccessoruse_cvmSparseAccessorrq     r   rr  r  r  rs  g?rt  r   ru  g      ?rv  g      ?rw  ry  g\(\?rz  g?rx     r  r  r  r  r{   r|  r   r  r  embedx_)%r:  r  table_class	shard_numenable_sparse_table_cachesparse_table_cache_ratesparse_table_cache_file_numr  findaccessoraccessor_class
embedx_dimfea_dimembedx_thresholdctr_accessor_param
show_scalenonclk_coeffclick_coeffbase_thresholddelta_thresholddelta_keep_daysshow_click_decay_ratedelete_thresholddelete_after_unseen_daysssd_unseenday_thresholdload_filter_slotsr   save_filter_slots	zero_inittable_accessor_save_paramr   param	converterdeconverterembed_sgd_paramembedx_sgd_paramgraph_sgd_param)rf  r   r   r  r  r  r  r  r  
save_data1
save_data2r  rA  r  support_sparse_accessor_classsupport_sparse_key_listsupport_sparse_table_classs              r!   set_sparse_table_configGDistributedStrategy.fleet_desc_configs.<locals>.set_sparse_table_config  sU   55$~cU-%HII  !**$&;K "<< nozn{|  66)9
&)<
&#)::.@$#GJ 39::%t4J0 28#W2J. 6<ZZ'6J2 (.zz/5'IJ$#ZZ')>N %BB  g  hv  gw  x  ""8,15H
##2$$V,15F
##25H
##2;;y$//5E
##2-3ZZ8KQ-OJ**4*=*=*H*H1*LJ'39::)24J0 !77DI
##66ADH
##66ABH**%sCJ22? BH$aBJ22> EKJJ'EJ22A FLZZ($FJ22B FLZZ("FJ22B 

94@ 22H 

4c: 22C 

<bA 22K 

;Q? 22J !'

+Er J22DDKK! !'

+Er J22DDKK! @Fzz"D@J22< ''4/CH
##66@

#5r:I **%92>K#,,FFJJLJ J#, %0"#,,FFJJLJ J#, %0" "77!%@@'''77 (''88&" (''77 (''88&) Z00@@&Ir    zfleet desc config is emptydense_tabledatanorm_table)r   rO  r
   r  r   dictr   rd  )r   rA  re  r  rd  rf  r  r  r  r  r  s    `    @@@@@r!   fleet_desc_configs&DistributedStrategy.fleet_desc_configs  s    2#
h "$&
")
% mm88	Bm	6^	J 	JB KK45%
-/!%55
+,4(__.
(2%'
GJ4GH &r    c                .    U R                   R                  $ )a%  
Indicating whether we are using automatic mixed precision training
Default Value: False

Examples:

    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.amp = True # by default this is false

)r   ampr   s    r!   r  DistributedStrategy.amp       }}   r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz"amp should have value of bool type)r8  r   r   r  r
   warningr;  s     r!   r  r  #  (     dD!! $MMNN?@r    c                @    [        U R                  R                  5      $ )a  

Set automatic mixed precision training configurations. In general, amp has several configurable
settings that can be configured through a dict.

**Notes**:
    init_loss_scaling(float): The initial loss scaling factor. Default 32768.

    use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. Default True.

    incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients. Default 1000.

    decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients. Default 2.

    incr_ratio(float): The multiplier to use when increasing the loss scaling. Default 2.0.

    decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling. Default 0.5.

    custom_white_list(list[str]): Users' custom white list which always execution fp16.

    custom_black_list(list[str]): Users' custom black list which forbidden execution fp16.

    custom_black_varnames(list[str]): Users' custom black variables' names.

    use_pure_fp16(bool): Whether to use the pure fp16 training. Default False.

    use_pure_bf16(bool): Whether to use the pure bf16 training. Default False.

    use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
    Default True. Only takes effect when `use_pure_fp16` is turned on.

Examples:
    .. code-block:: python
        :name: example_1

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.amp = True
        >>> strategy.amp_configs = {
        ...     "init_loss_scaling": 32768,
        ...     "custom_white_list": ['conv2d']
        ... }

    .. code-block:: python
        :name: example_2

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.amp = True
        >>> # pure fp16
        >>> strategy.amp_configs = {
        ...     "init_loss_scaling": 32768,
        ...     "use_pure_fp16": True
        ... }

)r   r   amp_configsr   s    r!   r  DistributedStrategy.amp_configs+  s    t DMM5566r    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )Nr  )r   r   r  r   r@  s     r!   r  r  g  .     	$--33WmLT]]66@r    c                .    U R                   R                  $ )a  

Indicating whether we are using automatic sparsity training
Default Value: False

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.asp = True # by default this is false

)r   aspr   s    r!   r
  DistributedStrategy.aspm  r   r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz"asp should have value of bool type)r8  r   r   r
  r
   r  r;  s     r!   r
  r  ~  r  r    c                .    U R                   R                  $ )a  
Indicating whether we are using quantization aware training
Default Value: False

Examples:

    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.qat = True # by default this is false

r   qatr   s    r!   r  DistributedStrategy.qat  r   r    c                \    [        U[        5      (       d   S5       eXR                  l        g Nz"qat should have value of bool type)r8  r   r   r  r;  s     r!   r  r    s'     $%%K'KK% r    c                @    [        U R                  R                  5      $ )a  
Set quantization training configurations. In general, qat has several configurable
settings that can be configured through a dict.
**Notes**:
    channel_wise_abs_max(bool): Whether to use `per_channel` quantization training. Default is True.
    weight_bits(int): quantization bit number for weight. Default is 8.
    activation_bits(int): quantization bit number for activation. Default is 8.
    not_quant_pattern(list[str]): When the skip pattern is detected in an op's name scope,
        the corresponding op will not be quantized.
    algo(str): Other quantization training algorithm.

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.qat = True
        >>> strategy.qat_configs = {
        ...     "channel_wise_abs_max": True,
        ...     "weight_bits": 8,
        ...     "activation_bits": 8,
        ...     "not_quant_pattern": ['skip_quant']
        ... }

r   r   qat_configsr   s    r!   r  DistributedStrategy.qat_configs  s    6 DMM5566r    c                    [        U R                  R                  US5        [        U R                  R                  U5        g Nr  r   r   r  r   r@  s     r!   r  r    ,    $--33WmLT]]66@r    c                .    U R                   R                  $ )a  
Indicating whether we are using forward recomputation for memory optimization
Default value: False

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.recompute = True
        >>> # suppose x and y are names of checkpoint tensors for recomputation
        >>> strategy.recompute_configs = {"checkpoints": ["x", "y"]}

)r   	recomputer   s    r!   r  DistributedStrategy.recompute  s      }}&&&r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz(recompute should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r    s(     dD!!&*MM#NNEFr    c                .    U R                   R                  $ )ae  

Indicating whether we are using synchronized all reduce in each communication thread
We note that system overhead is usually lower when sync_nccl_allreduce = True

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.sync_nccl_allreduce = True

)r   r  r   s    r!   r  'DistributedStrategy.sync_nccl_allreduce       }}000r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz2sync_nccl_allreduce should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r     (     dD!!04MM-NNOPr    c                .    U R                   R                  $ )a  

Indicating whether we are using hierarchical allreduce in collective communication
Hierarchical allreduce often does allreduce within a certain node group and then do
allreduce among the leaders of each group

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.use_hierarchical_allreduce = True

)r   use_hierarchical_allreducer   s    r!   r%  .DistributedStrategy.use_hierarchical_allreduce  s      }}777r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz9use_hierarchical_allreduce should have value of bool type)r8  r   r   r%  r
   r  r;  s     r!   r%  r&    *     dD!!7;MM4NNKr    c                .    U R                   R                  $ )aP  

Number of ranks for low level node groups in hierarchical allreduce
Default value: number of GPU cards on each single GPU machine

Example:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.hierarchical_allreduce_inter_nranks = 8

)r   #hierarchical_allreduce_inter_nranksr   s    r!   r*  7DistributedStrategy.hierarchical_allreduce_inter_nranks  s     }}@@@r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )NzAhierarchical_allreduce_inter_nranks should have value of int type)r8  r   r   r*  r
   r  r   r  s     r!   r*  r+    s*     eS!!@EMM=NNSr    c                .    U R                   R                  $ )aD  

Indicating whether we are using sync_batch_norm to do synchronous batch normalization among all training nodes.

Default value: False

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.sync_batch_norm = True

)r   sync_batch_normr   s    r!   r/  #DistributedStrategy.sync_batch_norm)  s    " }},,,r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz.sync_batch_norm should have value of bool type)r8  r   r   r/  r
   r  r;  s     r!   r/  r0  <  (     dD!!,0MM)NNKLr    c                .    U R                   R                  $ )aA  

Indicating whether we are using fuse_all_reduce_ops for gradient fusion during backward phase of training
Default value: True

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.fuse_all_reduce_ops = False

)r   fuse_all_reduce_opsr   s    r!   r4  'DistributedStrategy.fuse_all_reduce_opsD  r!  r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz2fuse_all_reduce_ops should have value of bool type)r8  r   r   r4  r
   r  r;  s     r!   r4  r5  U  r#  r    c                .    U R                   R                  $ )a
  

Specifying the size of gradient to fuse in Mega-Bytes

Default value: 32

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.fuse_grad_size_in_MB = 50

)r   fuse_grad_size_in_MBr   s    r!   r8  (DistributedStrategy.fuse_grad_size_in_MB]  s      }}111r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz2fuse_grad_size_in_MB should have value of int type)r8  r   r   r8  r
   r  r-  s     r!   r8  r9  o  s(     eS!!16MM.NNOPr    c                .    U R                   R                  $ )ay  

Specifying the size of gradient to fuse in Mega-Bytes when
the last group of each batch communicates. Making the last group
small is useful to improve performance.

Default value: 1

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.last_comm_group_size_MB = 2

)r   last_comm_group_size_MBr   s    r!   r<  +DistributedStrategy.last_comm_group_size_MBw  s    $ }}444r    c                F    US:  a  XR                   l        g [        S5      e)Nr   z0last_comm_group_size_MB should be greater than 0)r   r<  r:  r-  s     r!   r<  r=    s!     1949MM1OPPr    c                .    U R                   R                  $ )a=  

Indicating whether we are using find_unused_parameters to
find unused parameters in DataParallel.

Default value: False

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.find_unused_parameters = True

)r   find_unused_parametersr   s    r!   r@  *DistributedStrategy.find_unused_parameters  s    $ }}333r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz5find_unused_parameters should have value of bool type)r8  r   r   r@  r
   r  r;  s     r!   r@  rA    s*     dD!!37MM0NNGr    c                .    U R                   R                  $ r   )r   fuse_grad_size_in_TFLOPSr   s    r!   _fuse_grad_size_in_TFLOPS-DistributedStrategy._fuse_grad_size_in_TFLOPS  s    }}555r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz8fuse_grad_size_in_TFLOPS should have value of float type)r8  r6   r   rD  r
   r  r-  s     r!   rE  rF    s*     eU##5:MM2NNJr    c                .    U R                   R                  $ )z

Specifying the number of NCCL communicator

Default value: 1

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.nccl_comm_num = 2

)r   nccl_comm_numr   s    r!   rI  !DistributedStrategy.nccl_comm_num  s    " }}***r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz+nccl_comm_num should have value of int type)r8  r   r   rI  r
   r  r-  s     r!   rI  rJ    s(     eS!!*/MM'NNHIr    c                @    [        U R                  R                  5      $ )a?  

Set recompute configurations.

**Note**:
checkpoints(list[str]): list of string name of checkpoints. In general, the recompute
strategy of current implementation should have some manually assign checkpoints.

enable_offload(bool): enable recompute checkpoints offload feature. this feature
will offload the checkpoint to host memory to allow even larger batch size. since
the memcpy from host to device takes time, it is a trade off between larger batch
size and training speed.

checkpoint_shape(list[int]): list of int that specific the shape of checkpoint. so far
recompute-offload requires that all checkpoint to be same shape, and every dimension
specific here should be determined ("-1" is not allowed).

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.recompute = True
        >>> strategy.recompute_configs = {
        ...     "checkpoints": ["x", "y"],
        ...     "enable_offload": True,
        ...     "checkpoint_shape": [100, 512, 1024]
        ... }

)r   r   recompute_configsr   s    r!   rM  %DistributedStrategy.recompute_configs  s    @ DMM;;<<r    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )Ncheckpoint_configs)r   r   rM  r   r@  s     r!   rM  rN    s4     	MM++W6J	
 	T]]<<gFr    c                .    U R                   R                  $ )a  

Indicating whether we are using sharding Optimizer for memory
optimization. We implement the sharding optimizer following the ZeRO-DP
idea from [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054).
Model parameters and Optimizer State are sharded into different ranks allowing to fit larger model.

In Hybrid parallelism scenario, we use sharding config as uniform API to set each parallelism.

Default value: False

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.sharding = True

)r   r   r   s    r!   r   DistributedStrategy.sharding  s    * }}%%%r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz'sharding should have value of bool type)r8  r   r   r   r
   r  r;  s     r!   r   rR    (     dD!!%)MM"NNDEr    c                @    [        U R                  R                  5      $ )aQ  

Set sharding configurations.

**Note**:
    sharding_segment_strategy(string, optional): strategy used to segment the program(forward & backward operations). two strategise are
    available: "segment_broadcast_MB" and "segment_anchors". segment is a concept used in sharding to overlap computation and
    communication. Default is segment_broadcast_MB.

    segment_broadcast_MB(float, optional): segment by the parameters broadcast volume. sharding will introduce parameter broadcast operations into program, and
    after every segment_broadcast_MB size parameter being broadcasted, the program will be cut into one segment.
    This configuration will affect the communication speed in sharding training, and should be an empirical value decided by your model size and network topology.
    Only enable when sharding_segment_strategy = segment_broadcast_MB. Default is 32.0 .

    segment_anchors(list): list of anchors used to segment the program, which allows a finer control of program segmentation.
    this strategy is experimental by now. Only enable when sharding_segment_strategy = segment_anchors.

    sharding_degree(int, optional): specific the number of gpus within each sharding parallelism group; and sharding will be turn off if sharding_degree=1.  Default is 8.

    gradient_merge_acc_step(int, optional): specific the accumulation steps in gradient merge; and gradient merge will be turn off if gradient_merge_acc_step=1.  Default is 1.

    optimize_offload(bool, optional): enable the optimizer offload which will offload the moment vars to Host memory in order to saving GPU memory for fitting larger model.
    the moment var will be prefetch from and offloaded to Host memory during update stage. it is a strategy that trades off between training speed and GPU memory, and is recommended to be turn on only when gradient_merge_acc_step large, where
    the number of time of update stage will be relatively small compared with forward&backward's.  Default is False.

    dp_degree(int, optional): specific the number of data parallelism group; when dp_degree >= 2, it will introduce dp_degree ways data parallelism as the outer parallelism for the inner parallelism. User is responsible to ensure global_world_size = mp_degree * sharding_degree * pp_degree * dp_degree. Default is 1.

    mp_degree(int, optional): [Hybrid parallelism ONLY] specific the number of gpus within each megatron parallelism group; and megatron parallelism will turn be off if mp_degree=1.  Default is 1.

    pp_degree(int, optional): [Hybrid parallelism ONLY] specific the number of gpus within each pipeline parallelism group; and pipeline parallelism will turn be off if pp_degree=1.  Default is 1.

    pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from backward stage to update(optimize) stage when pipeline parallelism is on.
    This configuration will affect the communication speed of Hybrid parallelism training depended on network topology. this strategy is experimental by now..  Default is False.

    optimize_cast(bool, optional): [Hybrid parallelism ONLY] Move the cast op of AMP which cast fp32 param to fp16 param to optimizer. optimize_cast will persist fp16 param, it
    will take more memory, but will be faster, trade space for time. Recommend to turn on only when using pipeline or gradient_merge_acc_step large.


Examples:
    .. code-block:: python

        >>> # sharding-DP, 2 nodes with 8 gpus per node
        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.sharding = True
        >>> strategy.sharding_configs = {
        ...     "sharding_segment_strategy": "segment_broadcast_MB",
        ...     "segment_broadcast_MB": 32,
        ...     "sharding_degree": 8,
        ...     "dp_degree": 2,
        ...     "gradient_merge_acc_step": 4,
        ... }

)r   r   sharding_configsr   s    r!   rV  $DistributedStrategy.sharding_configs#  s    p DMM::;;r    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )NrV  )r   r   rV  r   r@  s     r!   rV  rW  ]  4     	MM**G5G	
 	T]];;WEr    c                .    U R                   R                  $ )a  

Run program using Executor other than ParallelExecutor.

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.without_graph_optimization = True

)r   without_graph_optimizationr   s    r!   r[  .DistributedStrategy.without_graph_optimizatione  s     }}777r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz9without_graph_optimization should have value of bool type)r8  r   r   r[  r
   r  r;  s     r!   r[  r\  u  r(  r    c                .    U R                   R                  $ )al  

This based on raw_program_optimizer program
Set whether use same stream for calc and comm when fuse allreduce
The default value for the calc_comm_same_stream is False

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy._calc_comm_same_stream = True

)r   calc_comm_same_streamr   s    r!   _calc_comm_same_stream*DistributedStrategy._calc_comm_same_stream  s      }}222r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz7calc_comm_same_stream should have value of boolean type)r8  r   r   r_  r
   r  )r   sames     r!   r`  ra    s*     dD!!26MM/NNIr    c                .    U R                   R                  $ )ag  

Set whether fuse the grad for gradient merge.
Note: this flag will only effect the gradient merge under pipeline mode
The default value for the fuse_grad_merge is False

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.fuse_grad_merge = True

)r   fuse_grad_merger   s    r!   re  #DistributedStrategy.fuse_grad_merge  s      }},,,r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz1fuse_grad_merge should have value of boolean type)r8  r   r   re  r
   r  )r   re  s     r!   re  rf    s(     ot,,,;MM)NNNOr    c                .    U R                   R                  $ )a  

This based on raw_program_optimizer program and allreduce the num of the fused op

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet

        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.fuse_grad_size_in_num = 2

)r   fuse_grad_size_in_numr   s    r!   ri  )DistributedStrategy.fuse_grad_size_in_num  s     }}222r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz5fuse_grad_size_in_num should have value of int32 type)r8  r   r   ri  r
   r  )r   r[  s     r!   ri  rj    s*     c325MM/NNGr    c                .    U R                   R                  $ )a  

Indicating whether we are using pipeline parallelism for distributed training.
Current implementation mainly focus on single GPU machine pipeline parallelism and
data parallelism across GPU machine. The pipeline information is indicated through
device_guard information in user-defined program.

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.pipeline = True

)r   pipeliner   s    r!   rm  DistributedStrategy.pipeline  s    " }}%%%r    c                .    U R                   R                  $ r   )r   is_fl_ps_moder   s    r!   rp  !DistributedStrategy.is_fl_ps_mode  s    }}***r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz,is_fl_ps_mode should have value of bool type)r8  r   r   rp  r
   r  r;  s     r!   rp  rq    s(     dD!!*.MM'NNIJr    c                .    U R                   R                  $ r   )r   with_coordinatorr   s    r!   is_with_coordinator'DistributedStrategy.is_with_coordinator  s    }}---r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz/with_coordinator should have value of bool type)r8  r   r   rt  r
   r  r;  s     r!   ru  rv    s(     dD!!-1MM*NNLMr    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz'pipeline should have value of bool type)r8  r   r   rm  r
   r  r;  s     r!   rm  rn    rT  r    c                @    [        U R                  R                  5      $ )a  

Set pipeline parallelism configurations. In pipeline parallelism,
different parts of neural networks are running on different GPUS.
There are Tensor queue buffer between each pair of neighborhood GPUS
that are responsible for synchronizing hidden Tensor results between
GPUs. Pipeline parallelism consists of several producer-consumer style
hardware pairs, such as GPU-GPU, CPU-GPU, GPU-XPU. The best way to speedup
pipeline parallelism is to make the size of Tensor in Tensor queue smaller,
so that we will have a faster producer for downstream consumers.

**Notes**:
    **Detailed arguments for pipeline_configs**

    **micro_batch_size**: the number of small batches in each user defined batch

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.pipeline = True
        >>> strategy.pipeline_configs = {"micro_batch_size": 12}

)r   r   pipeline_configsr   s    r!   rz  $DistributedStrategy.pipeline_configs  s    8 DMM::;;r    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )Nrz  )r   r   rz  r   r@  s     r!   rz  r{  !  rY  r    c                .    U R                   R                  $ )a  

Indicating whether we are using tensor parallel for distributed training.

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.tensor_parallel = True

)r   tensor_parallelr   s    r!   r~  #DistributedStrategy.tensor_parallel)  s     }},,,r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz.tensor_parallel should have value of bool type)r8  r   r   r~  r
   r  r;  s     r!   r~  r  9  r2  r    c                @    [        U R                  R                  5      $ )a7  

Set tensor_parallel configurations.

**Notes**:
    **Detailed arguments for tensor_parallel_configs**

    **tensor_parallel_degree**: degree of tensor parallel

    **tensor_init_seed**: parameter initialization random seed


Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.tensor_parallel = True
        >>> strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4,
        ...                                     "tensor_init_seed": 123}

)r   r   tensor_parallel_configsr   s    r!   r  +DistributedStrategy.tensor_parallel_configsA  s    0 DMMAABBr    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )Nr  )r   r   r  r   r@  s     r!   r  r  [  s5     	MM11%	

 	T]]BBGLr    c                @    [        U R                  R                  5      $ )a  

Dynamic graph hybrid parallel strategy configuration. Five-way hybrid parallelism
needs to meet the following relationships

total_number_GPUs = dp_degree * mp_degree * pp_degree * sharding_degree * sep_degree

**Note**:
    **dp_degree(int)**: set number of GPUs in a data parallel group. Default -1.
                            This value should be an integer greater than 0.
                            If it is not set, or set to -1, its value will be inferred
                            based on the total number of cards.

    **mp_degree(int)**: set number of GPUs in a model parallel group. Default 1

    **pp_degree(int)**: set number of GPUs in a pipeline parallel group. Default 1
    **sep_degree(int)**: set number of GPUs in a sep parallel group. Default 1
    **cp_degree(int)**: set number of GPUs in a context parallel group. Default 1
    **sharding_degree(int)**: set number of GPUs in a sharding parallel group. Default 1
    **order(list(string))**: set hybrid parallel dimensions, the order is from outside to inside. Default ['dp','pp','sharding','sep', 'mp']

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.hybrid_configs = {
        ...     "dp_degree": 1,
        ...     "mp_degree": 2,
        ...     "pp_degree": 1,
        ...     "order":['dp','pp','sharding', 'sep', 'mp']
        ... }

)r   r   hybrid_configsr   s    r!   r  "DistributedStrategy.hybrid_configse  s    H DMM8899r    c                X   [         R                  " U5      nSU;   a  US   U l        UR                  S5        [	        U R
                  R                  US5        SU;   ah  SUS   ;   a!  US   S   U l        US   R                  S5        [        U R
                  R                  R                  US   5        UR                  S5        SU;   a>  [        U R
                  R                  R                  US   5        UR                  S5        [        U R
                  R                  U5        g )Nrm   r  
mp_configsr
  
pp_configs)copydeepcopyr	  popr   r   r  r
  r   r  r  )r   rA  hybrid_configs      r!   r  r    s   g.m#)6w)?D&g&MM((-9I	
 7" GL$99'.|'<=N'O$%))*;< ,,779N KK%7" ,,779N KK%T]]997Cr    c                .    U R                   R                  $ )a  

Indicating whether we are using Local SGD training. Default Value: False
For more details, please refer to
`Don't Use Large Mini-Batches, Use Local SGD <https://arxiv.org/pdf/1808.07217.pdf>`_.

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.localsgd = True # by default this is false

)r   localsgdr   s    r!   r  DistributedStrategy.localsgd  s      }}%%%r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz'localsgd should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r    rT  r    c                @    [        U R                  R                  5      $ )aK  

Set LocalSGD training configurations. LocalSGD has a configurable
setting that can be configured through a dict.

**Notes**:
    k_steps(int) The local steps for training before parameter synchronization. Default 1.
    begin_step(int) The step of beginning training by localsgd. Default 1.

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.localsgd = True
        >>> strategy.localsgd_configs = {"k_steps": 4,
        ...                             "begin_step": 30}

)r   r   localsgd_configsr   s    r!   r  $DistributedStrategy.localsgd_configs  s    , DMM::;;r    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )Nr  )r   r   r  r   r@  s     r!   r  r    rY  r    c                .    U R                   R                  $ )a  

Indicating whether we are using Adaptive Local SGD training. Default Value: False
For more details, please refer to `Adaptive Communication Strategies to Achieve
the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.adaptive_localsgd = True # by default this is false

)r   adaptive_localsgdr   s    r!   r  %DistributedStrategy.adaptive_localsgd  s      }}...r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz0adaptive_localsgd should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r    s(     dD!!.2MM+NNMNr    c                @    [        U R                  R                  5      $ )a  

Set AdaptiveLocalSGD training configurations. AdaptiveLocalSGD has a configurable
setting that can be configured through a dict.

**Notes**:
    init_k_steps(int) The initial steps for training before adaptive localsgd.
                      Then, the adaptive localsgd method will modify init_k_steps automatically.
                      Default 1.

    begin_step(int) The step of beginning training by adaptive localsgd. Default 1.

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.adaptive_localsgd = True
        >>> strategy.adaptive_localsgd_configs = {"init_k_steps": 1,
        ...                                       "begin_step": 30}

)r   r   adaptive_localsgd_configsr   s    r!   r  -DistributedStrategy.adaptive_localsgd_configs  s    2 DMMCCDDr    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )Nr  )r   r   r  r   r@  s     r!   r  r    s5    
 	MM33'	

 	T]]DDgNr    c                .    U R                   R                  $ )a  

Indicating whether we are using Deep Gradient Compression training. For more details, please refer to
[Deep Gradient Compression](https://arxiv.org/abs/1712.01887).

Default Value: False

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.dgc = True # by default this is false

)r   dgcr   s    r!   r  DistributedStrategy.dgc"  s    " }}   r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz"dgc should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r  5  r  r    c                @    [        U R                  R                  5      $ )ae  

Set Deep Gradient Compression training configurations. In general, dgc has several configurable
settings that can be configured through a dict.

**Notes**:
    rampup_begin_step(int): The beginning step from which gradient compression is implemented. Default 0.

    rampup_step(int): Time steps used in sparsity warm-up periods. Default is 1. \
            For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, \
            it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. And when reach sparsity array \
            ends, it will use 0.999 then and after.

    sparsity(list[float]): Get top important element from gradient tensor, the ratio is (1 - sparsity). \
            Default is [0.999]. For example, if the sparsity is [0.99, 0.999], the top [1%, 0.1%] important \
            element will be transmitted.

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.dgc = True
        >>> strategy.dgc_configs = {"rampup_begin_step": 1252}

)r   r   dgc_configsr   s    r!   r  DistributedStrategy.dgc_configs=  s    8 DMM5566r    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )Nr  )r   r   r  r   r@  s     r!   r  r  [  r  r    c                .    U R                   R                  $ )a/  

Indicating whether we are using fp16 gradient allreduce training
Default Value: False

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet

        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.fp16_allreduce = True # by default this is false

)r   fp16_allreducer   s    r!   r  "DistributedStrategy.fp16_allreducea  s      }}+++r    c                d    [        U[        5      (       d  [        S5      eXR                  l        g )Nz)fp16_allreduce must be value of bool type)r8  r   r  r   r  r;  s     r!   r  r  s  s'     $%%GHH'+$r    c                .    U R                   R                  $ )a  

Gradient Merge, also called as Gradient Accumulation,
is a strategy for large batch training. With this strategy,
model parameter will not be updated until user-defined steps.
For each step, the forward network and the backward network
will run to calculate the gradient of model parameters.
For every k step, the optimization network will run,
applying a specific optimization method (such as SGD, Adam)
to model parameters.

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.gradient_merge = True
        >>> strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}

)r   gradient_merger   s    r!   r  "DistributedStrategy.gradient_mergez  s    , }}+++r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz-gradient_merge should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r    s(     dD!!+/MM(NNJKr    c                @    [        U R                  R                  5      $ )a  

the key-value configs of distribute_strategy

**Note**:
    k_steps(int): the update period of the parameters.

    avg(bool): whether to average the gradients of each mini-batch, the default value is `True`

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.gradient_merge = True
        >>> strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}

)r   r   gradient_merge_configsr   s    r!   r  *DistributedStrategy.gradient_merge_configs  s    ( DMM@@AAr    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )Ngradient_configs)r   r   r  r   r@  s     r!   r  r    s4     	MM00';M	
 	T]]AA7Kr    c                .    U R                   R                  $ )a  

Set lars configurations. lars is used to deal with the convergence problems when the global
batch size is larger than 8k.  For more details, please refer to
[Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888).

Default Value: False

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.lars = True # by default this is false

)r   larsr   s    r!   r  DistributedStrategy.lars  s    $ }}!!!r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz#lars should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r    (     dD!!!%MMNN@Ar    c                @    [        U R                  R                  5      $ )aS  

Set Lars training configurations.

**Notes**:
**lars_coeff (float)**: trust ratio in lars formula.
**lars_weight_decay** (float): weight decay coefficient in lars formula.
**epsilon (float)**: argument is used to avoid potential division-by-zero
when compute the local lr;
**exclude_from_weight_decay (list[str])**: is a list of name strings of layers which
will be exclude from weight decay in lars formula.

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.lars = True
        >>> strategy.lars_configs = {
        ...             "lars_coeff": 0.01,
        ...             "lars_weight_decay": 0.0005,
        ...             "epsilon": 0,
        ...             "exclude_from_weight_decay": ['batch_norm', '.b_0']
        ... }

)r   r   lars_configsr   s    r!   r   DistributedStrategy.lars_configs  s    8 DMM6677r    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )Nr  )r   r   r  r   r@  s     r!   r  r    .     	$--44g~NT]]77Ar    c                .    U R                   R                  $ )a  

Set lamb configurations. lamb is used to deal with the convergence problems for large
batch size training, specially for attention-related model like BERT. For more details,
please refer to
[Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962).

Default Value: False

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.lamb = True # by default this is false

)r   lambr   s    r!   r  DistributedStrategy.lamb  s    ( }}!!!r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz#lamb should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r  	  r  r    c                @    [        U R                  R                  5      $ )aJ  

Set Lars training configurations.

**Notes**:
**lamb_weight_decay** (float): weight decay coefficient in lamb formula.
**exclude_from_weight_decay (list[str])**: is a list of name strings of layers which
will be exclude from weight decay in lamb formula.

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.lamb = True
        >>> strategy.lamb_configs = {
        ...         'lamb_weight_decay': 0.01,
        ...         'exclude_from_weight_decay': [],
        ... }

)r   r   lamb_configsr   s    r!   r   DistributedStrategy.lamb_configs	  s    . DMM6677r    c                    [        U R                  R                  US5        [        U R                  R                  U5        g )Nr  )r   r   r  r   r@  s     r!   r  r  /	  r  r    c                .    U R                   R                  $ )z

Indicating whether we want to do current distributed training on clusters with elastic resources.
Currently, this is configuration is not valid.

)r   elasticr   s    r!   r  DistributedStrategy.elastic5	  s     }}$$$r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz&elastic should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r  ?	  s(     dD!!$(MM!NNCDr    c                .    U R                   R                  $ )a  

Indicating whether we are using auto-parallel configuration
This feature is currently an experimental feature. Currently,
auto-parallelism can be used only when a user does not set any other
strategy configs except auto. For details, please reference the following
code example
Default Value: False

Examples:
    .. code-block:: python

        >>> import paddle
        >>> paddle.enable_static()
        >>> import paddle.distributed.fleet as fleet

        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.auto = True
        >>> # if set other strategy at the same time, auto will not apply
        >>> # strategy.amp = True

        >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01)
        >>> optimizer = fleet.distributed_optimizer(optimizer, strategy)

)r   autor   s    r!   r  DistributedStrategy.autoG	  s    6 }}!!!r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz#auto should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r  d	  s&    dD!!!%MMNN@Ar    c                .    U R                   R                  $ )a  

Indicating whether we are using semi-auto parallel function
This feature is currently an experimental feature. Currently,
auto-parallelism can be used only when a user does not set any other
strategy configs except semi-auto. For details, please reference the following
code example
Default Value: False

Examples:
    .. code-block:: python

        >>> import paddle
        >>> paddle.enable_static()
        >>> import paddle.distributed.fleet as fleet

        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.semi_auto = True
        >>> # if set other strategy at the same time, auto will not apply
        >>> # strategy.amp = True

        >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01)
        >>> optimizer = fleet.distributed_optimizer(optimizer, strategy)

)r   	semi_autor   s    r!   r  DistributedStrategy.semi_autok	  s    6 }}&&&r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz(semi-auto should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r  	  s&    dD!!&*MM#NNEFr    c                .    U R                   R                  $ )a  

Indicating whether we are using auto-search parallel function
For details, please reference the following code example
Default Value: False

Examples:
    .. code-block:: python

        >>> import paddle

        >>> paddle.enable_static()
        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.auto_search = True

)r   auto_searchr   s    r!   r  DistributedStrategy.auto_search	  s    & }}(((r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz*auto-search should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r  	  s&    dD!!(,MM%NNGHr    c                .    U R                   R                  $ )aM  

Indicating whether we split the data. If True, we split the data.
Default Value: True

Examples:
    .. code-block:: python

        >>> import paddle

        >>> paddle.enable_static()
        >>> import paddle.distributed.fleet as fleet
        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.split_data = True

)r   
split_datar   s    r!   r  DistributedStrategy.split_data	  s    $ }}'''r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz)split_data should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r  	  s&    dD!!'+MM$NNFGr    c                .    U R                   R                  $ )zN

Indicating whether we are using quantization training
Default Value: False

r  r   s    r!   r  r  	  s     }}   r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g r  )r8  r   r   r  r
   r  r;  s     r!   r  r  	  s&    dD!! $MMNN?@r    c                @    [        U R                  R                  5      $ )a  

Set quantization training configurations. In general, qat has several configurable
settings that can be configured through a dict.

**Notes**:
    channel_wise_abs_max(bool): Whether to use `per_channel` quantization training. Default is True.

    weight_bits(int): quantization bit number for weight. Default is 8.

    activation_bits(int): quantization bit number for activation. Default is 8.

    not_quant_pattern(list[str]): When the skip pattern is detected in an op's name scope,
        the corresponding op will not be quantized.

    algo(str): Other quantization training algorithm.

Examples:
    .. code-block:: python

        >>> import paddle.distributed.fleet as fleet

        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.qat = True
        >>> strategy.qat_configs = {
        ...     "channel_wise_abs_max": True,
        ...     "weight_bits": 8,
        ...     "activation_bits": 8,
        ...     "not_quant_pattern": ['skip_quant']
        ... }

r  r   s    r!   r  r  	  s    D DMM5566r    c                    [        U R                  R                  US5        [        U R                  R                  U5        g r  r  r@  s     r!   r  r  	  r  r    c                .    U R                   R                  $ )aa  

Indicating whether we are using heter_ccl_mode for model training.
This feature is currently an experimental feature. Currently,
heter_ccl_mode can be used only for dataparallel with dygraph mode.
Default Value: False

Examples:
    .. code-block:: python

        >>> import paddle
        >>> import paddle.distributed.fleet as fleet

        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.heter_ccl_mode = True

        >>> # for initialize parallel env, only need to call
        >>> paddle.distributed.init_parallel_env()
        >>> # then the heterogeneous context will be created.

)r   heter_ccl_moder   s    r!   r  "DistributedStrategy.heter_ccl_mode 
  s    . }}+++r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz-heter_ccl_mode should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r  
  s&    dD!!+/MM(NNJKr    c                .    U R                   R                  $ )a  

Indicating whether to use exhaustive search method to choose convolution algorithms.
Exhaustive search attempts all cuDNN algorithms to choose the fastest algorithm.
This method is time-consuming, the chosen algorithm will be cached for the given layer specifications.
Once the layer specifications (like batch size, feature map size) are changed, it will search again.
Default Value: True

Examples:
    .. code-block:: python

        >>> import paddle
        >>> paddle.enable_static()
        >>> import paddle.distributed.fleet as fleet

        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.cudnn_exhaustive_search = False

        >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01)
        >>> optimizer = fleet.distributed_optimizer(optimizer, strategy)

)r   r  r   s    r!   r  +DistributedStrategy.cudnn_exhaustive_search 
  s    0 }}444r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz6cudnn_exhaustive_search should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r  :
  s*     dD!!48MM1NNHr    c                .    U R                   R                  $ )a  

The workspace limit size in MB unit for choosing cuDNN convolution algorithms.
The inner function of cuDNN obtain the fastest suited algorithm that fits within this memory limit.
Usually, large workspace size may lead to choose faster algorithms,
but significant increasing memory workspace. Users need to trade-off between memory and speed.
Default Value: 4000

Examples:
    .. code-block:: python

        >>> import paddle
        >>> paddle.enable_static()
        >>> import paddle.distributed.fleet as fleet

        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.conv_workspace_size_limit = 1024

        >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01)
        >>> optimizer = fleet.distributed_optimizer(optimizer, strategy)

)r   r  r   s    r!   r  -DistributedStrategy.conv_workspace_size_limitD
  s    0 }}666r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )Nz7conv_workspace_size_limit should have value of int type)r8  r   r   r  r
   r  r-  s     r!   r  r  ^
  s*     eS!!6;MM3NNIr    c                .    U R                   R                  $ )a(  

Indicates whether to use the mode CUDNN_BATCHNORM_SPATIAL_PERSISTENT function in batchnorm.
This is only useful in cudnn.
Default Value: True

Examples:
    .. code-block:: python

        >>> import paddle
        >>> paddle.enable_static()
        >>> import paddle.distributed.fleet as fleet

        >>> strategy = fleet.DistributedStrategy()
        >>> strategy.cudnn_batchnorm_spatial_persistent = True

        >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01)
        >>> optimizer = fleet.distributed_optimizer(optimizer, strategy)

)r   r  r   s    r!   r  6DistributedStrategy.cudnn_batchnorm_spatial_persistenth
  s    , }}???r    c                |    [        U[        5      (       a  XR                  l        g [        R
                  " S5        g )NzAcudnn_batchnorm_spatial_persistent should have value of bool type)r8  r   r   r  r
   r  r;  s     r!   r  r  
  s*     dD!!?CMM<NNSr    c                   U R                   n/ SQn[        UR                  5      [        UR                  5      [        UR
                  5      [        UR                  5      [        UR                  5      [        UR                  5      /n[        U5       H4  u  pE[        5       R                  U5      (       d  M%  X4   [        5       U'   M6     g )N)r   r   r   r    FLAGS_fuse_parameter_memory_size FLAGS_fuse_parameter_groups_size)r   r   r  r   r  r  r  r8  rD  	enumerater   r  )r   r   r   valuesr\  r   s         r!   _enable_envDistributedStrategy._enable_env
  s    ==
 <<=223112--.--.112
  oFA((--'-y$ &r    c                R    U R                   R                  (       a  [        (       a  gg)NTF)r   r  r   r   s    r!   _is_strict_auto#DistributedStrategy._is_strict_auto
  s    =="6"6r    c                	   SnSnSnX#-   U-   nSSU S3-   nSSR                  USU-  U5      -   nSS	R                  S
/U-  5      -   S-   nSS	R                  S/U-  5      -   S-   nUS-   n	XR                  S	5      -  n	XR                  S5      -  n	XR                  S	5      -  n	U R                  R                  R                  n
S	nUS-   nU
 GH  nSUR
                  ;   a  M  SUR
                  ;   a  M(  [        [        U R                  UR
                  5      [        5      (       GaJ  [        U R                  UR
                  S-   5      (       Ga  [        U R                  UR
                  5      (       Ga  XS-   -  n	XR                  UR
                   SUR
                   S35      -  n	XS-   -  n	[        U R                  UR
                  S-   5      nUR                  R                  n[        R                  R                  nUS:  a%  [        R                  R                  R                  nOSSKJn  UR                  nU H  n[        [        UUR
                  5      U5      (       ay  [        UUR
                  5      n[#        U5       HR  u  nnUS:X  a)  XR                  UR
                  [%        U5      5      -  n	M5  XR                  S	[%        U5      5      -  n	MT     M  XR                  UR
                  [%        [        UUR
                  5      5      5      -  n	M     GM[  GM^  XR                  UR
                  [%        [        U R                  UR
                  5      5      5      -  nGM  XR                  UR
                  [%        [        U R                  UR
                  5      5      5      -  nGM     U	U-   S-   UR                  S5      -   nUU-  nUS-   nUUR                  S5      -  nUUS-   -  nU R                  R&                  R                  R                  n
U
 HS  nUUR                  UR
                  [%        [        U R                  R&                  UR
                  5      5      5      -  nMU     UUS-   -  nUU-  nU$ )Nr   &   z    z|{:^zs}|
z|{{:>{}s}}{}{{:^{}s}}|
rU  z    +r  =+-
zDistributedStrategy Overviewr*  _configsz
=True <-> z4.21.0r   )_messagez&Environment Flags, Communication FlagszBuild Strategy)formatjoinr   r   r   r   r8  r   r   r  r   r   __version___upbr  RepeatedScalarContainergoogle.protobuf.pyextr  r&   r*  )r   spacingmax_kmax_vlength	h1_format	h2_formatborderlinedrawsr   str_res	env_drawsr   
my_configsconfig_fieldsprotobuf_versionr  r  ffr  r\  r   
result_resbuild_strategy_strs                            r!   __repr__DistributedStrategy.__repr__
  sO   (uVHF33	7>>3=%
 
	 277C56>22S8#0036!!"%%!!"@AA!!"%%))004K	A166)QVV#gdmmQVV<dCCt}}affz.ABB"4==!&&99!d]2E!%5%5#$66(*QVVHH E& E "D[0E)0 $qvv
/B*J -7,A,A,H,HM/5/J/J,/8;$*KK$8$8$P$P !8 !K %-$D$D !8 '4#-$+J$@$;$" $" .5Z-IF09&0A1+,6,15E5E02Q6. -.E -25E5E02CF6. -.E 1B %*-=-=(*(+GJ,H(I.& %&E! '4+ :T "%5%5FFCqvv(F$G& 	 !1!1GDMM166$B C" Im v  GHI 	 	i
#d]i../?@@dTk)--88??A)"2"2GDMM$@$@!&&IJ#   	ftm+((
r    )__lock_attrr9  r	  r   r
  N)ra  rb  )r   r&   r  r   ra  rb  )r  r&   ra  rb  )r#  r&   ra  rb  )ra  r   )r   r   ra  rb  )ra  r`  )r   r`  ra  rb  )ra  r   )r<  r   ra  rb  )ra  r   )rA  r   ra  rb  )ra  r$   )rA  r$   ra  rb  )ra  r-   )rA  r-   ra  rb  )rA  r`  ra  rb  )ra  r4   )rA  r4   ra  rb  )ra  rD   )rA  rD   ra  rb  )ra  r   )r  r   ra  rb  )ra  r6   )r  r6   ra  rb  )ra  rL   )rA  rL   ra  rb  )ra  rR   )rA  rR   ra  rb  )rc  r   ra  rb  )re  r   ra  rb  )r[  r   ra  rb  )ra  r`   )rA  r`   ra  rb  )ra  rd   )rA  rd   ra  rb  )ra  ri   )rA  ri   ra  rb  )ra  ro   )rA  ro   ra  rb  )ra  rt   )rA  rt   ra  rb  )ra  rx   )rA  rx   ra  rb  )ra  r~   )rA  r~   ra  rb  )ra  r   )rA  r   ra  rb  )ra  r   )rA  r   ra  rb  )ra  r&   )Mr   r   r   r   r  r   r  r  r$  propertyr*  setteris_strict_autor0  r4  r9  rC  rF  rK  rP  r  r  r  r
  r  r  r  r  r%  r*  r/  r4  r8  r<  r@  rE  rI  rM  r   rV  r[  r`  re  ri  rm  rp  ru  rz  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r    r!   r   r     s   K/8b-+&.    D    B B" ""K  #K $ $0 ]]   &: &:P D  D @ @> ( (,      J  !J - -: E  E 2 2   .  !.`   EI !EIN
 ! !  	ZZA  A 97 97v A  A ! !  	ZZA  A ! !  	ZZ!  ! 7 78 A A ' '" G  G 1 1  Q   Q 8 8"  &&  ' A A  )//  0 - -$ M  M 1 1  Q   Q 2 2"   Q  !Q 5 5& ##Q  $Q 4 4& ""  # 6 6 %%  & + +$ J  J = =B G  G & &, __F  F 7< 7<r F  F 8 8  &&  ' 3 3" ""  # - -" P  P 3 3  !!  " & &$ + + K  K . . N   N __F  F < <: F  F - - M  M C C2 ##M  $M #: #:J D D6 & &" __F  F < <. F  F / /" O  O E E4 %%O.O	O  &O ! !$ 	ZZA  A 7 7: A  A , ," ,  ,
 , ,. L  L B B* ""L  #L " "& 
[[B  B 8 8: B  B " "* 
[[B  B 8 80 B  B % % ^^E  E " "8 
[[B B ' '8 G G ) )( I I ( (& H H ! ! 	ZZA A !7 !7F A A , ,0 L L 5 52 ##  $ 7 72 %%  & @ @. (..  /1.fr    r   )2
__future__r   r  typingr   r   r   google.protobufr   google.protobuf.text_formatr(  paddle.base.frameworkr   paddle.base.wrapped_decoratorr   paddle.distributed.fleet.protor	   'paddle.distributed.fleet.utils.log_utilr
   paddle.staticr   r   r$   r-   r4   rD   rL   rR   r`   rd   ri   ro   rt   rx   r~   r   r   __all__r   r   r  r   r   r   r   r   r   r   r.  r   r   r    r!   <module>r$     sf    #  0 0  "  / 8 C :+&IU &"9E "% 9E Ye $9E $
)5 )5 	 	 )5 )5 Ye 
y -iu --iu -
     89",9.9.A"/ "/J  p' p'r    