
    ёi             	         S SK Jr  S SKJrJrJrJr  S SKrS SK	r	S SK	J
r
  S SKJr  S SKJr  SSKJr  SS	KJrJr  SS
KJrJrJrJrJr  SSKJr  SSKJr  SSKJrJ r J!r!J"r"J#r#  SSK$J%r%  \(       a  S SK&J'r'  S SK	J(r(Jr  S SK)J*r*J+r+J,r,  S SK-J.r.  / SQr/    S;                         S<S jjr0     S=                       S>S jjr1SS// SQSSSS/SSS4	                         S?S jjr2    S@               SAS jjr3        SB                       SCS jjr4 " S S\!5      r5\   SD                 SES  jj5       r6\   SD                 SFS! jj5       r6   SGS" jr6SHSIS# jjr7  SJ       SKS$ jjr8  SL             SMS% jjr9 " S& S'\!5      r:  SL             SMS( jjr; " S) S*\!5      r<    SN                 SOS+ jjr= " S, S-\!5      r> " S. S/\#5      r?     SP             SQS0 jjr@\        SR                           SSS1 jj5       rA\        SR                           STS2 jj5       rA\        SR                           SUS3 jj5       rA        SVS4 jrA\       SW                           SXS5 jj5       rB\       SW                           SYS6 jj5       rB\       SW                           SZS7 jj5       rB\       SW                           S[S8 jj5       rB\       SW                           S\S9 jj5       rB       S]S: jrBg)^    )annotations)TYPE_CHECKINGCallableLiteraloverloadN)_C_ops)_add_with_axis)convert_to_list   )core)
check_typecheck_variable_and_dtype)Variableconvert_np_dtype_to_dtype_in_dygraph_modein_dynamic_or_pir_modein_pir_mode)LayerHelper)_current_expected_place)BatchNorm2DConv2DLayerReLU
Sequential)Normal)Sequence)Tensornn)ParamAttrLikeSize2Size4)_PaddingSizeMode)	yolo_lossyolo_box	prior_box	box_coderdeform_conv2dDeformConv2Ddistribute_fpn_proposalsgenerate_proposals	read_filedecode_jpegroi_poolRoIPool
psroi_pool	PSRoIPool	roi_alignRoIAlignnms
matrix_nms      ?c                   [        5       (       a"  [        R                  " U UUUUUUUUU	U5      nU$ [        S0 [	        5       D6n[        U SSS/S5        [        USSS/S5        [        USSS5        [        US	[        [        4S5        [        US
[        [        4S5        [        US[        S5        [        US[        S5        [        U	S[        S5        UR                  U R                  S9nUR                  SS9nUR                  SS9nU UUS.nUb  UUS'   UUUUUU	US.nUR                  SUUUUS.US9  U$ )a  

This operator generates YOLOv3 loss based on given predict result and ground
truth boxes.

The output of previous network is in shape [N, C, H, W], while H and W
should be the same, H and W specify the grid size, each grid point predict
given number bounding boxes, this given number, which following will be represented as S,
is specified by the number of anchor clusters in each scale. In the second dimension(the channel
dimension), C should be equal to S * (class_num + 5), class_num is the object
category number of source dataset(such as 80 in coco dataset), so in the
second(channel) dimension, apart from 4 box location coordinates x, y, w, h,
also includes confidence score of the box and class one-hot key of each anchor box.

Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions
should be as follows:

$$
b_x = \\sigma(t_x) + c_x
$$
$$
b_y = \\sigma(t_y) + c_y
$$
$$
b_w = p_w e^{t_w}
$$
$$
b_h = p_h e^{t_h}
$$

In the equation above, :math:`c_x, c_y` is the left top corner of current grid
and :math:`p_w, p_h` is specified by anchors.

As for confidence score, it is the logistic regression value of IoU between
anchor boxes and ground truth boxes, the score of the anchor box which has
the max IoU should be 1, and if the anchor box has IoU bigger than ignore
thresh, the confidence score loss of this anchor box will be ignored.

Therefore, the YOLOv3 loss consists of three major parts: box location loss,
objectness loss and classification loss. The L1 loss is used for
box coordinates (w, h), sigmoid cross entropy loss is used for box
coordinates (x, y), objectness loss and classification loss.

Each ground truth box finds a best matching anchor box in all anchors.
Prediction of this anchor box will incur all three parts of losses, and
prediction of anchor boxes with no GT box matched will only incur objectness
loss.

In order to trade off box coordinate losses between big boxes and small
boxes, box coordinate losses will be multiplied by scale weight, which is
calculated as follows.

$$
weight_{box} = 2.0 - t_w * t_h
$$

Final loss will be represented as follows.

$$
loss = (loss_{xy} + loss_{wh}) * weight_{box} + loss_{conf} + loss_{class}
$$

While :attr:`use_label_smooth` is set to be :attr:`True`, the classification
target will be smoothed when calculating classification loss, target of
positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of
negative samples will be smoothed to :math:`1.0 / class\_num`.

While :attr:`gt_score` is given, which means the mixup score of ground truth
boxes, all losses incurred by a ground truth box will be multiplied by its
mixup score.

Args:
    x (Tensor): The input tensor of YOLOv3 loss operator, This is a 4-D
                  tensor with shape of [N, C, H, W]. H and W should be same,
                  and the second dimension(C) stores box locations, confidence
                  score and classification one-hot keys of each anchor box.
                  The data type is float32 or float64.
    gt_box (Tensor): ground truth boxes, should be in shape of [N, B, 4],
                      in the third dimension, x, y, w, h should be stored.
                      x,y is the center coordinate of boxes, w, h are the
                      width and height, x, y, w, h should be divided by
                      input image height to scale to [0, 1].
                      N is the batch number and B is the max box number in
                      an image.The data type is float32 or float64.
    gt_label (Tensor): class id of ground truth boxes, should be in shape
                        of [N, B].The data type is int32.
    anchors (list|tuple): The anchor width and height, it will be parsed
                          pair by pair.
    anchor_mask (list|tuple): The mask index of anchors used in current
                              YOLOv3 loss calculation.
    class_num (int): The number of classes.
    ignore_thresh (float): The ignore threshold to ignore confidence loss.
    downsample_ratio (int): The downsample ratio from network input to YOLOv3
                            loss input, so 32, 16, 8 should be set for the
                            first, second, and third YOLOv3 loss operators.
    gt_score (Tensor|None, optional): mixup score of ground truth boxes, should be in shape
                        of [N, B]. Default None.
    use_label_smooth (bool, optional): Whether to use label smooth. Default True.
    name (str|None, optional): The default value is None. Normally there is no need
                   for user to set this property. For more information,
                   please refer to :ref:`api_guide_Name`
    scale_x_y (float, optional): Scale the center point of decoded bounding box.
                       Default 1.0.

Returns:
    Tensor: A 1-D tensor with shape [N], the value of yolov3 loss

Examples:
    .. code-block:: python

        >>> import paddle
        >>> x = paddle.rand([2, 14, 8, 8]).astype('float32')
        >>> gt_box = paddle.rand([2, 10, 4]).astype('float32')
        >>> gt_label = paddle.rand([2, 10]).astype('int32')
        >>> loss = paddle.vision.ops.yolo_loss(x,
        ...                                    gt_box=gt_box,
        ...                                    gt_label=gt_label,
        ...                                    anchors=[10, 13, 16, 30],
        ...                                    anchor_mask=[0, 1],
        ...                                    class_num=2,
        ...                                    ignore_thresh=0.7,
        ...                                    downsample_ratio=8,
        ...                                    use_label_smooth=True,
        ...                                    scale_x_y=1.)
yolov3_lossxfloat32float64r#   gt_boxgt_labelint32anchorsanchor_mask	class_numignore_threshuse_label_smoothdtype)XGTBoxGTLabelGTScore)r>   r?   r@   rA   downsample_ratiorB   	scale_x_y)LossObjectnessMaskGTMatchMasktypeinputsoutputsattrs)r7   )r   r   r#   r   localsr   r   listtupleintfloatbool"create_variable_for_type_inferencerD   	append_op)r8   r;   r<   r>   r?   r@   rA   rI   gt_scorerB   namerJ   losshelperobjectness_maskgt_match_maskrP   rR   s                     Q/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/vision/ops.pyr#   r#   E   s   X 
  7fh7 C)Y)?M Hy)4k	
 	!:wL7Ie}kB;e}kJ9k3<=/5+F#%7{K88qww8G CC D 
 AAAP 

  (F9 &"* 0 0"
 	"1,
  	 		
     F      ?c                   [        5       (       a$  [        R                  " U UUUUUUUU	U
5
      u  pX4$ [        S0 [	        5       D6n[        U SSS/S5        [        USSS5        [        US[        [        4S5        [        US[        S5        UR                  U R                  S	9nUR                  U R                  S	9nUUUUUUU	U
S
.nUR                  SU US.UUS.US9  X4$ )a\  

This operator generates YOLO detection boxes from output of YOLOv3 network.

The output of previous network is in shape [N, C, H, W], while H and W
should be the same, H and W specify the grid size, each grid point predict
given number boxes, this given number, which following will be represented as S,
is specified by the number of anchors. In the second dimension(the channel
dimension), C should be equal to S * (5 + class_num) if :attr:`iou_aware` is false,
otherwise C should be equal to S * (6 + class_num). class_num is the object
category number of source dataset(such as 80 in coco dataset), so the
second(channel) dimension, apart from 4 box location coordinates x, y, w, h,
also includes confidence score of the box and class one-hot key of each anchor
box.

Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box
predictions should be as follows:

$$
b_x = \\sigma(t_x) + c_x
$$
$$
b_y = \\sigma(t_y) + c_y
$$
$$
b_w = p_w e^{t_w}
$$
$$
b_h = p_h e^{t_h}
$$

in the equation above, :math:`c_x, c_y` is the left top corner of current grid
and :math:`p_w, p_h` is specified by anchors.

The logistic regression value of the 5th channel of each anchor prediction boxes
represents the confidence score of each prediction box, and the logistic
regression value of the last :attr:`class_num` channels of each anchor prediction
boxes represents the classification scores. Boxes with confidence scores less than
:attr:`conf_thresh` should be ignored, and box final scores is the product of
confidence scores and classification scores.

$$
score_{pred} = score_{conf} * score_{class}
$$


Args:
    x (Tensor): The input tensor of YoloBox operator is a 4-D tensor with
                  shape of [N, C, H, W]. The second dimension(C) stores box
                  locations, confidence score and classification one-hot keys
                  of each anchor box. Generally, X should be the output of
                  YOLOv3 network. The data type is float32 or float64.
    img_size (Tensor): The image size tensor of YoloBox operator, This is a
                       2-D tensor with shape of [N, 2]. This tensor holds
                       height and width of each input image used for resizing
                       output box in input image scale. The data type is int32.
    anchors (list|tuple): The anchor width and height, it will be parsed pair
                          by pair.
    class_num (int): The number of classes.
    conf_thresh (float): The confidence scores threshold of detection boxes.
                         Boxes with confidence scores under threshold should
                         be ignored.
    downsample_ratio (int): The downsample ratio from network input to
                            :attr:`yolo_box` operator input, so 32, 16, 8
                            should be set for the first, second, and third
                            :attr:`yolo_box` layer.
    clip_bbox (bool, optional): Whether clip output bonding box in :attr:`img_size`
                      boundary. Default true.
    name (str|None, optional): The default value is None. Normally there is no need
                   for user to set this property. For more information,
                   please refer to :ref:`api_guide_Name`.
    scale_x_y (float, optional): Scale the center point of decoded bounding box. Default 1.0
    iou_aware (bool, optional): Whether use iou aware. Default false.
    iou_aware_factor (float, optional): iou aware factor. Default 0.5.

Returns:
    Tensor: A 3-D tensor with shape [N, M, 4], the coordinates of boxes,
    and a 3-D tensor with shape [N, M, :attr:`class_num`], the classification
    scores of boxes.

Examples:
    .. code-block:: python

        >>> import paddle

        >>> x = paddle.rand([2, 14, 8, 8]).astype('float32')
        >>> img_size = paddle.ones((2, 2)).astype('int32')
        >>> boxes, scores = paddle.vision.ops.yolo_box(x,
        ...                                             img_size=img_size,
        ...                                             anchors=[10, 13, 16, 30],
        ...                                             class_num=2,
        ...                                             conf_thresh=0.01,
        ...                                             downsample_ratio=8,
        ...                                             clip_bbox=True,
        ...                                             scale_x_y=1.)
r$   r8   r9   r:   img_sizer=   r>   conf_threshrC   )r>   r@   rf   rI   	clip_bboxrJ   	iou_awareiou_aware_factor)rE   ImgSize)BoxesScoresrN   )r$   )r   r   r$   r   rS   r   r   rT   rU   rW   rY   rD   rZ   )r8   re   r>   r@   rf   rI   rg   r\   rJ   rh   ri   boxesscoresr^   rR   s                  ra   r$   r$     s&   Z 
 } 4684 C)Y)?L :w
K7Ie}jA;ujA999H:::I "& 0""" 0	
 	#
    	 	
 }rb   )皙?ro   皙?rp           c                @   S nU" U5      (       d  U/nU" U5      (       d  U/nU" U5      (       d  U/n[        U5      S:X  d  [        S5      e[        [        [        U5      5      n[        [        [        U5      5      n[        [        [        U5      5      nSnUb*  [        U5      S:  a  US   S:  a  U" U5      (       d  U/nUn[        5       (       a1  Uu  pUc  / n[        R                  " U UUUUUUUUUU	U
5      u  nnUU4$ [        S0 [        5       D6nUR                  5       n[        U S/ SQS5        UUUUUUS   US	   U	U
S
.	nUb  UUS'   UR                  U5      nUR                  U5      nUR                  SXS.UUS.US9  SUl        SUl        UU4$ )a
  

This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm.

Each position of the input produce N prior boxes, N is determined by
the count of min_sizes, max_sizes and aspect_ratios, The size of the
box is in range(min_size, max_size) interval, which is generated in
sequence according to the aspect_ratios.

Args:
   input (Tensor): 4-D tensor(NCHW), the data type should be float32 or float64.
   image (Tensor): 4-D tensor(NCHW), the input image data of PriorBoxOp,
        the data type should be float32 or float64.
   min_sizes (list|tuple|float): the min sizes of generated prior boxes.
   max_sizes (list|tuple|float|None, optional): the max sizes of generated prior boxes.
        Default: None, means [] and will not be used.
   aspect_ratios (list|tuple|float, optional): the aspect ratios of generated
        prior boxes. Default: [1.0].
   variance (list|tuple|float, optional): the variances to be encoded in prior boxes.
        Default:[0.1, 0.1, 0.2, 0.2].
   flip (bool): Whether to flip aspect ratios. Default:False.
   clip (bool): Whether to clip out-of-boundary boxes. Default: False.
   steps (list|tuple|float, optional): Prior boxes steps across width and height, If
        steps[0] equals to 0.0 or steps[1] equals to 0.0, the prior boxes steps across
        height or weight of the input will be automatically calculated.
        Default: [0., 0.]
   offset (float, optional)): Prior boxes center offset. Default: 0.5
   min_max_aspect_ratios_order (bool, optional): If set True, the output prior box is
        in order of [min, max, aspect_ratios], which is consistent with
        Caffe. Please note, this order affects the weights order of
        convolution layer followed by and does not affect the final
        detection results. Default: False.
   name (str, optional): The default value is None. Normally there is no need for
        user to set this property. For more information, please refer to :ref:`api_guide_Name`

Returns:
    Tensor: the output prior boxes and the expanded variances of PriorBox.
        The prior boxes is a 4-D tensor, the layout is [H, W, num_priors, 4],
        num_priors is the total box count of each position of input.
        The expanded variances is a 4-D tensor, same shape as the prior boxes.

Examples:
    .. code-block:: python

        >>> import paddle

        >>> input = paddle.rand((1, 3, 6, 9), dtype=paddle.float32)
        >>> image = paddle.rand((1, 3, 9, 12), dtype=paddle.float32)
        >>> box, var = paddle.vision.ops.prior_box(
        ...     input=input,
        ...     image=image,
        ...     min_sizes=[2.0, 4.0],
        ...     clip=True,
        ...     flip=True)
        ...
c                .    [        U [        [        45      $ N)
isinstancerT   rU   )datas    ra   _is_list_or_tuple_%prior_box.<locals>._is_list_or_tuple_  s    $u..rb   r   z steps should be (step_w, step_h)Nr   r%   input)uint8int8r9   r:      )		min_sizesaspect_ratios	variancesflipclipstep_wstep_hoffsetmin_max_aspect_ratios_order	max_sizes)InputImage)rk   	VariancesrN   T)r%   )len
ValueErrorrT   maprW   r   r   r%   r   rS   input_dtyper   rY   rZ   stop_gradient)ry   imager}   r   r~   variancer   r   stepsr   r   r\   rw   cur_max_sizesr   r   boxvarr^   rD   rR   s                        ra   r%   r%     s   N/ i((K	m,,&e$$u:?;<<S	*+IUM23MUE"#EMY!!3	!q8H!),,"I!I##'
S Cx 5FH5""$ 7C[	
 #*!AhAh+F

 $!.E+77>77>"3!4	 	 	
 ! Cxrb   c           	     @   [        5       (       a  [        U[        R                  R                  [
        R                  R                  45      (       a  [        R                  " U UUUUU/ 5      nU$ [        U[        [        45      (       a?  [        U5      n[        U5      S:X  d   S5       e[        R                  " U SUUUUU5      nU$ [        S5      e[        U SSS/S5        [        US	SS/S5        [        S0 [!        5       D6nUR#                  U R$                  S
9nXS.n	UUUS.n
[        U[&        5      (       a  XS'   OD[        U[        [        45      (       a  XS'   [        U
S   5      S:X  d   S5       eO[        S5      eUR)                  SU	U
SU0S9  U$ )ad  
Encode/Decode the target bounding box with the priorbox information.

The Encoding schema described below:

.. math::

    ox &= (tx - px) / pw / pxv

    oy &= (ty - py) / ph / pyv

    ow &= log(abs(tw / pw)) / pwv

    oh &= log(abs(th / ph)) / phv

The Decoding schema described below:

.. math::

    ox &= (pw * pxv * tx * + px) - tw / 2

    oy &= (ph * pyv * ty * + py) - th / 2

    ow &= exp(pwv * tw) * pw + tw / 2

    oh &= exp(phv * th) * ph + th / 2

where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates,
width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote
the priorbox's (anchor) center coordinates, width and height. `pxv`,
`pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`,
`ow`, `oh` denote the encoded/decoded coordinates, width and height.
During Box Decoding, two modes for broadcast are supported. Say target
box has shape [N, M, 4], and the shape of prior box can be [N, 4] or
[M, 4]. Then prior box will broadcast to target box along the
assigned axis.

Args:
    prior_box (Tensor): Box list prior_box is a 2-D Tensor with shape
        [M, 4] holds M boxes and data type is float32 or float64. Each box
        is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the
        left top coordinate of the anchor box, if the input is image feature
        map, they are close to the origin of the coordinate system.
        [xmax, ymax] is the right bottom coordinate of the anchor box.
    prior_box_var (Tensor|List|tuple|None): prior_box_var supports four types
        of input. One is Tensor with shape [M, 4] which holds M group and
        data type is float32 or float64. The second is list or tuple consist
        of 4 elements shared by all boxes and data type is float32 or float64.
        Other is None and not involved in calculation.
    target_box (Tensor): This input can be a 2-D DenseTensor with shape
        [N, 4] when code_type is 'encode_center_size'. This input also can
        be a 3-D Tensor with shape [N, M, 4] when code_type is
        'decode_center_size'. Each box is represented as
        [xmin, ymin, xmax, ymax]. The data type is float32 or float64.
    code_type (str, optional): The code type used with the target box. It can be
        `encode_center_size` or `decode_center_size`. `encode_center_size`
        by default.
    box_normalized (bool, optional): Whether treat the priorbox as a normalized box.
        Set true by default.
    axis (int, optional): Which axis in PriorBox to broadcast for box decode,
        for example, if axis is 0 and TargetBox has shape [N, M, 4] and
        PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4]
        for decoding. It is only valid when code type is
        `decode_center_size`. Set 0 by default.
    name (str, optional): For detailed information, please refer
        to :ref:`api_guide_Name`. Usually name is no need to set and
        None by default.

Returns:
    Tensor: output boxes, when code_type is 'encode_center_size', the
        output tensor of box_coder_op with shape [N, M, 4] representing the
        result of N target boxes encoded with M Prior boxes and variances.
        When code_type is 'decode_center_size', N represents the batch size
        and M represents the number of decoded boxes.

Examples:
    .. code-block:: python

        >>> import paddle

        >>> # For encode
        >>> prior_box_encode = paddle.rand((80, 4), dtype=paddle.float32)
        >>> prior_box_var_encode = paddle.rand((80, 4), dtype=paddle.float32)
        >>> target_box_encode = paddle.rand((20, 4), dtype=paddle.float32)
        >>> output_encode = paddle.vision.ops.box_coder(
        ...     prior_box=prior_box_encode,
        ...     prior_box_var=prior_box_var_encode,
        ...     target_box=target_box_encode,
        ...     code_type="encode_center_size")
        ...
        >>> # For decode
        >>> prior_box_decode = paddle.rand((80, 4), dtype=paddle.float32)
        >>> prior_box_var_decode = paddle.rand((80, 4), dtype=paddle.float32)
        >>> target_box_decode = paddle.rand((20, 80, 4), dtype=paddle.float32)
        >>> output_decode = paddle.vision.ops.box_coder(
        ...     prior_box=prior_box_decode,
        ...     prior_box_var=prior_box_var_decode,
        ...     target_box=target_box_decode,
        ...     code_type="decode_center_size",
        ...     box_normalized=False)
        ...
   zCInput prior_box_var must be Variable or list|tuple with 4 elements.Nz2Input prior_box_var must be Variable or list|tupler%   r9   r:   r&   
target_boxrC   )PriorBox	TargetBox)	code_typebox_normalizedaxisPriorBoxVarr   	OutputBoxrO   rP   rR   rQ   )r&   )r   ru   r   eagerr   paddlepirValuer   r&   rT   rU   r   	TypeErrorr   r   rS   rY   rD   r   rZ   )r%   prior_box_varr   r   r   r   r\   
output_boxr^   rP   rR   s              ra   r&   r&   H  s   b mdjj&7&79I9I%JKK))J6 % e}55 /M}%* U*  ))J  D 
 	!{Y	$:K	
 	!y)&<k	
 5FH5>>// ? 

 (A",

 mX..$1=!e}55 -*uZ()Q. U. D  	 *-	 	 	
 rb   c                   [        USS5      n[        USS5      n[        USS5      nU	c  SOSn[        R                  R                  5       S:X  a  UR                  [        R
                  :X  a  UR                  U R                  5      nUR                  [        R
                  :X  a  UR                  U R                  5      nUb9  UR                  [        R
                  :X  a  UR                  U R                  5      nU	b9  U	R                  [        R
                  :X  a  U	R                  U R                  5      n	[        5       (       a3  [        R                  " U UUU	UUUUUS5
      nUb  [        XSS	9nU$ Un U$ [        U S
SS/S5        [        USSS/S5        U R                  S   n[        S0 [        5       D6nUR                  5       n[        USS5      n[        USS5      n[        USS5      nUR!                  U5      nU(       a	  SnU UUS.nO	SnU UUU	S.nSU0nUUUUUSS.nUR#                  UUUUS9  Ub/  UR!                  U5      nUR#                  SU/U/S.SU/0SS0S9  U$ UnU$ )a  
Compute 2-D deformable convolution on 4-D input.
Given input image x, output feature map y, the deformable convolution operation can be expressed as follow:


Deformable Convolution v2:

.. math::

    y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k) * \Delta m_k}

Deformable Convolution v1:

.. math::

    y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k)}

Where :math:`\Delta p_k` and :math:`\Delta m_k` are the learnable offset and modulation scalar for the k-th location,
Which :math:`\Delta m_k` is one in deformable convolution v1. Please refer to `Deformable ConvNets v2: More Deformable, Better Results
<https://arxiv.org/abs/1811.11168v2>`_ and `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_.

Example:
    - Input:

      x shape: :math:`(N, C_{in}, H_{in}, W_{in})`

      weight shape: :math:`(C_{out}, C_{in}, H_f, W_f)`

      offset shape: :math:`(N, 2 * H_f * W_f, H_{out}, W_{out})`

      mask shape: :math:`(N, H_f * W_f, H_{out}, W_{out})`

    - Output:

      Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`

    Where

    .. math::

        H_{out}&= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\
        W_{out}&= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1

Args:
    x (Tensor): The input image with [N, C, H, W] format. A Tensor with type
        float32, float64.
    offset (Tensor): The input coordinate offset of deformable convolution layer.
        A Tensor with type float32, float64.
    weight (Tensor): The convolution kernel with shape [M, C/g, kH, kW], where M is
        the number of output channels, g is the number of groups, kH is the filter's
        height, kW is the filter's width.
    bias (Tensor, optional): The bias with shape [M,]. Default: None.
    stride (int|list|tuple, optional): The stride size. If stride is a list/tuple, it must
        contain two integers, (stride_H, stride_W). Otherwise, the
        stride_H = stride_W = stride. Default: 1.
    padding (int|list|tuple, optional): The padding size. If padding is a list/tuple, it must
        contain two integers, (padding_H, padding_W). Otherwise, the
        padding_H = padding_W = padding. Default: 0.
    dilation (int|list|tuple, optional): The dilation size. If dilation is a list/tuple, it must
        contain two integers, (dilation_H, dilation_W). Otherwise, the
        dilation_H = dilation_W = dilation. Default: 1.
    deformable_groups (int): The number of deformable group partitions.
        Default: 1.
    groups (int, optional): The groups number of the deformable conv layer. According to
        grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
        the first half of the filters is only connected to the first half
        of the input channels, while the second half of the filters is only
        connected to the second half of the input channels. Default: 1.
    mask (Tensor, optional): The input mask of deformable convolution layer.
        A Tensor with type float32, float64. It should be None when you use
        deformable convolution v1. Default: None.
    name(str|None, optional): For details, please refer to :ref:`api_guide_Name`.
                    Generally, no setting is required. Default: None.
Returns:
    Tensor: 4-D Tensor storing the deformable convolution result.\
        A Tensor with type float32, float64.

Examples:
    .. code-block:: pycon

        >>> #deformable conv v2:

        >>> import paddle
        >>> input = paddle.rand((8, 1, 28, 28))
        >>> kh, kw = 3, 3
        >>> weight = paddle.rand((16, 1, kh, kw))
        >>> # offset shape should be [bs, 2 * kh * kw, out_h, out_w]
        >>> # mask shape should be [bs, hw * hw, out_h, out_w]
        >>> # In this case, for an input of 28, stride of 1
        >>> # and kernel size of 3, without padding, the output size is 26
        >>> offset = paddle.rand((8, 2 * kh * kw, 26, 26))
        >>> mask = paddle.rand((8, kh * kw, 26, 26))
        >>> out = paddle.vision.ops.deform_conv2d(input, offset, weight, mask=mask)
        >>> print(out.shape)
        paddle.Size([8, 16, 26, 26])

        >>> #deformable conv v1:

        >>> import paddle
        >>> input = paddle.rand((8, 1, 28, 28))
        >>> kh, kw = 3, 3
        >>> weight = paddle.rand((16, 1, kh, kw))
        >>> # offset shape should be [bs, 2 * kh * kw, out_h, out_w]
        >>> # In this case, for an input of 28, stride of 1
        >>> # and kernel size of 3, without padding, the output size is 26
        >>> offset = paddle.rand((8, 2 * kh * kw, 26, 26))
        >>> out = paddle.vision.ops.deform_conv2d(input, offset, weight)
        >>> print(out.shape)
        paddle.Size([8, 16, 26, 26])
r   stridepaddingdilationTFcpur|   )r   r8   r9   r:   r'   r   deformable_convdeformable_conv_v1)r   FilterOffset)r   r   r   MaskOutput)stridespaddings	dilationsgroupsdeformable_groupsim2col_steprN   elementwise_add)rE   YOutr   )r   )r
   r   device
get_devicerD   float16astyper   r   r   r	   r   shaper   rS   r   rY   rZ   )r8   r   weightbiasr   r   r   r   r   maskr\   use_deform_conv2d_v1pre_biasoutnum_channelsr^   rD   op_typerP   rQ   rR   s                        ra   r'   r'     s   v VQ1Fgq)4GxJ7H#'<4U }}!U*<<6>>)]]177+F<<6>>)]]177+F

fnn <;;qww'D

fnn <;;qww'D))
  a8Cz Jw Cv Js 	!sY	*O	
 	!Hy)4o	
 wwqz;&(;""$ H5!'1i8"8Q
;<<UC*G  F (G  	F X&!!2
 	 	 	
 ;;EBC&&Ztf5qk	   J CJrb   c                     ^  \ rS rSr% SrS\S'   S\S'          S	                     S
U 4S jjjr S       SS jjrSrU =r	$ )r(   i  a|  
Compute 2-D deformable convolution on 4-D input.
Given input image x, output feature map y, the deformable convolution operation can be expressed as follow:


Deformable Convolution v2:

.. math::

    y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k) * \Delta m_k}

Deformable Convolution v1:

.. math::

    y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k)}

Where :math:`\Delta p_k` and :math:`\Delta m_k` are the learnable offset and modulation scalar for the k-th location,
Which :math:`\Delta m_k` is one in deformable convolution v1. Please refer to `Deformable ConvNets v2: More Deformable, Better Results
<https://arxiv.org/abs/1811.11168v2>`_ and `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_.

Example:
    - Input:

      x shape: :math:`(N, C_{in}, H_{in}, W_{in})`

      weight shape: :math:`(C_{out}, C_{in}, H_f, W_f)`

      offset shape: :math:`(N, 2 * H_f * W_f, H_{out}, W_{out})`

      mask shape: :math:`(N, H_f * W_f, H_{out}, W_{out})`

    - Output:

      Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`

    Where

    .. math::

        H_{out}&= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\
        W_{out}&= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1


Parameters:
    in_channels(int): The number of input channels in the input image.
    out_channels(int): The number of output channels produced by the convolution.
    kernel_size(int|list|tuple): The size of the convolving kernel.
    stride(int|list|tuple, optional): The stride size. If stride is a list/tuple, it must
        contain three integers, (stride_H, stride_W). Otherwise, the
        stride_H = stride_W = stride. The default value is 1.
    padding (int|list|tuple, optional): The padding size. If padding is a list/tuple, it must
        contain two integers, (padding_H, padding_W). Otherwise, the
        padding_H = padding_W = padding. Default: padding = 0.
    dilation(int|list|tuple, optional): The dilation size. If dilation is a list/tuple, it must
        contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
        dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
    deformable_groups (int, optional): The number of deformable group partitions.
        Default: deformable_groups = 1.
    groups(int, optional): The groups number of the Conv3D Layer. According to grouped
        convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
        the first half of the filters is only connected to the first half
        of the input channels, while the second half of the filters is only
        connected to the second half of the input channels. The default value is 1.
    weight_attr(ParamAttr|None, optional): The parameter attribute for learnable parameters/weights
        of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
        will create ParamAttr as param_attr. If it is set to None, the parameter
        is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
        :math:`(\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None.
    bias_attr(ParamAttr|bool|None, optional): The parameter attribute for the bias of conv2d.
        If it is set to False, no bias will be added to the output units.
        If it is set to None or one attribute of ParamAttr, conv2d
        will create ParamAttr as bias_attr. If the Initializer of the bias_attr
        is not set, the bias is initialized zero. The default value is None.
Attribute:
    **weight** (Parameter): the learnable weights of filter of this layer.
    **bias** (Parameter or None): the learnable bias of this layer.
Shape:
    - x: :math:`(N, C_{in}, H_{in}, W_{in})`
    - offset: :math:`(N, 2 * H_f * W_f, H_{out}, W_{out})`
    - mask: :math:`(N, H_f * W_f, H_{out}, W_{out})`
    - output: :math:`(N, C_{out}, H_{out}, W_{out})`

    Where

    ..  math::

        H_{out}&= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 \\
        W_{out}&= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1

Examples:
    .. code-block:: pycon

        >>> #deformable conv v2:
        >>> import paddle
        >>> input = paddle.rand((8, 1, 28, 28))
        >>> kh, kw = 3, 3
        >>> # offset shape should be [bs, 2 * kh * kw, out_h, out_w]
        >>> # mask shape should be [bs, hw * hw, out_h, out_w]
        >>> # In this case, for an input of 28, stride of 1
        >>> # and kernel size of 3, without padding, the output size is 26
        >>> offset = paddle.rand((8, 2 * kh * kw, 26, 26))
        >>> mask = paddle.rand((8, kh * kw, 26, 26))
        >>> deform_conv = paddle.vision.ops.DeformConv2D(
        ...     in_channels=1,
        ...     out_channels=16,
        ...     kernel_size=[kh, kw])
        >>> out = deform_conv(input, offset, mask)
        >>> print(out.shape)
        paddle.Size([8, 16, 26, 26])

        >>> #deformable conv v1:
        >>> import paddle
        >>> input = paddle.rand((8, 1, 28, 28))
        >>> kh, kw = 3, 3
        >>> # offset shape should be [bs, 2 * kh * kw, out_h, out_w]
        >>> # mask shape should be [bs, hw * hw, out_h, out_w]
        >>> # In this case, for an input of 28, stride of 1
        >>> # and kernel size of 3, without padding, the output size is 26
        >>> offset = paddle.rand((8, 2 * kh * kw, 26, 26))
        >>> deform_conv = paddle.vision.ops.DeformConv2D(
        ...     in_channels=1,
        ...     out_channels=16,
        ...     kernel_size=[kh, kw])
        >>> out = deform_conv(input, offset)
        >>> print(out.shape)
        paddle.Size([8, 16, 26, 26])
r   r   r   c                $  >^  [         TT ]  5         U	SLd   S5       eU	T l        U
T l        UT l        UT l        UT l        UT l        ST l        [        USS5      T l
        [        USS5      T l        [        USS5      T l        X-  S:w  a  [        S	5      e[        USS
5      T l        X!U-  /T R                  QnU 4S jnT R                  UT R                  U" 5       S9T l        T R                  T R                  T R                  /SS9T l        g )NFz(weight_attr should not be False in Conv.r|   r   r   r   kernel_sizer   z(in_channels must be divisible by groups.r   c                    > [         R                  " TR                  5      TR                  -  n SU -  S-  n[	        SU5      $ )N       @rc   rq   )npprod_kernel_size_in_channelsr   )filter_elem_numstdselfs     ra   _get_default_param_initializer=DeformConv2D.__init__.<locals>._get_default_param_initializer  s>     ggd&7&784;L;LLO(S0C#s##rb   )r   attrdefault_initializerT)r   r   is_bias)super__init___weight_attr
_bias_attr_deformable_groups_groupsr   _out_channels_channel_dimr
   _stride	_dilationr   r   _paddingcreate_parameterr   r   )r   in_channelsout_channelsr   r   r   r   r   r   weight_attr	bias_attrfilter_shaper   	__class__s   `            ra   r   DeformConv2D.__init__]  s-    	%' 	
6	
' (#"3')&vq(;(1jA+KMJ1$GHH'I>$V&;Pd>O>OP	$
 ++"" > @ , 

 ))););(<d * 
	rb   c                    [        UUU R                  U R                  U R                  U R                  U R
                  U R                  U R                  US9
nU$ )N)
r8   r   r   r   r   r   r   r   r   r   )r'   r   r   r   r   r   r   r   )r   r8   r   r   r   s        ra   forwardDeformConv2D.forward  sQ     ;;<<MM^^"55<<
 
rb   )r   r   r   r   r   r   r   r   r   r   r   r   r   )r|   r   r|   r|   r|   NN)r   rV   r   rV   r   r    r   r    r   r    r   r    r   rV   r   rV   r   ParamAttrLike | Noner   r   returnNonert   )r8   r   r   r   r   Tensor | Noner   r   
__name__
__module____qualname____firstlineno____doc____annotations__r   r   __static_attributes____classcell__r   s   @ra   r(   r(     s    B N
L !",0*.0
0
 0
 	0

 0
 0
 0
 0
 0
 *0
 (0
 
0
 0
f @D!'/<	 rb   r(   c                    g rt    fpn_rois	min_level	max_levelrefer_levelrefer_scalepixel_offsetrois_numr\   s           ra   r)   r)     s     ),rb   c                    g rt   r   r   s           ra   r)   r)     s     14rb   c                   US:  a  US:  d   S5       eX!-
  S-   nUS:  d   S5       eUS:  d   S5       e[        5       (       a.  Uc   S5       e[        R                  " U UUUUUU5      u  n	n
nXU
4$ [        U S	S
S/S5        [	        S0 [        5       D6nUR                  S	5      n[        U5       Vs/ s H  nUR                  U5      PM     n	nUR                  SS9nSU 0nU	US.nUb2  XoS'   [        U5       Vs/ s H  nUR                  SS9PM     n
nU
US'   OSn
UR                  SUUUUUUUS.S9  XU
4$ s  snf s  snf )a  

In Feature Pyramid Networks (FPN) models, it is needed to distribute
all proposals into different FPN level, with respect to scale of the proposals,
the referring scale and the referring level. Besides, to restore the order of
proposals, we return an array which indicates the original index of rois
in current proposals. To compute FPN level for each roi, the formula is given as follows:

.. math::
    roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} \\
    level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level)

where BBoxArea is a function to compute the area of each roi.

Args:
    fpn_rois (Tensor): The input fpn_rois. 2-D Tensor with shape [N, 4] and data type can be
        float32 or float64.
    min_level (int): The lowest level of FPN layer where the proposals come
        from.
    max_level (int): The highest level of FPN layer where the proposals
        come from.
    refer_level (int): The referring level of FPN layer with specified scale.
    refer_scale (int): The referring scale of FPN layer with specified level.
    pixel_offset (bool, optional): Whether there is pixel offset. If True, the offset of
        image shape will be 1. 'False' by default.
    rois_num (Tensor|None, optional): 1-D Tensor contains the number of RoIs in each image.
        The shape is [B] and data type is int32. B is the number of images.
        If rois_num not None, it will return a list of 1-D Tensor. Each element
        is the output RoIs' number of each image on the corresponding level
        and the shape is [B]. None by default.
    name (str|None, optional): For detailed information, please refer
        to :ref:`api_guide_Name`. Usually name is no need to set and
        None by default.

Returns:
    - multi_rois (List), The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is
      and data type is same as `fpn_rois` . The length is max_level-min_level+1.
    - restore_ind (Tensor), The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1]
      , where N is the number of total rois. The data type is int32.
    - rois_num_per_level (List), A list of 1-D Tensor and each Tensor is
      the RoIs' number in each image on the corresponding level. The shape
      is [B] and data type of int32, where B is the number of images.

Examples:
    .. code-block:: python

        >>> import paddle

        >>> fpn_rois = paddle.rand((10, 4))
        >>> rois_num = paddle.to_tensor([3, 1, 4, 2], dtype=paddle.int32)
        >>> multi_rois, restore_ind, rois_num_per_level = paddle.vision.ops.distribute_fpn_proposals(
        ...     fpn_rois=fpn_rois,
        ...     min_level=2,
        ...     max_level=5,
        ...     refer_level=4,
        ...     refer_scale=224,
        ...     rois_num=rois_num)
        ...
r   z0min_level and max_level should be greater than 0r|   z*max_level should be greater than min_leveld   zAOnly support max to 100 levels, (max_level - min_level + 1 < 100)Nz,rois_num should not be None in dygraph mode.r   r9   r:   r)   r=   rC   FpnRois)MultiFpnRoisRestoreIndexRoisNumMultiLevelRoIsNum)r  r  r  r  r  rN   )r)   )
r   r   r)   r   r   rS   r   rangerY   rZ   )r   r  r  r  r  r  r  r\   num_lvl
multi_roisrois_num_per_levelrestore_indr^   rD   irP   rQ   s                    ra   r)   r)     s   J q=Y] :* #a'GQ;DDD;S= K= # 	
:	
# ++
		
 (::: 	!	"&		
 D68D"":. 7^
# 55e<# 	 

 ??g?NX&&'

  (9 w"'A 999H'  " ,>G'(!%+&&** ,	 	 	
 (:::G
"s   3D;<E c                   [        S5      n[        5       (       a*  [        R                  " X[        R
                  " 5       5      $ 0 nSU 0n[        S0 [        5       D6nUR                  S5      nUR                  SX4SU0S9  U$ )a  
Reads and outputs the bytes contents of a file as a uint8 Tensor
with one dimension.

Args:
    filename (str): Path of the file to be read.
    name (str|None, optional): The default value is None. Normally there is no
        need for user to set this property. For more information, please
        refer to :ref:`api_guide_Name`.

Returns:
    A uint8 tensor.

Examples:
    .. code-block:: pycon

        >>> import cv2
        >>> import paddle
        >>> paddle.seed(2023)
        >>> fake_img = (paddle.rand((400, 300, 3)).numpy() * 255).astype('uint8')
        >>> cv2.imwrite('fake.jpg', fake_img)
        >>> img_bytes = paddle.vision.ops.read_file('fake.jpg')
        >>> print(img_bytes.shape)
        paddle.Size([142773])
rz   filenamer+   r   r   )r+   )
r   r   r   r+   r   CPUPlacer   rS   rY   rZ   )r  r\   
attr_dtyperP   rR   r^   r   s          ra   r+   r+   L  s    6 ,G4Jfoo6GHHX&5FH577@V5#, 	 	
 
rb   c                    [        5       (       a  [        R                  " X[        5       5      $ SU 0nSU0n[	        S0 [        5       D6nUR                  S5      nUR                  SX4SU0S9  U$ )a  
Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor.
Optionally converts the image to the desired format.
The values of the output tensor are uint8 between 0 and 255.

Args:
    x (Tensor): A one dimensional uint8 tensor containing the raw bytes
        of the JPEG image.
    mode (str, optional): The read mode used for optionally converting the image. Must be one of
        ["unchanged", "gray", "rgb"]. Default: 'unchanged'.
    name (str, optional): The default value is None. Normally there is no
        need for user to set this property. For more information, please
        refer to :ref:`api_guide_Name`.
Returns:
    Tensor: A decoded image tensor with shape (image_channels, image_height, image_width)

Examples:
    .. code-block:: pycon

        >>> # doctest: +REQUIRES(env:GPU)
        >>> import cv2
        >>> import numpy as np
        >>> import paddle
        >>> paddle.device.set_device('gpu')

        >>> fake_img = (np.random.random((400, 300, 3)) * 255).astype('uint8')
        >>> cv2.imwrite('fake.jpg', fake_img)
        >>> img_bytes = paddle.vision.ops.read_file('fake.jpg')
        >>> img = paddle.vision.ops.decode_jpeg(img_bytes)
        >>> print(img.shape)
        paddle.Size([3, 400, 300])
rE   moder,   rz   r   r   )r,   )r   r   r,   r   r   rS   rY   rZ   )r8   r  r\   rP   rR   r^   r   s          ra   r,   r,   w  s{    J !!!+B+DEEq7fh777@vUCL 	 	
 
rb   c           
        [        US[        [        [        4S5        [	        U[        5      (       a  X34nUu  pg[        U R                  5      S:X  d   S5       eXg-  S:X  a  [        S5      e[        U R                  S   Xg-  -  5      n[        5       (       a  [        R                  " U UUUUUU5      $ [        S0 [        5       D6n	U	R                  5       n
U	R                  U
5      nU	R                  SXS.S	U0UUUUS
.S9  U$ )aF  
Position sensitive region of interest pooling (also known as PSROIPooling) is to perform
position-sensitive average pooling on regions of interest specified by input. It performs
on inputs of nonuniform sizes to obtain fixed-size feature maps.

PSROIPooling is proposed by R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details.

Args:
    x (Tensor): Input features with shape (N, C, H, W). The data type can be float32 or float64.
    boxes (Tensor): Box coordinates of ROIs (Regions of Interest) to pool over. It should be
                     a 2-D Tensor with shape (num_rois, 4). Given as [[x1, y1, x2, y2], ...],
                     (x1, y1) is the top left coordinates, and (x2, y2) is the bottom
                     right coordinates.
    boxes_num (Tensor): The number of boxes contained in each picture in the batch.
    output_size (int|Tuple(int, int))  The pooled output size(H, W), data type
                           is int32. If int, H and W are both equal to output_size.
    spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their
                           input scale to the scale used when pooling. Default: 1.0
    name(str|None, optional): The default value is None.
                         Normally there is no need for user to set this property.
                         For more information, please refer to :ref:`api_guide_Name`

Returns:
    4-D Tensor. The pooled ROIs with shape (num_rois, output_channels, pooled_h, pooled_w).
    The output_channels equal to C / (pooled_h * pooled_w), where C is the channels of input.

Examples:
    .. code-block:: pycon

        >>> import paddle
        >>> x = paddle.uniform([2, 490, 28, 28], dtype='float32')
        >>> boxes = paddle.to_tensor(
        ...     [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]],
        ...     dtype='float32',
        ... )
        >>> boxes_num = paddle.to_tensor([1, 2], dtype='int32')
        >>> pool_out = paddle.vision.ops.psroi_pool(x, boxes, boxes_num, 7, 1.0)
        >>> print(pool_out.shape)
        paddle.Size([3, 10, 7, 7])
output_sizer/   r   z0Input features with shape should be (N, C, H, W)r   z!output_size should not contain 0.r|   rE   ROIsr   )output_channelsspatial_scalepooled_heightpooled_widthrN   )r/   )r   rV   rU   rT   ru   r   r   r   r   r   r/   r   rS   r   rY   rZ   )r8   rm   	boxes_numr  r   r\   r!  r"  r  r^   rD   r   s               ra   r/   r/     s   b {MC+=|L+s##"0"-Mqww<1PPP#q(<==!''!*(DEFO  
 	
 6VX6""$77>*CL#2!.!. ,		 	 
	
 
rb   c                  T   ^  \ rS rSr% SrS\S'   S\S'   S
SU 4S jjjrSS jrS	rU =r	$ )r0   i   a^  
This interface is used to construct a callable object of the ``PSRoIPool`` class. Please
refer to :ref:`api_paddle_vision_ops_psroi_pool`.

Args:
    output_size (int|Tuple(int, int))  The pooled output size(H, W), data type
                           is int32. If int, H and W are both equal to output_size.
    spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their
                           input scale to the scale used when pooling. Default: 1.0.

Shape:
    - x: 4-D Tensor with shape (N, C, H, W).
    - boxes: 2-D Tensor with shape (num_rois, 4).
    - boxes_num: 1-D Tensor.
    - output: 4-D tensor with shape (num_rois, output_channels, pooled_h, pooled_w).
          The output_channels equal to C / (pooled_h * pooled_w), where C is the channels of input.

Returns:
    None.

Examples:
    .. code-block:: pycon

        >>> import paddle

        >>> psroi_module = paddle.vision.ops.PSRoIPool(7, 1.0)
        >>> x = paddle.uniform([2, 490, 28, 28], dtype='float32')
        >>> boxes = paddle.to_tensor(
        ...     [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]],
        ...     dtype='float32',
        ... )
        >>> boxes_num = paddle.to_tensor([1, 2], dtype='int32')
        >>> pool_out = psroi_module(x, boxes, boxes_num)
        >>> print(pool_out.shape)
        paddle.Size([3, 10, 7, 7])
r    r  rW   r   c                :   > [         TU ]  5         Xl        X l        g rt   )r   r   r  r   r   r  r   r   s      ra   r   PSRoIPool.__init__)  s    &*rb   c                D    [        XX0R                  U R                  5      $ rt   )r/   r  r   r   r8   rm   r#  s       ra   r   PSRoIPool.forward.  s"    i!1!143E3E
 	
rb   )r  r   r5   r  r    r   rW   r   r   r8   r   rm   r   r#  r   r   r   r   r   s   @ra   r0   r0      s+    #J + +

 
rb   r0   c           	        [        US[        [        4S5        [        U[        5      (       a  X34nUu  pg[	        5       (       a"  Uc   S5       e[
        R                  " XX&Xt5      $ [        U SS/S5        [        USS/S5        [        S0 [        5       D6nUR                  5       n	UR                  U	5      n
UR                  SS9nU US	.nUb  X,S
'   UR                  SUXS.UUUS.S9  U
$ )aj  
This operator implements the roi_pooling layer.
Region of interest pooling (also known as RoI pooling) is to perform max pooling on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7).
The operator has three steps: 1. Dividing each region proposal into equal-sized sections with output_size(h, w) 2. Finding the largest value in each section 3. Copying these max values to the output buffer
For more information, please refer to https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn.

Args:
    x (Tensor): input feature, 4D-Tensor with the shape of [N,C,H,W],
        where N is the batch size, C is the input channel, H is Height, W is weight.
        The data type is float32 or float64.
    boxes (Tensor): boxes (Regions of Interest) to pool over.
        2D-Tensor with the shape of [num_boxes,4].
        Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates,
        and (x2, y2) is the bottom right coordinates.
    boxes_num (Tensor): the number of RoIs in each image, data type is int32.
    output_size (int or tuple[int, int]): the pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size.
    spatial_scale (float, optional): multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0.
    name(str, optional): for detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Default: None.

Returns:
    pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]].

Examples:
    .. code-block:: pycon

        >>> import paddle
        >>> from paddle.vision.ops import roi_pool

        >>> data = paddle.rand([1, 256, 32, 32])
        >>> boxes = paddle.rand([3, 4])
        >>> boxes[:, 2] += boxes[:, 0] + 3
        >>> boxes[:, 3] += boxes[:, 1] + 4
        >>> boxes_num = paddle.to_tensor([3]).astype('int32')
        >>> pool_out = roi_pool(data, boxes, boxes_num=boxes_num, output_size=3)
        >>> print(pool_out.shape)
        paddle.Size([3, 256, 3, 3])
r  r-   -boxes_num should not be None in dygraph mode.r8   r9   rm   r=   rC   r  r  )r   Argmax)r!  r"  r   rN   )r-   )r   rV   rU   ru   r   r   r-   r   r   rS   r   rY   rZ   )r8   rm   r#  r  r   r\   r!  r"  r^   rD   pool_outargmaxesrP   s                ra   r-   r-   4  s"   \ {MC<D+s##"0"-M$ 	
;	
$ i
 	
 	!C)jA )jI4684""$<<UC<<7<K 
   )9$9!. ,!.	 	 		
 rb   c                  H   ^  \ rS rSrSrSSU 4S jjjrS	S jrS
S jrSrU =r	$ )r.   i  aP  
This interface is used to construct a callable object of the `RoIPool` class. Please
refer to :ref:`api_paddle_vision_ops_roi_pool`.

Args:
    output_size (int or tuple[int, int]): the pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size.
    spatial_scale (float, optional): multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0.

Returns:
    pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]].

Examples:
    .. code-block:: pycon

        >>> import paddle
        >>> from paddle.vision.ops import RoIPool

        >>> data = paddle.rand([1, 256, 32, 32])
        >>> boxes = paddle.rand([3, 4])
        >>> boxes[:, 2] += boxes[:, 0] + 3
        >>> boxes[:, 3] += boxes[:, 1] + 4
        >>> boxes_num = paddle.to_tensor([3]).astype('int32')
        >>> roi_pool = RoIPool(output_size=(4, 3))
        >>> pool_out = roi_pool(data, boxes, boxes_num)
        >>> print(pool_out.shape)
        paddle.Size([3, 256, 4, 3])
c                :   > [         TU ]  5         Xl        X l        g rt   r   r   _output_size_spatial_scaler&  s      ra   r   RoIPool.__init__      '+rb   c                D    [        UUUU R                  U R                  S9$ )N)r8   rm   r#  r  r   )r-   r6  r7  r)  s       ra   r   RoIPool.forward  s*    ))--
 	
rb   c                >    SnUR                   " S0 U R                  D6$ )Nz:output_size={_output_size}, spatial_scale={_spatial_scale}r   )format__dict__)r   main_strs     ra   
extra_reprRoIPool.extra_repr  s    O///rb   r6  r7  r+  r,  r-  )r   str)
r   r   r   r   r   r   r   r@  r   r   r   s   @ra   r.   r.     s!    8, ,

0 0rb   r.   c                   [        US[        [        4S5        [        U[        5      (       a  X34nUu  p[	        5       (       a'  Uc   S5       e[
        R                  " U UUUU	UUU5      $ [        U SSS/S5        [        USSS/S5        [        S0 [        5       D6n
U
R                  5       nU
R                  U5      nU US.nUb  X-S	'   U
R                  SUS
U0UU	UUUS.S9  U$ )a  
Implementing the roi_align layer.
Region of Interest (RoI) Align operator (also known as RoI Align) is to
perform bilinear interpolation on inputs of nonuniform sizes to obtain
fixed-size feature maps (e.g. 7*7), as described in Mask R-CNN.

Dividing each region proposal into equal-sized sections with the pooled_width
and pooled_height. Location remains the origin result.

In each ROI bin, the value of the four regularly sampled locations are
computed directly through bilinear interpolation. The output is the mean of
four locations. Thus avoid the misaligned problem.

Args:
    x (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W],
        where N is the batch size, C is the input channel, H is Height,
        W is weight. The data type is float32 or float64.
    boxes (Tensor): Boxes (RoIs, Regions of Interest) to pool over. It
        should be a 2-D Tensor of shape (num_boxes, 4). The data type is
        float32 or float64. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
        the top left coordinates, and (x2, y2) is the bottom right coordinates.
    boxes_num (Tensor): The number of boxes contained in each picture in
        the batch, the data type is int32.
    output_size (int or Tuple[int, int]): The pooled output size(h, w), data
        type is int32. If int, h and w are both equal to output_size.
    spatial_scale (float, optional): Multiplicative spatial scale factor to translate
        ROI coords from their input scale to the scale used when pooling.
        Default: 1.0.
    sampling_ratio (int, optional): number of sampling points in the interpolation
        grid used to compute the output value of each pooled output bin.
        If > 0, then exactly ``sampling_ratio x sampling_ratio`` sampling
        points per bin are used.
        If <= 0, then an adaptive number of grid points are used (computed
        as ``ceil(roi_width / output_width)``, and likewise for height).
        Default: -1.
    aligned (bool, optional): If False, use the legacy implementation. If True, pixel
        shift the box coordinates it by -0.5 for a better alignment with the
        two neighboring pixel indices. This version is used in Detectron2.
        Default: True.
    name(str|None, optional): For detailed information, please refer to :
        ref:`api_guide_Name`. Usually name is no need to set and None by
        default. Default: None.

Returns:
    The output of ROIAlignOp is a 4-D tensor with shape (num_boxes,            channels, pooled_h, pooled_w). The data type is float32 or float64.

Examples:
    .. code-block:: pycon

        >>> import paddle
        >>> from paddle.vision.ops import roi_align

        >>> data = paddle.rand([1, 256, 32, 32])
        >>> boxes = paddle.rand([3, 4])
        >>> boxes[:, 2] += boxes[:, 0] + 3
        >>> boxes[:, 3] += boxes[:, 1] + 4
        >>> boxes_num = paddle.to_tensor([3]).astype('int32')
        >>> align_out = roi_align(data, boxes, boxes_num, output_size=3)
        >>> print(align_out.shape)
        paddle.Size([3, 256, 3, 3])
r  r1   r/  r8   r9   r:   rm   r  r  r   )r!  r"  r   sampling_ratioalignedrN   )r1   )r   rV   rU   ru   r   r   r1   r   r   rS   r   rY   rZ   )r8   rm   r#  r  r   rE  rF  r\   r!  r"  r^   rD   	align_outrP   s                 ra   r1   r1     s4   R {MC<E+s##"0"-M$ 	
;	
$ 	
 		
 	!C)Y)?M 7Y	2K	
 5FH5""$==eD	
   )9I&!. ,!."0"	 	 	
 rb   c                  V   ^  \ rS rSrSrSSU 4S jjjr S         S	S jjrSrU =r$ )
r2   i2  ao  
This interface is used to construct a callable object of the `RoIAlign` class.
Please refer to :ref:`api_paddle_vision_ops_roi_align`.

Args:
    output_size (int or tuple[int, int]): The pooled output size(h, w),
        data type is int32. If int, h and w are both equal to output_size.
    spatial_scale (float, optional): Multiplicative spatial scale factor
        to translate ROI coords from their input scale to the scale used
        when pooling. Default: 1.0.

Returns:
    The output of ROIAlign operator is a 4-D tensor with             shape (num_boxes, channels, pooled_h, pooled_w).

Examples:
    .. code-block:: pycon

        >>> import paddle
        >>> from paddle.vision.ops import RoIAlign

        >>> data = paddle.rand([1, 256, 32, 32])
        >>> boxes = paddle.rand([3, 4])
        >>> boxes[:, 2] += boxes[:, 0] + 3
        >>> boxes[:, 3] += boxes[:, 1] + 4
        >>> boxes_num = paddle.to_tensor([3]).astype('int32')
        >>> roi_align = RoIAlign(output_size=(4, 3))
        >>> align_out = roi_align(data, boxes, boxes_num)
        >>> print(align_out.shape)
        paddle.Size([3, 256, 4, 3])
c                :   > [         TU ]  5         Xl        X l        g rt   r5  r&  s      ra   r   RoIAlign.__init__S  r9  rb   c           	     F    [        UUUU R                  U R                  US9$ )N)r8   rm   r#  r  r   rF  )r1   r6  r7  )r   r8   rm   r#  rF  s        ra   r   RoIAlign.forwardX  s/     ))--
 	
rb   rB  r+  r,  )T)
r8   r   rm   r   r#  r   rF  rX   r   r   )	r   r   r   r   r   r   r   r   r   r   s   @ra   r2   r2   2  sL    @, , LP



 &

39

DH

	

 

rb   r2   c                  n   ^  \ rS rSrSrSSSS\\SS4                     SU 4S jjjrSrU =r	$ )	ConvNormActivationie  a  
Configurable block used for Convolution-Normalization-Activation blocks.
This code is based on the torchvision code with modifications.
You can also see at https://github.com/pytorch/vision/blob/main/torchvision/ops/misc.py#L68
Args:
    in_channels (int): Number of channels in the input image
    out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
    kernel_size: (int|list|tuple, optional): Size of the convolving kernel. Default: 3
    stride (int|list|tuple, optional): Stride of the convolution. Default: 1
    padding (int|str|tuple|list, optional): Padding added to all four sides of the input. Default: None,
        in which case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation``
    groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
    norm_layer (Callable[..., paddle.nn.Layer], optional): Norm layer that will be stacked on top of the convolution layer.
        If ``None`` this layer won't be used. Default: ``paddle.nn.BatchNorm2D``
    activation_layer (Callable[..., paddle.nn.Layer], optional): Activation function which will be stacked on top of the normalization
        layer (if not ``None``), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``paddle.nn.ReLU``
    dilation (int): Spacing between kernel elements. Default: 1
    bias (bool|None, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
   r|   Nc                   > Uc  US-
  S-  U	-  nU
c  US L n
[        UUUUUU	UU
S9/nUb  UR                  U" U5      5        Ub  UR                  U" 5       5        [        TU ]  " U6   g )Nr|   r   )r   r   r   )r   appendr   r   )r   r   r   r   r   r   r   
norm_layeractivation_layerr   r   layersr   s               ra   r   ConvNormActivation.__init__z  s     ?"Q1,x7G<%D!	
 !MM*\23'MM*,-&!rb   r   )r   rV   r   rV   r   r    r   r    r   z-_PaddingSizeMode | Size2 | Size4 | str | Noner   rV   rR  Callable[..., nn.Layer]rS  rV  r   rV   r   zbool | Noner   r   )
r   r   r   r   r   r   r   r   r   r   r   s   @ra   rN  rN  e  s    0 AE.948 !"!" !" 	!"
 !" ?!" !" ,!" 2!" !" !" 
!" !"rb   rN  c           	        S nUc  U" X5      $ SSK nUc   UR                  " USS9nU" X   U5      n	X   $ Ub  XRR                  S   ::  d   S5       eUc   S5       eUR                  " USS	9n
U H  nUR                  " UR
                  " X7R                  " U5      5      5      S   nUR                  S   nUR                  " X/5      nUS:X  a  Mb  US
:X  a  S
X'   Mn  X   nX,   nUR                  " USS9nUU   nUU" UU5         nUR                  " UU   SS	9nUR                  " U
UU   USS9n
M     UR                  " U
5      S   nUR                  S   nUR                  " UU/5      nUR                  " UU   SS9nUc  UU   $ [        5       (       a)  [        X]5      nUR                  " UU   U5      u  nnUU   $ UU   SU $ )u  
This operator implements non-maximum suppression. Non-maximum suppression (NMS)
is used to select one bounding box out of many overlapping bounding boxes in object detection.
Boxes with IoU > iou_threshold will be considered as overlapping boxes,
just one with highest score can be kept. Here IoU is Intersection Over Union,
which can be computed by:

..  math::

    IoU = \frac{intersection\_area(box1, box2)}{union\_area(box1, box2)}

    If scores are provided, input boxes will be sorted by their scores firstly.

If category_idxs and categories are provided, NMS will be performed with a batched style,
which means NMS will be applied to each category respectively and results of each category
will be concatenated and sorted by scores.

If K is provided, only the first k elements will be returned. Otherwise, all box indices sorted by scores will be returned.

Args:
    boxes(Tensor): The input boxes data to be computed, it's a 2D-Tensor with
        the shape of [num_boxes, 4]. The data type is float32 or float64.
        Given as [[x1, y1, x2, y2], …],  (x1, y1) is the top left coordinates,
        and (x2, y2) is the bottom right coordinates.
        Their relation should be ``0 <= x1 < x2 && 0 <= y1 < y2``.
    iou_threshold(float, optional): IoU threshold for determine overlapping boxes. Default value: 0.3.
    scores(Tensor|None, optional): Scores corresponding to boxes, it's a 1D-Tensor with
        shape of [num_boxes]. The data type is float32 or float64. Default: None.
    category_idxs(Tensor|None, optional): Category indices corresponding to boxes.
        it's a 1D-Tensor with shape of [num_boxes]. The data type is int64. Default: None.
    categories(list|tuple|None, optional): A list of unique id of all categories. The data type is int64. Default: None.
    top_k(int|None, optional): The top K boxes who has higher score and kept by NMS preds to
        consider. top_k should be smaller equal than num_boxes. Default: None.

Returns:
    Tensor: 1D-Tensor with the shape of [num_boxes]. Indices of boxes kept by NMS.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> paddle.seed(2023)

        >>> boxes = paddle.rand([4, 4]).astype('float32')
        >>> boxes[:, 2] = boxes[:, 0] + boxes[:, 2]
        >>> boxes[:, 3] = boxes[:, 1] + boxes[:, 3]
        >>> print(boxes)
        Tensor(shape=[4, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[0.86583614, 0.52014720, 1.12544549, 1.42540050],
         [0.42400089, 0.40641287, 1.39420986, 1.15078652],
         [0.51785129, 0.73292869, 1.49571705, 0.77608776],
         [0.42639419, 0.71958369, 0.63450879, 0.91689879]])

        >>> out = paddle.vision.ops.nms(boxes, 0.1)
        >>> print(out)
        Tensor(shape=[3], dtype=int64, place=Place(cpu), stop_gradient=True,
        [0, 2, 3])

        >>> scores = paddle.to_tensor([0.6, 0.7, 0.4, 0.233])
        >>> categories = [0, 1, 2, 3]
        >>> category_idxs = paddle.to_tensor([2, 0, 0, 3], dtype="int64")
        >>> out = paddle.vision.ops.nms(boxes,
        ...                             0.1,
        ...                             paddle.to_tensor(scores),
        ...                             paddle.to_tensor(category_idxs),
        ...                             categories,
        ...                             4)
        >>> print(out)
        Tensor(shape=[4], dtype=int64, place=Place(cpu), stop_gradient=True,
        [1, 0, 2, 3])
c                    [        5       (       a  [        R                  " X5      $ [        S0 [	        5       D6nUR                  S5      nUR                  SSU 0SU0SU0S9  U$ )Nr3   int64rk   KeepBoxesIdxsiou_thresholdrN   )r3   )r   r   r3   r   rS   rY   rZ   )rm   r[  r^   r   s       ra   _nmsnms.<locals>._nms  sp    !##::e33 !3&(3F;;GDC'(#.&6	   Jrb   Nr   T)
descendingz6top_k should be smaller equal than the number of boxeszaif category_idxs is given, categories which is a list of unique id of all categories is necessaryr=   rC   r|   )	overwrite)r   argsortr   
zeros_likewhereequal	to_tensorreshape	ones_likescatterr   mintopk)rm   r[  rn   category_idxs
categoriestop_kr\  r   sorted_global_indicessorted_keep_boxes_indicesr   category_idcur_category_boxes_idxsr   cur_category_boxescur_category_scorescur_category_sorted_indicescur_category_sorted_boxes cur_category_keep_boxes_sub_idxsupdateskeep_boxes_idxssorted_sub_indices_topk_sub_indicess                           ra   r3   r3     s^   ` ~E)) &v$ G$((-%
! %??Q' 	
D	
' ! k! V73D!"(,,LL(8(8(EF#

# (--a0"(..#W#
 A:aZ,-D)";$=&,nnD'
# %7'%
! ,G*M:,
( ""#$DE
 ~~#$DE	
= "H ll4(+O!!!$Enn_ug>OD }122E!$kk&*A5I/00-.v66rb   c                    g rt   r   rn   bbox_deltasre   r>   r   pre_nms_top_npost_nms_top_n
nms_threshmin_sizeetar  return_rois_numr\   s                ra   r*   r*   I       %(rb   c                    g rt   r   r|  s                ra   r*   r*   [       #&rb   c                    g rt   r   r|  s                ra   r*   r*   m  s     ,/rb   c                "   [        5       (       a6  U(       d   S5       eUUUUU	U
4n[        R                  " XX#U/UQ76 u  pnXU4$ [        5       (       aJ  U(       d   S5       e[        R                  " U UUUUUUUUU	U
5      u  pnSUl        SUl        SUl        XU4$ [        S0 [        5       D6n[        U SS/S5        [        USS/S5        [        USSS	/S5        [        US
S/S5        [        USS/S5        UR                  UR                  S9nUR                  U R                  S9nUUS.nU(       a  UR                  SS9nSUl        UUS'   UR                  SU UUUUS.UUUUU	U
S.US9  SUl        SUl        U(       d  SnXW4$ )a  
This operation proposes RoIs according to each box with their
probability to be a foreground object. And
the proposals of RPN output are  calculated by anchors, bbox_deltas and scores. Final proposals
could be used to train detection net.

For generating proposals, this operation performs following steps:

1. Transpose and resize scores and bbox_deltas in size of
   (H * W * A, 1) and (H * W * A, 4)
2. Calculate box locations as proposals candidates.
3. Clip boxes to image
4. Remove predicted boxes with small area.
5. Apply non-maximum suppression (NMS) to get final proposals as output.

Args:
    scores (Tensor): A 4-D Tensor with shape [N, A, H, W] represents
        the probability for each box to be an object.
        N is batch size, A is number of anchors, H and W are height and
        width of the feature map. The data type must be float32.
    bbox_deltas (Tensor): A 4-D Tensor with shape [N, 4*A, H, W]
        represents the difference between predicted box location and
        anchor location. The data type must be float32.
    img_size (Tensor): A 2-D Tensor with shape [N, 2] represents origin
        image shape information for N batch, including height and width of the input sizes.
        The data type can be float32 or float64.
    anchors (Tensor):   A 4-D Tensor represents the anchors with a layout
        of [H, W, A, 4]. H and W are height and width of the feature map,
        num_anchors is the box count of each position. Each anchor is
        in (xmin, ymin, xmax, ymax) format an unnormalized. The data type must be float32.
    variances (Tensor): A 4-D Tensor. The expanded variances of anchors with a layout of
        [H, W, num_priors, 4]. Each variance is in
        (xcenter, ycenter, w, h) format. The data type must be float32.
    pre_nms_top_n (float, optional): Number of total bboxes to be kept per
        image before NMS. `6000` by default.
    post_nms_top_n (float, optional): Number of total bboxes to be kept per
        image after NMS. `1000` by default.
    nms_thresh (float, optional): Threshold in NMS. The data type must be float32. `0.5` by default.
    min_size (float, optional): Remove predicted boxes with either height or
        width less than this value. `0.1` by default.
    eta(float, optional): Apply in adaptive NMS, only works if adaptive `threshold > 0.5`,
        `adaptive_threshold = adaptive_threshold * eta` in each iteration. 1.0 by default.
    pixel_offset (bool, optional): Whether there is pixel offset. If True, the offset of `img_size` will be 1. 'False' by default.
    return_rois_num (bool, optional): Whether to return `rpn_rois_num` . When setting True, it will return a 1D Tensor with shape [N, ] that includes Rois's
        num of each image in one batch. 'False' by default.
    name(str|None, optional): For detailed information, please refer
        to :ref:`api_guide_Name`. Usually name is no need to set and
        None by default.

Returns:
    - rpn_rois (Tensor): The generated RoIs. 2-D Tensor with shape ``[N, 4]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``.
    - rpn_roi_probs (Tensor): The scores of generated RoIs. 2-D Tensor with shape ``[N, 1]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``.
    - rpn_rois_num (Tensor): Rois's num of each image in one batch. 1-D Tensor with shape ``[B,]`` while ``B`` is the batch size. And its sum equals to RoIs number ``N`` .

Examples:
    .. code-block:: python

        >>> import paddle
        >>> paddle.seed(2023)

        >>> scores = paddle.rand((2,4,5,5), dtype=paddle.float32)
        >>> bbox_deltas = paddle.rand((2, 16, 5, 5), dtype=paddle.float32)
        >>> img_size = paddle.to_tensor([[224.0, 224.0], [224.0, 224.0]])
        >>> anchors = paddle.rand((2,5,4,4), dtype=paddle.float32)
        >>> variances = paddle.rand((2,5,10,4), dtype=paddle.float32)
        >>> rois, roi_probs, roi_nums = paddle.vision.ops.generate_proposals(scores, bbox_deltas,
        ...                 img_size, anchors, variances, return_rois_num=True)
        >>> # doctest: +SKIP('random sample')
        >>> print(rois, roi_probs, roi_nums)
        Tensor(shape=[2, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[0., 0., 0., 0.],
         [0., 0., 0., 0.]])
        Tensor(shape=[2, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[0.],
         [0.]])
        Tensor(shape=[2], dtype=int32, place=Place(cpu), stop_gradient=True,
        [1, 1])
z/return_rois_num should be True in dygraph mode.z=return_rois_num should be True in PaddlePaddle inner op mode.Tgenerate_proposals_v2rn   r9   r}  re   r:   r>   r   rC   )RpnRoisRpnRoiProbsr=   
RpnRoisNum)rl   
BboxDeltasImShapeAnchorsr   )pre_nms_topNpost_nms_topNr  r  r  r  r   N)r  )r   r   r*   r   r   r   rS   r   rY   rD   rZ   )rn   r}  re   r>   r   r~  r  r  r  r  r  r  r\   rR   rpn_roisrpn_roi_probsrpn_rois_numr^   rQ   s                      ra   r*   r*     sQ   |  	
=	
 
 170I0II1
@E1
- 44	 	
K	
 170I0I1
- "&&*#%)"44AA Hyk+B	
 	!5L	
 	!	"#		
 	!Y-D	
 	!{YK1H	
 <<## = 
 AA,, B 
  (
 !DD E L *.L&$0GL!( )#"& !.!/($ , # 	 	
& "&&*#L44rb   c                    g rt   r   bboxesrn   score_thresholdpost_threshold	nms_top_k
keep_top_kuse_gaussiangaussian_sigmabackground_label
normalizedreturn_indexr  r\   s                ra   r4   r4   E	  r  rb   c                    g rt   r   r  s                ra   r4   r4   W	  r  rb   c                    g rt   r   r  s                ra   r4   r4   i	  r  rb   c                    g rt   r   r  s                ra   r4   r4   {	  s     !$rb   c                    g rt   r   r  s                ra   r4   r4   	  s     36rb   c                   [        5       (       a8  [        R                  " U UUUUUUUUU	5
      u  pnU
(       d  SnU(       d  SnXU4$ [        U SSS/S5        [        USSS/S5        [	        US[
        S5        [	        US[
        S5        [	        US	[        S5        [	        US
[        S5        [	        U	S[        S5        [	        US[        S5        [	        US[
        S5        [	        US[        S5        [        S0 [        5       D6nUR                  U R                  S9nUR                  SS9nUUS.nU(       a  UR                  SS9nUUS'   UR                  SXS.UUUUUUUU	S.US9  SUl        U
(       d  SnU(       d  SnUWU4$ )a  

This operator does matrix non maximum suppression (NMS).
First selects a subset of candidate bounding boxes that have higher scores
than score_threshold (if provided), then the top k candidate is selected if
nms_top_k is larger than -1. Score of the remaining candidate are then
decayed according to the Matrix NMS scheme.
After NMS step, at most keep_top_k number of total bboxes are to be kept
per image if keep_top_k is larger than -1.

Args:
    bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the
                       predicted locations of M bounding bboxes,
                       N is the batch size. Each bounding box has four
                       coordinate values and the layout is
                       [xmin, ymin, xmax, ymax], when box size equals to 4.
                       The data type is float32 or float64.
    scores (Tensor): A 3-D Tensor with shape [N, C, M]
                       represents the predicted confidence predictions.
                       N is the batch size, C is the class number, M is
                       number of bounding boxes. For each category there
                       are total M scores which corresponding M bounding
                       boxes. Please note, M is equal to the 2nd dimension
                       of BBoxes. The data type is float32 or float64.
    score_threshold (float): Threshold to filter out bounding boxes with
                             low confidence score.
    post_threshold (float): Threshold to filter out bounding boxes with
                            low confidence score AFTER decaying.
    nms_top_k (int): Maximum number of detections to be kept according to
                     the confidences after the filtering detections based
                     on score_threshold.
    keep_top_k (int): Number of total bboxes to be kept per image after NMS
                      step. -1 means keeping all bboxes after NMS step.
    use_gaussian (bool, optional): Use Gaussian as the decay function. Default: False
    gaussian_sigma (float, optional): Sigma for Gaussian decay function. Default: 2.0
    background_label (int, optional): The index of background label, the background
                            label will be ignored. If set to -1, then all
                            categories will be considered. Default: 0
    normalized (bool, optional): Whether detections are normalized. Default: True
    return_index(bool, optional): Whether return selected index. Default: False
    return_rois_num(bool, optional): whether return rois_num. Default: True
    name(str|None, optional): Name of the matrix nms op. Default: None.
Returns:
    - A tuple with three Tensor, (Out, Index, RoisNum) if return_index is True,
      otherwise, a tuple with two Tensor (Out, RoisNum) is returned.
    - Out (Tensor), A 2-D Tensor with shape [No, 6] containing the
      detection results.
      Each row has 6 values, [label, confidence, xmin, ymin, xmax, ymax]
    - Index (Tensor), A 2-D Tensor with shape [No, 1] containing the
      selected indices, which are absolute values cross batches.
    - rois_num (Tensor), A 1-D Tensor with shape [N] containing
      the number of detected boxes in each image.

Examples:
    .. code-block:: python

        >>> import paddle
        >>> from paddle.vision.ops import matrix_nms

        >>> boxes = paddle.rand([4, 1, 4])
        >>> boxes[..., 2] = boxes[..., 0] + boxes[..., 2]
        >>> boxes[..., 3] = boxes[..., 1] + boxes[..., 3]
        >>> scores = paddle.rand([4, 80, 1])
        >>> out = matrix_nms(bboxes=boxes, scores=scores, background_label=0,
        ...                         score_threshold=0.5, post_threshold=0.1,
        ...                         nms_top_k=400, keep_top_k=200, normalized=False)
NBBoxesr9   r:   r4   rl   r  r  
nums_top_kr  r  r  r  r  rC   r=   )r   Indexr  )r  rl   )r  r  r  r  r  r  r  r  r   T)r4   )r   r   r4   r   r   rW   rV   rX   r   rS   rY   rD   rZ   r   )r  rn   r  r  r  r  r  r  r  r  r  r  r\   r   indexr  r^   outputrQ   s                      ra   r4   r4   	  s   d %00 
H EHe## Hy)4l	
 	!Hy)4l	
 	?$5ulK>#3ULI9lC>:|S,?:|T<@<|D>#3ULI#%7lK6VX6:::N999H 51@@w@OH!)GI$7$4#2"0&"0 ,((	  	 	
  $EHx&&rb   )NTNr5   )r8   r   r;   r   r<   r   r>   Sequence[int]r?   r  r@   rV   rA   rW   rI   rV   r[   r   rB   rX   r\   
str | NonerJ   rW   r   r   )TNr5   Frc   )r8   r   re   r   r>   r  r@   rV   rf   rW   rI   rV   rg   rX   r\   r  rJ   rW   rh   rX   ri   rW   r   r   )ry   r   r   r   r}   Sequence[float] | floatr   zSequence[float] | float | Noner~   r  r   r  r   rX   r   rX   r   r  r   rW   r   rX   r\   r  r   ztuple[Tensor, Tensor])encode_center_sizeTr   N)r%   r   r   zTensor | Sequence[float]r   r   r   z3Literal['encode_center_size', 'decode_center_size']r   rX   r   rV   r\   r  r   r   )Nr|   r   r|   r|   r|   NN)r8   r   r   r   r   r   r   r   r   r    r   r    r   r    r   rV   r   rV   r   r   r\   r  r   r   )...)r   r   r  rV   r  rV   r  rV   r  rV   r  rX   r  r   r\   r  r   z!tuple[list[Tensor], Tensor, None])r   r   r  rV   r  rV   r  rV   r  rV   r  rX   r  r   r\   r  r   z)tuple[list[Tensor], Tensor, list[Tensor]])FNNrt   )r  rC  r\   r  r   r   )	unchangedN)r8   r   r  z#Literal['unchanged', 'gray', 'rgb']r\   r  r   r   )r5   N)r8   r   rm   r   r#  r   r  r    r   rW   r\   r  r   r   )r5   TN)r8   r   rm   r   r#  r   r  r    r   rW   rE  rV   rF  rX   r\   r  r   r   )g333333?NNNN)rm   r   r[  rW   rn   r   rj  r   rk  zSequence[int] | Nonerl  z
int | Noner   r   )........)rn   r   r}  r   re   r   r>   r   r   r   r~  rW   r  rW   r  rW   r  rW   r  rW   r  rX   r  Literal[True]r\   r  r   tuple[Tensor, Tensor, Tensor])rn   r   r}  r   re   r   r>   r   r   r   r~  rW   r  rW   r  rW   r  rW   r  rW   r  rX   r  Literal[False]r\   r  r   tuple[Tensor, Tensor, None])rn   r   r}  r   re   r   r>   r   r   r   r~  rW   r  rW   r  rW   r  rW   r  rW   r  rX   r  rX   r\   r  r   z$tuple[Tensor, Tensor, Tensor | None])ip  i  rc   ro   r5   FFN)......N)r  r   rn   r   r  rW   r  rW   r  rV   r  rV   r  rX   r  rW   r  rV   r  rX   r  r  r  r  r\   r  r   ztuple[Tensor, None, Tensor])r  r   rn   r   r  rW   r  rW   r  rV   r  rV   r  rX   r  rW   r  rV   r  rX   r  r  r  r  r\   r  r   r  )r  r   rn   r   r  rW   r  rW   r  rV   r  rV   r  rX   r  rW   r  rV   r  rX   r  r  r  r  r\   r  r   r  )r  r   rn   r   r  rW   r  rW   r  rV   r  rV   r  rX   r  rW   r  rV   r  rX   r  r  r  r  r\   r  r   ztuple[Tensor, None, None])r  r   rn   r   r  rW   r  rW   r  rV   r  rV   r  rX   r  rW   r  rV   r  rX   r  rX   r  rX   r\   r  r   z+tuple[Tensor, Tensor | None, Tensor | None])Fr   r   TFTN)C
__future__r   typingr   r   r   r   numpyr   r   r   paddle.tensor.mathr	   paddle.utilsr
   baser   base.data_feederr   r   base.frameworkr   r   r   r   r   base.layer_helperr   	frameworkr   r   r   r   r   r   r   nn.initializerr   collections.abcr   r   paddle._typingr   r    r!   paddle.nn.functional.commonr"   __all__r#   r$   r%   r&   r'   r(   r)   r+   r,   r/   r0   r-   r.   r1   r2   rN  r3   r*   r4   r   rb   ra   <module>r     s+   # = =    - (  C  , / = = #(!::<> #!MMM M 	M
 M M M M M M M M Mn !^^^ ^ 	^
 ^ ^ ^ ^ ^ ^ ^ ^J 15.1U(<&)3Z(-OOO 'O .	O
 +O &O O O #O O "&O O Op 	ss+s s	s s s s st WWW W 	W
 W W W W W W W WtF5 FR 
 	,	,	, 	, 		,
 	, 	, 	, 	, '	, 
	, 
 	4	4	4 	4 		4
 	4 	4 	4 	4 /	4 
	4$ 	N;b(Z 1<11
-1 1 	1r RRR R 	R
 R R Rj1
 1
r RRR R 	R
 R R Rj-0e -0j vvv v 	v
 v v v v vr0
u 0
f6" 6"v  #''+h7h7h7 h7 !	h7
 %h7 h7 h7V 
 %(((( ( 	(
 ( ( ( ( ( 
( ( #( ( #( 
(" 
 &)&&& & 	&
 & & & & & 
& & $& & !& 
&" 
 /// / 	/
 / / / / / 
/ / / / */ 
/. 	C5L 
 "%&)&&& & 	&
 & & & & & &  & $& & !& 
&" 
 "%%(((( ( 	(
 ( ( ( ( ( (  ( #( ( #( 
(" 
 #&%(&&& & 	&
 & & & & & & !& #& & !& 
&" 
 #&&)$$$ $ 	$
 $ $ $ $ $ $ !$ $$ $ $ 
$" 
 666 6 	6
 6 6 6 6 6 6 6 6 6 16 
60 	P'rb   