#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
// Bounding box utils for generate_proposals_op
// Reference: facebookresearch/Detectron/detectron/utils/boxes.py
namespace caffe2 {
namespace utils {
// Default upper bound applied to the log-space width/height deltas (dw, dh)
// by the bbox_transform*() functions below: the weighted deltas are capped
// with cwiseMin() at this value before exp() is taken, which limits how much
// a box can grow in one transformation.
const float BBOX_XFORM_CLIP_DEFAULT = log(1000.0 / 16.0);
// Pi; used below to convert angle deltas from radians to degrees.
const float PI = 3.14159265358979323846;
// Forward transform that maps proposal boxes to ground-truth boxes using
// bounding-box regression deltas.
// boxes: pixel coordinates of the bounding boxes
//     size (M, 4), format [x1; y1; x2; y2], x2 >= x1, y2 >= y1
// deltas: bounding box translations and scales
//     size (M, 4), format [dx; dy; dw; dh]
//     dx, dy: scale-invariant translation of the center of the bounding box
//     dw, dh: log-space scaling of the width and height of the bounding box
// weights: weights [wx, wy, ww, wh] for the deltas; every delta is divided
//     by its weight. At least four entries are required.
// bbox_xform_clip: upper bound applied to the weighted dw and dh (in
//     log-space) before they are exponentiated
// legacy_plus_one: if true, box widths/heights are computed as
//     x2 - x1 + 1 / y2 - y1 + 1 and 1 is subtracted from the predicted x2
//     and y2; if false, no offset is applied
// return: pixel coordinates of the bounding boxes
//     size (M, 4), format [x1; y1; x2; y2].
//     If boxes has no rows, an empty (0, deltas.cols()) array is returned
//     before any shape check is performed.
// see "Rich feature hierarchies for accurate object detection and semantic
//     segmentation" Appendix C for more details
// reference: detectron/lib/utils/boxes.py bbox_transform()
template <class Derived1, class Derived2>
EArrXXt<typename Derived1::Scalar> bbox_transform_upright(
    const Eigen::ArrayBase<Derived1>& boxes,
    const Eigen::ArrayBase<Derived2>& deltas,
    const std::vector<typename Derived2::Scalar>& weights =
        std::vector<typename Derived2::Scalar>{1.0, 1.0, 1.0, 1.0},
    const float bbox_xform_clip = BBOX_XFORM_CLIP_DEFAULT,
    const bool legacy_plus_one = false) {
  using T = typename Derived1::Scalar;
  using EArrXX = EArrXXt<T>;
  using EArrX = EArrXt<T>;

  // Nothing to transform: hand back an empty array with matching columns.
  if (boxes.rows() == 0) {
    return EArrXX::Zero(0, deltas.cols());
  }

  CAFFE_ENFORCE_EQ(boxes.rows(), deltas.rows());
  CAFFE_ENFORCE_EQ(boxes.cols(), 4);
  CAFFE_ENFORCE_EQ(deltas.cols(), 4);
  // weights[0..3] are indexed below; reject shorter vectors instead of
  // reading out of bounds.
  CAFFE_ENFORCE(weights.size() >= 4);

  // widths/heights are materialized (EArrX) because the lazy expressions
  // below refer to them.
  EArrX widths = boxes.col(2) - boxes.col(0) + T(int(legacy_plus_one));
  EArrX heights = boxes.col(3) - boxes.col(1) + T(int(legacy_plus_one));
  auto ctr_x = boxes.col(0) + T(0.5) * widths;
  auto ctr_y = boxes.col(1) + T(0.5) * heights;

  auto dx = deltas.col(0).template cast<T>() / weights[0];
  auto dy = deltas.col(1).template cast<T>() / weights[1];
  // Cap the log-space scale deltas so exp() cannot blow the box up.
  auto dw =
      (deltas.col(2).template cast<T>() / weights[2]).cwiseMin(bbox_xform_clip);
  auto dh =
      (deltas.col(3).template cast<T>() / weights[3]).cwiseMin(bbox_xform_clip);

  EArrX pred_ctr_x = dx * widths + ctr_x;
  EArrX pred_ctr_y = dy * heights + ctr_y;
  EArrX pred_w = dw.exp() * widths;
  EArrX pred_h = dh.exp() * heights;

  EArrXX pred_boxes = EArrXX::Zero(deltas.rows(), deltas.cols());
  // x1
  pred_boxes.col(0) = pred_ctr_x - T(0.5) * pred_w;
  // y1
  pred_boxes.col(1) = pred_ctr_y - T(0.5) * pred_h;
  // x2
  pred_boxes.col(2) = pred_ctr_x + T(0.5) * pred_w - T(int(legacy_plus_one));
  // y2
  pred_boxes.col(3) = pred_ctr_y + T(0.5) * pred_h - T(int(legacy_plus_one));

  return pred_boxes;
}
// Like bbox_transform_upright, but works on rotated boxes.
// boxes: pixel coordinates of the bounding boxes
//     size (M, 5), format [ctr_x; ctr_y; width; height; angle (in degrees)]
// deltas: bounding box translations and scales
//     size (M, 5), format [dx; dy; dw; dh; da]
//     dx, dy: scale-invariant translation of the center of the bounding box
//     dw, dh: log-space scaling of the width and height of the bounding box
//     da: delta for angle in radians
// weights: weights [wx, wy, ww, wh] dividing dx, dy, dw, dh; da is not
//     weighted. At least four entries are required.
// bbox_xform_clip: upper bound applied to the weighted dw and dh (in
//     log-space) before they are exponentiated
// angle_bound_on: if true, predicted angles below angle_bound_lo are
//     increased by, and angles above angle_bound_hi are decreased by, one
//     period (angle_bound_hi - angle_bound_lo). The shift is applied once.
// angle_bound_lo, angle_bound_hi: angle bounds in degrees; their difference
//     must be a positive multiple of 180 when angle_bound_on is set
// return: pixel coordinates of the bounding boxes
//     size (M, 5), format [ctr_x; ctr_y; width; height; angle (in degrees)].
//     If boxes has no rows, an empty (0, deltas.cols()) array is returned
//     before any shape check is performed.
template <class Derived1, class Derived2>
EArrXXt<typename Derived1::Scalar> bbox_transform_rotated(
    const Eigen::ArrayBase<Derived1>& boxes,
    const Eigen::ArrayBase<Derived2>& deltas,
    const std::vector<typename Derived2::Scalar>& weights =
        std::vector<typename Derived2::Scalar>{1.0, 1.0, 1.0, 1.0},
    const float bbox_xform_clip = BBOX_XFORM_CLIP_DEFAULT,
    const bool angle_bound_on = true,
    const int angle_bound_lo = -90,
    const int angle_bound_hi = 90) {
  using T = typename Derived1::Scalar;
  using EArrXX = EArrXXt<T>;

  // Nothing to transform: hand back an empty array with matching columns.
  if (boxes.rows() == 0) {
    return EArrXX::Zero(0, deltas.cols());
  }

  CAFFE_ENFORCE_EQ(boxes.rows(), deltas.rows());
  CAFFE_ENFORCE_EQ(boxes.cols(), 5);
  CAFFE_ENFORCE_EQ(deltas.cols(), 5);
  // weights[0..3] are indexed below; reject shorter vectors instead of
  // reading out of bounds.
  CAFFE_ENFORCE(weights.size() >= 4);

  const auto& ctr_x = boxes.col(0);
  const auto& ctr_y = boxes.col(1);
  const auto& widths = boxes.col(2);
  const auto& heights = boxes.col(3);
  const auto& angles = boxes.col(4);

  auto dx = deltas.col(0).template cast<T>() / weights[0];
  auto dy = deltas.col(1).template cast<T>() / weights[1];
  // Cap the log-space scale deltas so exp() cannot blow the box up.
  auto dw =
      (deltas.col(2).template cast<T>() / weights[2]).cwiseMin(bbox_xform_clip);
  auto dh =
      (deltas.col(3).template cast<T>() / weights[3]).cwiseMin(bbox_xform_clip);
  // Convert back to degrees
  auto da = deltas.col(4).template cast<T>() * 180.0 / PI;

  EArrXX pred_boxes = EArrXX::Zero(deltas.rows(), deltas.cols());
  // new ctr_x
  pred_boxes.col(0) = dx * widths + ctr_x;
  // new ctr_y
  pred_boxes.col(1) = dy * heights + ctr_y;
  // new width
  pred_boxes.col(2) = dw.exp() * widths;
  // new height
  pred_boxes.col(3) = dh.exp() * heights;
  // new angle
  pred_boxes.col(4) = da + angles;

  if (angle_bound_on) {
    // Normalize angle to be within [angle_bound_lo, angle_bound_hi].
    // Deltas are guaranteed to be <= period / 2 while computing training
    // targets by bbox_transform_inv.
    const int period = angle_bound_hi - angle_bound_lo;
    CAFFE_ENFORCE(period > 0 && period % 180 == 0);
    // Column view into pred_boxes: writes below modify pred_boxes in place.
    // Named distinctly so it does not shadow the input `angles` above.
    auto pred_angles = pred_boxes.col(4);
    for (int i = 0; i < pred_angles.size(); ++i) {
      if (pred_angles[i] < angle_bound_lo) {
        pred_angles[i] += T(period);
      } else if (pred_angles[i] > angle_bound_hi) {
        pred_angles[i] -= T(period);
      }
    }
  }

  return pred_boxes;
}
// Applies bounding-box regression deltas to boxes, dispatching on the number
// of columns in `boxes`:
//   4 columns -> bbox_transform_upright (uses legacy_plus_one)
//   5 columns -> bbox_transform_rotated (uses the angle_bound_* arguments)
// Any other column count fails the enforce check.
// weights and bbox_xform_clip are forwarded unchanged to either
// implementation; see those functions for the meaning of each argument.
template <class Derived1, class Derived2>
EArrXXt<typename Derived1::Scalar> bbox_transform(
    const Eigen::ArrayBase<Derived1>& boxes,
    const Eigen::ArrayBase<Derived2>& deltas,
    const std::vector<typename Derived2::Scalar>& weights =
        std::vector<typename Derived2::Scalar>{1.0, 1.0, 1.0, 1.0},
    const float bbox_xform_clip = BBOX_XFORM_CLIP_DEFAULT,
    const bool legacy_plus_one = false,
    const bool angle_bound_on = true,
    const int angle_bound_lo = -90,
    const int angle_bound_hi = 90) {
  CAFFE_ENFORCE(boxes.cols() == 4 || boxes.cols() == 5);
  if (boxes.cols() == 4) {
    // Upright boxes
    return bbox_transform_upright(
        boxes, deltas, weights, bbox_xform_clip, legacy_plus_one);
  } else {
    // Rotated boxes with angle info
    return bbox_transform_rotated(
        boxes,
        deltas,
        weights,
        bbox_xform_clip,
        angle_bound_on,
        angle_bound_lo,
        angle_bound_hi);
  }
}
// Converts boxes from corner format to center format.
// boxes: size (M, 4), format [x1; y1; x2; y2]
// legacy_plus_one: if true, 1 is added to the computed width and height
// return: size (M, 4), format [x_ctr; y_ctr; w; h], where the center is the
//     midpoint of the two corners (independent of legacy_plus_one)
template <class Derived>
EArrXXt<typename Derived::Scalar> bbox_xyxy_to_ctrwh(
    const Eigen::ArrayBase<Derived>& boxes,
    bool legacy_plus_one = false) {
  CAFFE_ENFORCE_EQ(boxes.cols(), 4);

  const auto& x1 = boxes.col(0);
  const auto& y1 = boxes.col(1);
  const auto& x2 = boxes.col(2);
  const auto& y2 = boxes.col(3);

  EArrXXt<typename Derived::Scalar> ret(boxes.rows(), 4);
  ret.col(0) = (x1 + x2) / 2.0; // x_ctr
  ret.col(1) = (y1 + y2) / 2.0; // y_ctr
  ret.col(2) = x2 - x1 + int(legacy_plus_one); // w
  ret.col(3) = y2 - y1 + int(legacy_plus_one); // h
  return ret;
}
// Converts boxes from center format to corner format (the inverse of
// bbox_xyxy_to_ctrwh for the same legacy_plus_one setting).
// boxes: size (M, 4), format [x_ctr; y_ctr; w; h]
// legacy_plus_one: if true, 1 is subtracted from the width and height before
//     they are halved and applied around the center
// return: size (M, 4), format [x1; y1; x2; y2]
template <class Derived>
EArrXXt<typename Derived::Scalar> bbox_ctrwh_to_xyxy(
    const Eigen::ArrayBase<Derived>& boxes,
    const bool legacy_plus_one = false) {
  CAFFE_ENFORCE_EQ(boxes.cols(), 4);

  const auto& x_ctr = boxes.col(0);
  const auto& y_ctr = boxes.col(1);
  const auto& w = boxes.col(2);
  const auto& h = boxes.col(3);

  EArrXXt<typename Derived::Scalar> ret(boxes.rows(), 4);
  ret.col(0) = x_ctr - (w - int(legacy_plus_one)) / 2.0; // x1
  ret.col(1) = y_ctr - (h - int(legacy_plus_one)) / 2.0; // y1
  ret.col(2) = x_ctr + (w - int(legacy_plus_one)) / 2.0; // x2
  ret.col(3) = y_ctr + (h - int(legacy_plus_one)) / 2.0; // y2
  return ret;
}
// Clip boxes to image boundaries
// boxes: pixel coordinates of bounding box, size (M, 4),
//     format [x1; y1; x2; y2]
// height, width: image size in pixels
// legacy_plus_one: if true, the largest allowed coordinate is width - 1
//     (height - 1); if false it is width (height)
// return: size (M, 4) array with x coordinates clamped to
//     [0, width - legacy_plus_one] and y coordinates clamped to
//     [0, height - legacy_plus_one]
template <class Derived>
EArrXXt<typename Derived::Scalar> clip_boxes_upright(
    const Eigen::ArrayBase<Derived>& boxes,
    int height,
    int width,
    bool legacy_plus_one = false) {
  CAFFE_ENFORCE_EQ(boxes.cols(), 4);

  // Inclusive upper bounds for the x and y coordinates.
  const int max_x = width - int(legacy_plus_one);
  const int max_y = height - int(legacy_plus_one);

  EArrXXt<typename Derived::Scalar> ret(boxes.rows(), boxes.cols());
  // 0 <= x1 <= max_x
  ret.col(0) = boxes.col(0).cwiseMin(max_x).cwiseMax(0);
  // 0 <= y1 <= max_y
  ret.col(1) = boxes.col(1).cwiseMin(max_y).cwiseMax(0);
  // 0 <= x2 <= max_x
  ret.col(2) = boxes.col(2).cwiseMin(max_x).cwiseMax(0);
  // 0 <= y2 <= max_y
  ret.col(3) = boxes.col(3).cwiseMin(max_y).cwiseMax(0);
  return ret;
}
// Similar to clip_boxes_upright but handles rotated boxes with angle info.
// boxes: size (M, 5), format [ctr_x; ctr_y; width; height; angle (in degrees)]
// height, width: image size in pixels
// angle_thresh: boxes with |angle| <= angle_thresh are treated as upright
// legacy_plus_one: forwarded to the format conversions and to
//     clip_boxes_upright
// return: size (M, 5) copy of boxes in which the first four columns of the
//     almost-upright rows have been clipped; all other rows, and the angle
//     column, are unchanged
//
// Clipping is only performed for boxes that are almost upright
// (within a given `angle_thresh` tolerance) to maintain backward compatibility
// for non-rotated boxes.
//
// We don't clip rotated boxes due to a couple of reasons:
// (1) There are potentially multiple ways to clip a rotated box to make it
//     fit within the image.
// (2) It's tricky to make the entire rectangular box fit within the image and
//     still be able to not leave out pixels of interest.
// Therefore, we rely on other ops like RoIAlignRotated safely handling this.
template <class Derived>
EArrXXt<typename Derived::Scalar> clip_boxes_rotated(
    const Eigen::ArrayBase<Derived>& boxes,
    int height,
    int width,
    float angle_thresh = 1.0,
    bool legacy_plus_one = false) {
  CAFFE_ENFORCE_EQ(boxes.cols(), 5);

  const auto& angles = boxes.col(4);

  // Filter boxes that are upright (with a tolerance of angle_thresh)
  EArrXXt<typename Derived::Scalar> upright_boxes;
  const auto& indices = GetArrayIndices(angles.abs() <= angle_thresh);
  GetSubArrayRows(boxes, AsEArrXt(indices), &upright_boxes);

  // Convert to [x1, y1, x2, y2] format and clip them
  const auto& upright_boxes_xyxy =
      bbox_ctrwh_to_xyxy(upright_boxes.leftCols(4), legacy_plus_one);
  const auto& clipped_upright_boxes_xyxy =
      clip_boxes_upright(upright_boxes_xyxy, height, width, legacy_plus_one);

  // Convert back to [x_ctr, y_ctr, w, h, angle] and update upright boxes
  // (only the first four columns are overwritten; the angle is kept)
  upright_boxes.block(0, 0, upright_boxes.rows(), 4) =
      bbox_xyxy_to_ctrwh(clipped_upright_boxes_xyxy, legacy_plus_one);

  // Start from a copy of the input and write the clipped rows back at their
  // original row positions.
  EArrXXt<typename Derived::Scalar> ret(boxes.rows(), boxes.cols());
  ret = boxes;
  for (int i = 0; i < upright_boxes.rows(); ++i) {
    ret.row(indices[i]) = upright_boxes.row(i);
  }
  return ret;
}
// Clip boxes to image boundaries, dispatching on the number of columns in
// `boxes`:
//   4 columns -> clip_boxes_upright (angle_thresh is not used)
//   5 columns -> clip_boxes_rotated
// Any other column count fails the enforce check.
// height, width: image size in pixels
// angle_thresh, legacy_plus_one: forwarded to the selected implementation
template <class Derived>
EArrXXt<typename Derived::Scalar> clip_boxes(
    const Eigen::ArrayBase<Derived>& boxes,
    int height,
    int width,
    float angle_thresh = 1.0,
    bool legacy_plus_one = false) {
  CAFFE_ENFORCE(boxes.cols() == 4 || boxes.cols() == 5);
  if (boxes.cols() == 4) {
    // Upright boxes
    return clip_boxes_upright(boxes, height, width, legacy_plus_one);
  } else {
    // Rotated boxes with angle info
    return clip_boxes_rotated(
        boxes, height, width, angle_thresh, legacy_plus_one);
  }
}
// Only keep boxes with both sides >= min_size and center within the image.
// boxes: pixel coordinates of bounding box, size (M * 4)
// im_info: [height, width, img_scale]
// return: row indices for 'boxes'
template <class Derived>
std::vector<int> filter_boxes_upright(
const Eigen::ArrayBase<Derived>& boxes,
double min_size,
const Eigen::Array3f& im_info,
const bool legacy_plus_one = false) {
CAFFE_ENFORCE_EQ(boxes.cols(), 4);
// Scale min_size to match image scale
min_size *= im_info[2];
using T = typename Derived::Scalar;
using EArrX = EArrXt<T>;
EArrX ws = boxes.col(2) - boxes.col(0) + T(int(legacy_plus_one));
EArrX hs = boxes.col(3) - boxes.col(1) + T(int(legacy_plus_one));
EArrX x_ctr = boxes.col(0) + ws / T(2);
EArrX y_ctr = boxes.col(1) + hs / T(2);
EArrXb keep = (ws >= min_size) && (hs >= min_size) &&
(x_ctr < T(im_info[1])) && (y_ctr < T(im_info[0]));
Loading ...