Repository URL to install this package:
|
Version:
5.0.6-1+cuda10.0 ▾
|
#include <cassert>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <sys/stat.h>
#include <cmath>
#include <time.h>
#include <cuda_runtime_api.h>
#include <memory>
#include <cstring>
#include <algorithm>
#include "NvCaffeParser.h"
#include "NvInferPlugin.h"
#include "common.h"
#include "factoryFasterRCNN.h"
static Logger gLogger;
using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace plugin;
// stuff we know about the network and the caffe input/output blobs
static const int INPUT_C = 3;
static const int INPUT_H = 375;
static const int INPUT_W = 500;
static const int IM_INFO_SIZE = 3;
static const int OUTPUT_CLS_SIZE = 21;
static const int OUTPUT_BBOX_SIZE = OUTPUT_CLS_SIZE * 4;
static int gUseDLACore{-1};
const std::string CLASSES[OUTPUT_CLS_SIZE]{"background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"};
const char* INPUT_BLOB_NAME0 = "data";
const char* INPUT_BLOB_NAME1 = "im_info";
const char* OUTPUT_BLOB_NAME0 = "bbox_pred";
const char* OUTPUT_BLOB_NAME1 = "cls_prob";
const char* OUTPUT_BLOB_NAME2 = "rois";
struct PPM
{
std::string magic, fileName;
int h, w, max;
uint8_t buffer[INPUT_C * INPUT_H * INPUT_W];
};
struct BBox
{
float x1, y1, x2, y2;
};
std::string locateFile(const std::string& input)
{
std::vector<std::string> dirs{"data/samples/faster-rcnn/", "data/faster-rcnn/"};
return locateFile(input, dirs);
}
// simple PPM (portable pixel map) reader
void readPPMFile(const std::string& filename, PPM& ppm)
{
ppm.fileName = filename;
std::ifstream infile(locateFile(filename), std::ifstream::binary);
infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max;
infile.seekg(1, infile.cur);
infile.read(reinterpret_cast<char*>(ppm.buffer), ppm.w * ppm.h * 3);
}
void writePPMFileWithBBox(const std::string& filename, PPM& ppm, const BBox& bbox)
{
std::ofstream outfile("./" + filename, std::ofstream::binary);
assert(!outfile.fail());
outfile << "P6"
<< "\n"
<< ppm.w << " " << ppm.h << "\n"
<< ppm.max << "\n";
auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); };
for (int x = int(bbox.x1); x < int(bbox.x2); ++x)
{
// bbox top border
ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3] = 255;
ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 1] = 0;
ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 2] = 0;
// bbox bottom border
ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3] = 255;
ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 1] = 0;
ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 2] = 0;
}
for (int y = int(bbox.y1); y < int(bbox.y2); ++y)
{
// bbox left border
ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3] = 255;
ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 1] = 0;
ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 2] = 0;
// bbox right border
ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3] = 255;
ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 1] = 0;
ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 2] = 0;
}
outfile.write(reinterpret_cast<char*>(ppm.buffer), ppm.w * ppm.h * 3);
}
void caffeToTRTModel(const std::string& deployFile, // name for caffe prototxt
const std::string& modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with)
nvcaffeparser1::IPluginFactoryV2* pluginFactory, // factory for plugin layers
IHostMemory** trtModelStream) // output stream for the TensorRT model
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger);
// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
parser->setPluginFactoryV2(pluginFactory);
std::cout << "Begin parsing model..." << std::endl;
const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile).c_str(),
locateFile(modelFile).c_str(),
*network,
DataType::kFLOAT);
std::cout << "End parsing model..." << std::endl;
// specify which tensors are outputs
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str()));
// Build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(10 << 20); // we need about 6MB of scratch space for the plugin layer for batch size 5
samplesCommon::enableDLA(builder, gUseDLACore);
std::cout << "Begin building engine..." << std::endl;
ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);
std::cout << "End building engine..." << std::endl;
// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();
// serialize the engine, then close everything down
(*trtModelStream) = engine->serialize();
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}
void doInference(IExecutionContext& context, float* inputData, float* inputImInfo, float* outputBboxPred, float* outputClsProb, float* outputRois, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly 2 inputs and 3 outputs.
assert(engine.getNbBindings() == 5);
void* buffers[5];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex0 = engine.getBindingIndex(INPUT_BLOB_NAME0),
inputIndex1 = engine.getBindingIndex(INPUT_BLOB_NAME1),
outputIndex0 = engine.getBindingIndex(OUTPUT_BLOB_NAME0),
outputIndex1 = engine.getBindingIndex(OUTPUT_BLOB_NAME1),
outputIndex2 = engine.getBindingIndex(OUTPUT_BLOB_NAME2);
// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex0], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float))); // data
CHECK(cudaMalloc(&buffers[inputIndex1], batchSize * IM_INFO_SIZE * sizeof(float))); // im_info
CHECK(cudaMalloc(&buffers[outputIndex0], batchSize * nmsMaxOut * OUTPUT_BBOX_SIZE * sizeof(float))); // bbox_pred
CHECK(cudaMalloc(&buffers[outputIndex1], batchSize * nmsMaxOut * OUTPUT_CLS_SIZE * sizeof(float))); // cls_prob
CHECK(cudaMalloc(&buffers[outputIndex2], batchSize * nmsMaxOut * 4 * sizeof(float))); // rois
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex0], inputData, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
CHECK(cudaMemcpyAsync(buffers[inputIndex1], inputImInfo, batchSize * IM_INFO_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(outputBboxPred, buffers[outputIndex0], batchSize * nmsMaxOut * OUTPUT_BBOX_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK(cudaMemcpyAsync(outputClsProb, buffers[outputIndex1], batchSize * nmsMaxOut * OUTPUT_CLS_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK(cudaMemcpyAsync(outputRois, buffers[outputIndex2], batchSize * nmsMaxOut * 4 * sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex0]));
CHECK(cudaFree(buffers[inputIndex1]));
CHECK(cudaFree(buffers[outputIndex0]));
CHECK(cudaFree(buffers[outputIndex1]));
CHECK(cudaFree(buffers[outputIndex2]));
}
void bboxTransformInvAndClip(float* rois, float* deltas, float* predBBoxes, float* imInfo,
const int N, const int nmsMaxOut, const int numCls)
{
float width, height, ctr_x, ctr_y;
float dx, dy, dw, dh, pred_ctr_x, pred_ctr_y, pred_w, pred_h;
float *deltas_offset, *predBBoxes_offset, *imInfo_offset;
for (int i = 0; i < N * nmsMaxOut; ++i)
{
width = rois[i * 4 + 2] - rois[i * 4] + 1;
height = rois[i * 4 + 3] - rois[i * 4 + 1] + 1;
ctr_x = rois[i * 4] + 0.5f * width;
ctr_y = rois[i * 4 + 1] + 0.5f * height;
deltas_offset = deltas + i * numCls * 4;
predBBoxes_offset = predBBoxes + i * numCls * 4;
imInfo_offset = imInfo + i / nmsMaxOut * 3;
for (int j = 0; j < numCls; ++j)
{
dx = deltas_offset[j * 4];
dy = deltas_offset[j * 4 + 1];
dw = deltas_offset[j * 4 + 2];
dh = deltas_offset[j * 4 + 3];
pred_ctr_x = dx * width + ctr_x;
pred_ctr_y = dy * height + ctr_y;
pred_w = exp(dw) * width;
pred_h = exp(dh) * height;
predBBoxes_offset[j * 4] = std::max(std::min(pred_ctr_x - 0.5f * pred_w, imInfo_offset[1] - 1.f), 0.f);
predBBoxes_offset[j * 4 + 1] = std::max(std::min(pred_ctr_y - 0.5f * pred_h, imInfo_offset[0] - 1.f), 0.f);
predBBoxes_offset[j * 4 + 2] = std::max(std::min(pred_ctr_x + 0.5f * pred_w, imInfo_offset[1] - 1.f), 0.f);
predBBoxes_offset[j * 4 + 3] = std::max(std::min(pred_ctr_y + 0.5f * pred_h, imInfo_offset[0] - 1.f), 0.f);
}
}
}
std::vector<int> nms(std::vector<std::pair<float, int>>& score_index, float* bbox, const int classNum, const int numClasses, const float nms_threshold)
{
auto overlap1D = [](float x1min, float x1max, float x2min, float x2max) -> float {
if (x1min > x2min)
{
std::swap(x1min, x2min);
std::swap(x1max, x2max);
}
return x1max < x2min ? 0 : std::min(x1max, x2max) - x2min;
};
auto computeIoU = [&overlap1D](float* bbox1, float* bbox2) -> float {
float overlapX = overlap1D(bbox1[0], bbox1[2], bbox2[0], bbox2[2]);
float overlapY = overlap1D(bbox1[1], bbox1[3], bbox2[1], bbox2[3]);
float area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]);
float area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]);
float overlap2D = overlapX * overlapY;
float u = area1 + area2 - overlap2D;
return u == 0 ? 0 : overlap2D / u;
};
std::vector<int> indices;
for (auto i : score_index)
{
const int idx = i.second;
bool keep = true;
for (unsigned k = 0; k < indices.size(); ++k)
{
if (keep)
{
const int kept_idx = indices[k];
float overlap = computeIoU(&bbox[(idx * numClasses + classNum) * 4],
&bbox[(kept_idx * numClasses + classNum) * 4]);
keep = overlap <= nms_threshold;
}
else
break;
}
if (keep)
indices.push_back(idx);
}
return indices;
}
int main(int argc, char** argv)
{
gUseDLACore = samplesCommon::parseDLA(argc, argv);
// create a TensorRT model from the caffe model and serialize it to a stream
FRCNNPluginFactory pluginFactorySerialize;
IHostMemory* trtModelStream{nullptr};
initLibNvInferPlugins(&gLogger, "");
// batch size
const int N = 2;
caffeToTRTModel("faster_rcnn_test_iplugin.prototxt",
"VGG16_faster_rcnn_final.caffemodel",
std::vector<std::string>{OUTPUT_BLOB_NAME0, OUTPUT_BLOB_NAME1, OUTPUT_BLOB_NAME2},
N, &pluginFactorySerialize, &trtModelStream);
assert(trtModelStream != nullptr);
pluginFactorySerialize.destroyPlugin();
// read a random sample image
srand(unsigned(time(nullptr)));
// available images
std::vector<std::string> imageList = {"000456.ppm", "000542.ppm", "001150.ppm", "001763.ppm", "004545.ppm"};
std::vector<PPM> ppms(N);
float imInfo[N * 3]; // input im_info
std::random_shuffle(imageList.begin(), imageList.end(), [](int i) { return rand() % i; });
assert(ppms.size() <= imageList.size());
for (int i = 0; i < N; ++i)
{
readPPMFile(imageList[i], ppms[i]);
imInfo[i * 3] = float(ppms[i].h); // number of rows
imInfo[i * 3 + 1] = float(ppms[i].w); // number of columns
imInfo[i * 3 + 2] = 1; // image scale
}
float* data = new float[N * INPUT_C * INPUT_H * INPUT_W];
// pixel mean used by the Faster R-CNN's author
float pixelMean[3]{102.9801f, 115.9465f, 122.7717f}; // also in BGR order
for (int i = 0, volImg = INPUT_C * INPUT_H * INPUT_W; i < N; ++i)
{
for (int c = 0; c < INPUT_C; ++c)
{
// the color image to input should be in BGR order
for (unsigned j = 0, volChl = INPUT_H * INPUT_W; j < volChl; ++j)
data[i * volImg + c * volChl + j] = float(ppms[i].buffer[j * INPUT_C + 2 - c]) - pixelMean[c];
}
}
// deserialize the engine
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
if (gUseDLACore >= 0)
{
runtime->setDLACore(gUseDLACore);
}
FRCNNPluginFactory pluginFactory;
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
assert(engine != nullptr);
trtModelStream->destroy();
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
// host memory for outputs
float* rois = new float[N * nmsMaxOut * 4];
float* bboxPreds = new float[N * nmsMaxOut * OUTPUT_BBOX_SIZE];
float* clsProbs = new float[N * nmsMaxOut * OUTPUT_CLS_SIZE];
// predicted bounding boxes
float* predBBoxes = new float[N * nmsMaxOut * OUTPUT_BBOX_SIZE];
// run inference
doInference(*context, data, imInfo, bboxPreds, clsProbs, rois, N);
// Destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
// unscale back to raw image space
for (int i = 0; i < N; ++i)
{
float* rois_offset = rois + i * nmsMaxOut * 4;
for (int j = 0; j < nmsMaxOut * 4 && imInfo[i * 3 + 2] != 1; ++j)
rois_offset[j] /= imInfo[i * 3 + 2];
}
bboxTransformInvAndClip(rois, bboxPreds, predBBoxes, imInfo, N, nmsMaxOut, OUTPUT_CLS_SIZE);
const float nms_threshold = 0.3f;
const float score_threshold = 0.8f;
for (int i = 0; i < N; ++i)
{
float* bbox = predBBoxes + i * nmsMaxOut * OUTPUT_BBOX_SIZE;
float* scores = clsProbs + i * nmsMaxOut * OUTPUT_CLS_SIZE;
for (int c = 1; c < OUTPUT_CLS_SIZE; ++c) // skip the background
{
std::vector<std::pair<float, int>> score_index;
for (int r = 0; r < nmsMaxOut; ++r)
{
if (scores[r * OUTPUT_CLS_SIZE + c] > score_threshold)
{
score_index.push_back(std::make_pair(scores[r * OUTPUT_CLS_SIZE + c], r));
std::stable_sort(score_index.begin(), score_index.end(),
[](const std::pair<float, int>& pair1,
const std::pair<float, int>& pair2) {
return pair1.first > pair2.first;
});
}
}
// apply NMS algorithm
std::vector<int> indices = nms(score_index, bbox, c, OUTPUT_CLS_SIZE, nms_threshold);
// Show results
for (unsigned k = 0; k < indices.size(); ++k)
{
int idx = indices[k];
std::string storeName = CLASSES[c] + "-" + std::to_string(scores[idx * OUTPUT_CLS_SIZE + c]) + ".ppm";
std::cout << "Detected " << CLASSES[c] << " in " << ppms[i].fileName << " with confidence " << scores[idx * OUTPUT_CLS_SIZE + c] * 100.0f << "% "
<< " (Result stored in " << storeName << ")." << std::endl;
BBox b{bbox[idx * OUTPUT_BBOX_SIZE + c * 4], bbox[idx * OUTPUT_BBOX_SIZE + c * 4 + 1], bbox[idx * OUTPUT_BBOX_SIZE + c * 4 + 2], bbox[idx * OUTPUT_BBOX_SIZE + c * 4 + 3]};
writePPMFileWithBBox(storeName, ppms[i], b);
}
}
}
delete[] data;
delete[] rois;
delete[] bboxPreds;
delete[] clsProbs;
delete[] predBBoxes;
return 0;
}