#ifndef CAFFE2_VIDEO_VIDEO_DECODER_H_
#define CAFFE2_VIDEO_VIDEO_DECODER_H_
#include <caffe2/core/logging.h>
#include <stdio.h>
#include <memory>
#include <string>
#include <vector>
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
#include <libavutil/log.h>
#include <libavutil/motion_vector.h>
#include <libswresample/swresample.h>
#include <libswscale/swscale.h>
}
namespace caffe2 {
// Size in bytes of the work buffer that VideoIOContext allocates with
// av_malloc and hands to avio_alloc_context.
#define VIO_BUFFER_SZ 32768
// Cap on the number of frames to decode.
// NOTE(review): not referenced in this part of the file; presumably enforced
// by the decoder implementation -- confirm.
#define MAX_DECODING_FRAMES 10000
// Sentinel fps values that select one of three special sampling behaviors.
enum SpecialFps {
  // 0: fps sampling disabled, no frame is sampled at all
  SAMPLE_NO_FRAME = 0,
  // -1: unlimited fps sampling, i.e. sample at the native video fps
  SAMPLE_ALL_FRAMES = -1,
  // -2: fps sampling disabled, but the frame at a specific timestamp is
  // still fetched
  SAMPLE_TIMESTAMP_ONLY = -2,
};
// Selects how the output resolution is chosen when decoding the video.
// The original notes also list "3: for xray video service"; no enumerator
// with that value is declared in this enum.
enum VideoResType {
  // 0: resize to width x height and ignore the aspect ratio
  USE_WIDTH_HEIGHT = 0,
  // 1: resize to short_edge and keep the aspect ratio
  USE_SHORT_EDGE = 1,
  // 2: use the original resolution of the video; if that resolution is
  // smaller than crop_size x crop_size, resize to crop_size and keep the
  // aspect ratio
  ORIGINAL_RES = 2,
};
// Selects which clip(s) of the video get decoded.
// The original notes also list "3: for xray video service"; no enumerator
// with that value is declared in this enum.
enum DecodeType {
  // 0: do temporal jittering to sample a random clip from the video
  DO_TMP_JITTER = 0,
  // 1: uniformly sample multiple clips from the video
  DO_UNIFORM_SMP = 1,
  // 2: sample a clip from a given starting frame
  USE_START_FRM = 2,
};
// A sampling interval: frames are sampled at `fps` starting from `timestamp`.
// `fps` may be one of the SpecialFps values to request a special decoding
// behavior.
//
// The sampled fps does not always follow the target fps exactly, because a
// sampled frame has to snap to an actual frame timestamp, e.g.
//   video fps = 25, sample fps = 4 samples every 0.28s, not 0.25s
//   video fps = 25, sample fps = 5 samples every 0.24s, not 0.2s,
//   because of floating-point division accuracy (1 / 5.0 is not exactly 0.2)
struct SampleInterval {
  double timestamp;
  double fps;

  // Default: timestamp of -1 and sampling at the native fps.
  SampleInterval() : SampleInterval(-1, SpecialFps::SAMPLE_ALL_FRAMES) {}

  SampleInterval(double ts, double f) : timestamp(ts), fps(f) {}

  // Intervals are ordered by their starting timestamp only.
  bool operator<(const SampleInterval& itvl) const {
    return itvl.timestamp > timestamp;
  }
};
class Params {
public:
// return all key-frames regardless of specified fps
bool keyFrames_ = false;
// return audio data while decoding the video
bool getAudio_ = false;
// for sampling audio data
int outrate_ = 22000;
int outfmt_ = AV_SAMPLE_FMT_FLT;
int64_t outlayout_ = AV_CH_LAYOUT_MONO;
// Output image pixel format
AVPixelFormat pixelFormat_ = AVPixelFormat::AV_PIX_FMT_RGB24;
// Index of stream to decode.
// -1 will automatically decode the first video stream.
int streamIndex_ = -1;
// How many frames to output at most from the video
// -1 no limit
int maximumOutputFrames_ = -1;
// params for video resolution
int video_res_type_ = VideoResType::USE_WIDTH_HEIGHT;
int crop_size_ = -1;
int short_edge_ = -1;
// Output video size, -1 to preserve origianl dimension
int outputWidth_ = -1;
int outputHeight_ = -1;
// max output dimension, -1 to preserve original size
// the larger dimension of the video will be scaled to this size,
// and the second dimension will be scaled to preserve aspect ratio
int maxOutputDimension_ = -1;
// params for decoding behavior
int decode_type_ = DecodeType::DO_TMP_JITTER;
int num_of_required_frame_ = -1;
// intervals_ control variable sampling fps between different timestamps
// intervals_ must be ordered strictly ascending by timestamps
// the first interval must have a timestamp of zero
// fps must be either the 3 special fps defined in SpecialFps, or > 0
std::vector<SampleInterval> intervals_ = {{0, SpecialFps::SAMPLE_ALL_FRAMES}};
Params() {}
/**
* FPS of output frames
* setting here will reset intervals_ and force decoding at target FPS
* This can be used if user just want to decode at a steady fps
*/
Params& fps(float v) {
intervals_.clear();
intervals_.emplace_back(0, v);
return *this;
}
/**
* Sample output frames at a specified list of timestamps
* Timestamps must be in increasing order, and timestamps past the end of the
* video will be ignored
* Setting here will reset intervals_
*/
Params& setSampleTimestamps(const std::vector<double>& timestamps) {
intervals_.clear();
// insert an interval per desired frame.
for (auto& timestamp : timestamps) {
intervals_.emplace_back(timestamp, SpecialFps::SAMPLE_TIMESTAMP_ONLY);
}
return *this;
}
/**
* Pixel format of output buffer, default PIX_FMT_RGB24
*/
Params& pixelFormat(AVPixelFormat pixelFormat) {
pixelFormat_ = pixelFormat;
return *this;
}
/**
* Return all key-frames
*/
Params& keyFrames(bool keyFrames) {
keyFrames_ = keyFrames;
return *this;
}
/**
* Index of video stream to process, defaults to the first video stream
*/
Params& streamIndex(int index) {
streamIndex_ = index;
return *this;
}
/**
* Only output this many frames, default to no limit
*/
Params& maxOutputFrames(int count) {
maximumOutputFrames_ = count;
return *this;
}
/**
* Output frame width, default to video width
*/
Params& outputWidth(int width) {
outputWidth_ = width;
return *this;
}
/**
* Output frame height, default to video height
*/
Params& outputHeight(int height) {
outputHeight_ = height;
return *this;
}
/**
* Max dimension of either width or height, if any is bigger
* it will be scaled down to this and econd dimension
* will be scaled down to maintain aspect ratio.
*/
Params& maxOutputDimension(int size) {
maxOutputDimension_ = size;
return *this;
}
};
// Holds one decoded video frame: the pixel buffer plus its metadata.
class DecodedFrame {
 public:
  // Deleter that releases a buffer through av_free.
  struct avDeleter {
    void operator()(unsigned char* buffer) const {
      av_free(buffer);
    }
  };
  using AvDataPtr = std::unique_ptr<uint8_t, avDeleter>;

  // decoded data buffer, released with av_free
  AvDataPtr data_;

  // size of the buffer in bytes
  int size_{0};

  // frame dimensions
  int width_{0};
  int height_{0};

  // timestamp in seconds since the beginning of the video
  double timestamp_{0.0};

  // true if this is a key frame
  bool keyFrame_{false};

  // index of the frame in the video
  int index_{-1};

  // sequential number of the outputted frame
  int outputFrameIndex_{-1};
};
// Holds decoded audio data.
// NOTE(review): dataSize_ and outSampleSize_ are presumably the size of the
// audio data and the number of output samples -- confirm against the code
// that fills them in.
struct DecodedAudio {
  int dataSize_;
  int outSampleSize_;
  // float sample buffer owned by this object
  std::unique_ptr<float[]> audio_data_;

  // Takes ownership of `audio_data`; every argument is optional and defaults
  // to an empty result (0, 0, nullptr).
  explicit DecodedAudio(
      int dataSize = 0,
      int outSampleSize = 0,
      std::unique_ptr<float[]> audio_data = nullptr)
      : dataSize_{dataSize},
        outSampleSize_{outSampleSize},
        audio_data_{std::move(audio_data)} {}
};
class VideoIOContext {
public:
/**
 * File mode: opens `fname` for binary reading and creates an AVIO context
 * that reads through readFile and seeks through seekFile, using a work
 * buffer of VIO_BUFFER_SZ bytes allocated with av_malloc.
 *
 * If the file cannot be opened, an error is logged and no AVIO context is
 * created; ctx_ is then set to null.
 *
 * @param fname path of the video file to open
 */
explicit VideoIOContext(const std::string& fname)
    : workBuffersize_(VIO_BUFFER_SZ),
      workBuffer_(static_cast<uint8_t*>(av_malloc(workBuffersize_))),
      inputFile_(nullptr),
      inputBuffer_(nullptr),
      inputBufferSize_(0) {
  inputFile_ = fopen(fname.c_str(), "rb");
  if (inputFile_ == nullptr) {
    LOG(ERROR) << "Error opening video file " << fname;
    // The destructor unconditionally calls av_free(ctx_), so ctx_ must hold
    // a well-defined value on this early-exit path too; av_free(nullptr)
    // performs no operation.
    // NOTE(review): ctx_ is declared outside this view; if it has no default
    // initializer it was previously left uninitialized here.
    ctx_ = nullptr;
    return;
  }
  ctx_ = avio_alloc_context(
      static_cast<unsigned char*>(workBuffer_.get()),
      workBuffersize_,
      0, // buffer is not writable
      this, // opaque pointer handed back to the callbacks
      &VideoIOContext::readFile,
      nullptr, // no write function
      &VideoIOContext::seekFile);
}
explicit VideoIOContext(const char* buffer, int size)
: workBuffersize_(VIO_BUFFER_SZ),
workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
inputFile_(nullptr),
inputBuffer_(buffer),
inputBufferSize_(size) {
ctx_ = avio_alloc_context(
static_cast<unsigned char*>(workBuffer_.get()),
workBuffersize_,
0,
this,
&VideoIOContext::readMemory,
nullptr, // no write function
&VideoIOContext::seekMemory);
}
// Frees the AVIO context with av_free and closes the input file if one was
// opened.
~VideoIOContext() {
  av_free(ctx_);
  if (inputFile_ != nullptr) {
    fclose(inputFile_);
  }
}
// Reads up to buf_size bytes into buf from whichever source is configured.
// The memory buffer takes precedence over the file; returns -1 when neither
// source is set, otherwise whatever readMemory / readFile returns.
int read(unsigned char* buf, int buf_size) {
  if (inputBuffer_) {
    return readMemory(this, buf, buf_size);
  }
  if (inputFile_) {
    return readFile(this, buf, buf_size);
  }
  return -1;
}
// Seeks within whichever source is configured. The memory buffer takes
// precedence over the file; returns -1 when neither source is set, otherwise
// whatever seekMemory / seekFile returns.
int64_t seek(int64_t offset, int whence) {
  if (inputBuffer_) {
    return seekMemory(this, offset, whence);
  }
  if (inputFile_) {
    return seekFile(this, offset, whence);
  }
  return -1;
}
/**
 * AVIO read callback for file mode.
 *
 * @param opaque   the VideoIOContext that was registered with
 *                 avio_alloc_context
 * @param buf      destination buffer
 * @param buf_size maximum number of bytes to read
 * @return number of bytes read, AVERROR_EOF once the end of the file has
 *         been reached, or -1 on a read error
 */
static int readFile(void* opaque, unsigned char* buf, int buf_size) {
  VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
  if (feof(h->inputFile_)) {
    return AVERROR_EOF;
  }
  const size_t requested = static_cast<size_t>(buf_size);
  const size_t bytesRead = fread(buf, 1, requested, h->inputFile_);
  if (bytesRead < requested && ferror(h->inputFile_)) {
    return -1;
  }
  // The end of the file can be hit by this very read (e.g. the previous read
  // consumed exactly the remaining bytes, so the EOF flag was not set yet).
  // AVIO read callbacks must report that as AVERROR_EOF rather than 0.
  if (bytesRead == 0 && feof(h->inputFile_)) {
    return AVERROR_EOF;
  }
  return static_cast<int>(bytesRead);
}
static int64_t seekFile(void* opaque, int64_t offset, int whence) {
VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
switch (whence) {
case SEEK_CUR: // from current position
case SEEK_END: // from eof
case SEEK_SET: // from beginning of file
return fseek(h->inputFile_, static_cast<long>(offset), whence);
Loading ...