深入淺出FFMPEG

數字媒體處理的基本流程

認識FFMPEG

FFMPEG堪稱自由軟件中最完備的一套多媒體支持庫，它幾乎實現了所有當下常見的數據封裝格式、多媒體傳輸協議以及音視頻編解碼器。因此，對於從事多媒體技術開發的工程師來說，深入研究FFMPEG成爲一門必不可少的工作，可以這樣說，FFMPEG之於多媒體開發工程師的重要性正如kernel之於嵌入式系統工程師一般。

幾個小知識：

· FFMPEG項目是由法國人Fabrice Bellard發起的，此人也是著名的CPU模擬器項目QEMU的發起者，同時還是圓周率算法紀錄的保持者。

· FF是Fast Forward的意思，翻譯成中文是“快進”。

· FFMPEG的LOGO是一個“Z字掃描”示意圖，Z字掃描用於將圖像的二維頻域數據一維化，同時保證了一維化的數據具備良好的統計特性，從而提高其後要進行的一維熵編碼的效率。

關於恥辱柱（Hall of Shame）：FFMpeg大部分代碼遵循LGPL許可證，如果使用者對FFMpeg進行了修改，要求公佈修改的源代碼；有少部分代碼遵循GPL許可證，要求使用者同時公開使用FFMpeg的軟件的源代碼。實際上，除去部分大的系統軟件開發商（Microsoft、Apple等）以及某些著名的音視頻服務提供商（Divx、Real等）提供的自有播放器之外，絕大部分第三方開發的播放器都離不開FFMpeg的支持，像Linux桌面環境中的開源播放器VLC、MPlayer，Windows下的KMPlayer、暴風影音以及Android下幾乎全部第三方播放器都是基於FFMpeg的。也有許多看似具備自主技術的播放器，其實也都不聲不響地使用了FFMpeg，這種行爲被稱爲“盜竊”，參與“盜竊”的公司的名字則被刻在恥辱柱上，國產播放器暴風影音、QQ影音於2009年上榜。

一個簡單的測試程序

#include<stdio.h>

#include <string.h>

#include <stdlib.h>

#include <sys/time.h>

#include "libavutil/avstring.h"

#include "libavformat/avformat.h"

#include "libavdevice/avdevice.h"

#include "libavcodec/opt.h"

#include "libswscale/swscale.h"

#define DECODED_AUDIO_BUFFER_SIZE 192000

/**
 * Command-line options of the test program, filled in by parse_options().
 */
struct options
{
    int     streamId;        /* -s: index of the stream to process; -1 if not given */
    int     frames;          /* -f: number of packets to read; -1 means no limit */
    int     nodec;           /* -ndec: non-zero disables decoding */
    int     bplay;           /* -p: non-zero requests playback */
    int     thread_count;    /* -t: decoder thread count; 0 leaves the codec setting alone */
    int64_t lstart;          /* -k: timestamp handed to av_seek_frame(); -1 if not given */
    char    finput[256];     /* input file name (argv[1]) */
    char    foutput1[256];   /* -o: "<name>.mpg", receives the packets as read */
    char    foutput2[256];   /* -o: "<name>.raw", receives the decoded data */
};

int parse_options(struct options *opts, intargc, char** argv)

{

intoptidx;

char*optstr;

if(argc < 2) return -1;

opts->streamId = -1;

opts->lstart = -1;

opts->frames = -1;

opts->foutput1[0] = 0;

opts->foutput2[0] = 0;

opts->nodec = 0;

opts->bplay = 0;

opts->thread_count = 0;

strcpy(opts->finput, argv[1]);

optidx = 2;

while(optidx < argc)

{

optstr = argv[optidx++];

if (*optstr++ != '-') return -1;

switch (*optstr++)

{

case 's': //< stream id

opts->streamId = atoi(optstr);

break;

case 'f': //< frames

opts->frames = atoi(optstr);

break;

case 'k': //< skipped

opts->lstart = atoll(optstr);

break;

case 'o': //< output

strcpy(opts->foutput1, optstr);

strcat(opts->foutput1, ".mpg");

strcpy(opts->foutput2, optstr);

strcat(opts->foutput2, ".raw");

break;

case 'n': //decoding and output options

if (strcmp("dec", optstr) == 0)

opts->nodec = 1;

break;

case 'p':

opts->bplay = 1;

break;

case 't':

opts->thread_count = atoi(optstr);

break;

default:

return -1;

}

return0;

}

/**
 * Write the usage summary to stdout.
 *
 * @param program  name to show as the command (main() passes argv[0])
 */
void show_help(char* program)
{
    fputs("Simple FFMPEG test program\n", stdout);
    fprintf(stdout,
            "Usage: %s inputfile [-sstreamid [-fframes][-kskipped] [-ooutput_filename(without extension)] [-p] [-tthread_count]]\n",
            program);
}

/**
 * Log callback installed through av_log_set_callback(): formats every message
 * to stdout, regardless of its level.
 *
 * @param ptr    unused
 * @param level  unused
 * @param fmt    printf-style format string
 * @param vl     arguments for fmt
 */
static void log_callback(void* ptr, int level, const char* fmt, va_list vl)
{
    (void)ptr;
    (void)level;
    vfprintf(stdout, fmt, vl);
}

#include <sys/ioctl.h>

#include <unistd.h>

#include <fcntl.h>

#include <sys/soundcard.h>

#define OSS_DEVICE "/dev/dsp0"

/**
 * State of the OSS audio output device.
 */
struct audio_dsp
{
    int audio_fd;   /* descriptor of the opened device; -1 is treated as "not open" */
    int channels;   /* value passed to SNDCTL_DSP_CHANNELS */
    int format;     /* value passed to SNDCTL_DSP_SETFMT (an AFMT_* constant) */
    int speed;      /* value passed to SNDCTL_DSP_SPEED */
};

/**
 * Translate a libavcodec sample format into the matching OSS AFMT_* value.
 *
 * @param format  sample format reported by the decoder
 * @return AFMT_U8 for SAMPLE_FMT_U8, AFMT_S16_LE for SAMPLE_FMT_S16;
 *         every other format also maps to AFMT_U8
 */
int map_formats(enum SampleFormat format)
{
    switch (format)
    {
    case SAMPLE_FMT_U8:
        return AFMT_U8;

    case SAMPLE_FMT_S16:
        return AFMT_S16_LE;

    default:
        /* NOTE(review): unsupported formats are silently played as U8 — confirm this is intended. */
        return AFMT_U8;
    }
}

/**
 * Apply the format, channel count and sample rate stored in *dsp to the
 * audio device, in that order.
 *
 * The ioctls receive pointers to the fields, so the driver may overwrite them.
 *
 * @param dsp  device state; audio_fd must refer to an opened device
 * @return 0 on success, -1 if the device is not open or any ioctl fails
 */
int set_audio(struct audio_dsp* dsp)
{
    if (dsp->audio_fd == -1)
    {
        printf("Invalid audio dsp id!\n");
        return -1;
    }

    if (-1 == ioctl(dsp->audio_fd, SNDCTL_DSP_SETFMT, &dsp->format))
    {
        printf("Failed to set dsp format!\n");
        return -1;
    }

    if (-1 == ioctl(dsp->audio_fd, SNDCTL_DSP_CHANNELS, &dsp->channels))
    {
        /* Message used to be a copy of the "format" one, which hid which step failed. */
        printf("Failed to set dsp channels!\n");
        return -1;
    }

    if (-1 == ioctl(dsp->audio_fd, SNDCTL_DSP_SPEED, &dsp->speed))
    {
        printf("Failed to set dsp speed!\n");
        return -1;
    }

    return 0;
}

int play_pcm(struct audio_dsp* dsp, unsignedchar *buf, int size)

{

if(dsp->audio_fd == -1)

{

printf("Invalid audio dsp id!\n");

return -1;

}

if(-1 == write(dsp->audio_fd, buf, size))

{

printf("Failed to write audio dsp!\n");

return -1;

}

return0;

}

#include <linux/fb.h>

#include <sys/mman.h>

#define FB_DEVICE "/dev/fb0"

/* Picture format selector; it is the type of show_picture()'s `format` parameter. */
enum pic_format
{
    eYUV_420_Planer = 0,
};

struct video_fb

{

intvideo_fd;

structfb_var_screeninfo vinfo;

structfb_fix_screeninfo finfo;

unsignedchar *fbp;

AVFrame *frameRGB;

struct

{

int x;

int y;

}video_pos;

};

int open_video(struct video_fb *fb, int x, inty)

{

intscreensize;

fb->video_fd = open(FB_DEVICE, O_WRONLY);

if(fb->video_fd == -1) return -1;

if(ioctl(fb->video_fd, FBIOGET_FSCREENINFO, &fb->finfo)) return -2;

if(ioctl(fb->video_fd, FBIOGET_VSCREENINFO, &fb->vinfo)) return -2;

printf("video device: resolution %dx%d, �pp\n",fb->vinfo.xres, fb->vinfo.yres, fb->vinfo.bits_per_pixel);

screensize = fb->vinfo.xres * fb->vinfo.yres * fb->vinfo.bits_per_pixel/ 8;

fb->fbp = (unsigned char *) mmap(0, screensize, PROT_READ|PROT_WRITE,MAP_SHARED, fb->video_fd, 0);

if(fb->fbp == -1) return -3;

if(x >= fb->vinfo.xres || y >= fb->vinfo.yres)

{

return -4;

}

else

{

fb->video_pos.x = x;

fb->video_pos.y = y;

}

fb->frameRGB = avcodec_alloc_frame();

if(!fb->frameRGB) return -5;

return0;

}

#if 0

/**
 * Convert a decoded picture to RGB32 and copy it into the frame buffer at
 * the position stored in fb->video_pos, clipping it to the screen.
 *
 * This function sits inside an `#if 0` region and is not compiled.
 *
 * NOTE(review): nothing visible in this file attaches pixel buffers to
 * fb->frameRGB, so the conversion target must be prepared elsewhere before
 * this can work — confirm before enabling.
 * NOTE(review): `format` is declared as enum pic_format but is compared with
 * and passed on as a PIX_FMT_* value — confirm the intended type.
 *
 * @param fb      opened frame-buffer state
 * @param frame   decoded source picture
 * @param width   picture width in pixels
 * @param height  picture height in pixels
 * @param format  pixel format of `frame`; only PIX_FMT_YUV420P is drawn
 * @return 0 on success (also when the format is not handled); -1 device not
 *         open; -2 position outside the screen; -3 conversion failed
 */
int show_picture(struct video_fb *fb, AVFrame *frame, int width, int height, enum pic_format format)
{
    struct SwsContext *sws;
    int i;
    unsigned char *dest;
    unsigned char *src;

    if (fb->video_fd == -1) return -1;

    if ((fb->video_pos.x >= fb->vinfo.xres) || (fb->video_pos.y >= fb->vinfo.yres)) return -2;

    /* Clip the picture to the part that fits on screen. */
    if (fb->video_pos.x + width > fb->vinfo.xres)
    {
        width = fb->vinfo.xres - fb->video_pos.x;
    }

    if (fb->video_pos.y + height > fb->vinfo.yres)
    {
        height = fb->vinfo.yres - fb->video_pos.y;
    }

    if (format == PIX_FMT_YUV420P)
    {
        sws = sws_getContext(width, height, format, width, height, PIX_FMT_RGB32, SWS_FAST_BILINEAR, NULL, NULL, NULL);
        if (sws == 0)
        {
            return -3;
        }

        /* sws_scale() returns the height of the output slice, so a result <= 0 is the failure case. */
        if (sws_scale(sws, frame->data, frame->linesize, 0, height, fb->frameRGB->data, fb->frameRGB->linesize) <= 0)
        {
            sws_freeContext(sws);
            return -3;
        }
        sws_freeContext(sws);

        src  = fb->frameRGB->data[0];
        dest = fb->fbp + (fb->video_pos.x + fb->vinfo.xoffset) * (fb->vinfo.bits_per_pixel / 8)
                       + (fb->video_pos.y + fb->vinfo.yoffset) * fb->finfo.line_length;

        /* Row-by-row copy; the 4 bytes per pixel match the RGB32 conversion target. */
        for (i = 0; i < height; i++)
        {
            memcpy(dest, src, width * 4);
            src  += fb->frameRGB->linesize[0];
            dest += fb->finfo.line_length;
        }
    }

    return 0;
}

#endif

/**
 * Unmap the frame buffer and close the device, if it is open.
 *
 * The mapping length is recomputed with the same formula open_video() used.
 * fb->frameRGB is not released here.
 *
 * @param fb  state previously passed to open_video()
 */
void close_video(struct video_fb *fb)
{
    if (fb->video_fd != -1)
    {
        munmap(fb->fbp, fb->vinfo.xres * fb->vinfo.yres * fb->vinfo.bits_per_pixel / 8);
        close(fb->video_fd);
        fb->video_fd = -1;
    }
}

int main(int argc, char **argv)

{

AVFormatContext* pCtx = 0;

AVCodecContext *pCodecCtx = 0;

AVCodec *pCodec = 0;

AVPacket packet;

AVFrame *pFrame = 0;

FILE *fpo1 = NULL;

FILE *fpo2 = NULL;

int nframe;

interr;

intgot_picture;

intpicwidth, picheight, linesize;

unsignedchar *pBuf;

inti;

int64_t timestamp;

structoptions opt;

intusefo = 0;

structaudio_dsp dsp;

intdusecs;

floatusecs1 = 0;

float usecs2 = 0;

structtimeval elapsed1, elapsed2;

intdecoded = 0;

av_register_all();

av_log_set_callback(log_callback);

av_log_set_level(50);

if(parse_options(&opt, argc, argv) < 0 || (strlen(opt.finput) == 0))

{

show_help(argv[0]);

return 0;

}

err = avformat_open_input(&pCtx, opt.finput, 0, 0);

if(err < 0)

{

printf("\n->(avformat_open_input)\tERROR:\t%d\n",err);

goto fail;

}

err = avformat_find_stream_info(pCtx, 0);

if(err < 0)

{

printf("\n->(avformat_find_stream_info)\tERROR:\t%d\n",err);

goto fail;

}

if(opt.streamId < 0)

{

av_dump_format(pCtx, 0, pCtx->filename, 0);

goto fail;

}

else

{

printf("\n extra data in Stream %d (�):",opt.streamId, pCtx->streams[opt.streamId]->codec->extradata_size);

for (i = 0; i < pCtx->streams[opt.streamId]->codec->extradata_size;i++)

{

if (i%16 == 0) printf("\n");

printf("%2x ", pCtx->streams[opt.streamId]->codec->extradata[i]);

}

if(strlen(opt.foutput1) && strlen(opt.foutput2))

{

fpo1 = fopen(opt.foutput1, "wb");

fpo2 = fopen(opt.foutput2, "wb");

if (!fpo1 || !fpo2)

{

printf("\n->error opening output files\n");

goto fail;

}

usefo = 1;

}

else

{

usefo = 0;

}

if(opt.streamId >= pCtx->nb_streams)

{

printf("\n->StreamId\tERROR\n");

goto fail;

}

if(opt.lstart > 0)

{

err = av_seek_frame(pCtx, opt.streamId, opt.lstart, AVSEEK_FLAG_ANY);

if (err < 0)

{

printf("\n->(av_seek_frame)\tERROR:\t%d\n",err);

goto fail;

}

if(!opt.nodec)

{

pCodecCtx = pCtx->streams[opt.streamId]->codec;

if (opt.thread_count <= 16 && opt.thread_count > 0 )

{

pCodecCtx->thread_count = opt.thread_count;

pCodecCtx->thread_type = FF_THREAD_FRAME;

}

pCodec = avcodec_find_decoder(pCodecCtx->codec_id);

if (!pCodec)

{

printf("\n->can not find codec!\n");

goto fail;

}

err = avcodec_open2(pCodecCtx, pCodec, 0);

if (err < 0)

{

printf("\n->(avcodec_open)\tERROR:\t%d\n",err);

goto fail;

}

pFrame = avcodec_alloc_frame();

if (opt.bplay)

{

dsp.audio_fd = open(OSS_DEVICE, O_WRONLY);

if (dsp.audio_fd == -1)

{

printf("\n-> can not open audio device\n");

goto fail;

}

dsp.channels = pCodecCtx->channels;

dsp.speed = pCodecCtx->sample_rate;

dsp.format = map_formats(pCodecCtx->sample_fmt);

if (set_audio(&dsp) < 0)

{

printf("\n-> can not set audio device\n");

goto fail;

}

nframe = 0;

while(nframe< opt.frames || opt.frames == -1)

{

gettimeofday(&elapsed1, NULL);

err = av_read_frame(pCtx, &packet);

if (err < 0)

{

printf("\n->(av_read_frame)\tERROR:\t%d\n",err);

break;

}

gettimeofday(&elapsed2, NULL);

dusecs = (elapsed2.tv_sec - elapsed1.tv_sec)*1000000 + (elapsed2.tv_usec- elapsed1.tv_usec);

usecs2 += dusecs;

timestamp = av_rescale_q(packet.dts, pCtx->streams[packet.stream_index]->time_base,(AVRational){1, AV_TIME_BASE});

printf("\nFrame No ] stream#%d\tsize mB,timestamp:%6lld, dts:%6lld, pts:%6lld, ", nframe++, packet.stream_index, packet.size,

timestamp, packet.dts, packet.pts);

if (packet.stream_index == opt.streamId)

{

#if 0

for (i = 0; i < 16; i++)

{

if (i%16 == 0) printf("\n pktdata: ");

printf("%2x ", packet.data[i]);

}

printf("\n");

#endif

if (usefo)

{

fwrite(packet.data, packet.size,1, fpo1);

fflush(fpo1);

}

if (pCtx->streams[opt.streamId]->codec->codec_type ==AVMEDIA_TYPE_VIDEO && !opt.nodec)

{

picheight = pCtx->streams[opt.streamId]->codec->height;

picwidth = pCtx->streams[opt.streamId]->codec->width;

gettimeofday(&elapsed1,NULL);

avcodec_decode_video2(pCodecCtx,pFrame, &got_picture, &packet);

decoded++;

gettimeofday(&elapsed2,NULL);

dusecs = (elapsed2.tv_sec -elapsed1.tv_sec)*1000000 + (elapsed2.tv_usec - elapsed1.tv_usec);

usecs1 += dusecs;

if (got_picture)

{

printf("[Video: type %d, ref %d, pts %lld, pkt_pts%lld, pkt_dts %lld]",

pFrame->pict_type,pFrame->reference, pFrame->pts, pFrame->pkt_pts, pFrame->pkt_dts);

if (pCtx->streams[opt.streamId]->codec->pix_fmt== PIX_FMT_YUV420P)

{

if (usefo)

{

linesize = pFrame->linesize[0];

pBuf = pFrame->data[0];

for (i = 0; i <picheight; i++)

{

fwrite(pBuf,picwidth, 1, fpo2);

pBuf += linesize;

}

linesize = pFrame->linesize[1];

pBuf = pFrame->data[1];

for (i = 0; i <picheight/2; i++)

{

fwrite(pBuf,picwidth/2, 1, fpo2);

pBuf +=linesize;

}

linesize = pFrame->linesize[2];

pBuf = pFrame->data[2];

for (i = 0; i <picheight/2; i++)

{

fwrite(pBuf,picwidth/2, 1, fpo2);

pBuf +=linesize;

}

fflush(fpo2);

}

if (opt.bplay)

{

}

av_free_packet(&packet);

}

else if (pCtx->streams[opt.streamId]->codec->codec_type ==AVMEDIA_TYPE_AUDIO && !opt.nodec)

{

int got;

gettimeofday(&elapsed1,NULL);

avcodec_decode_audio4(pCodecCtx,pFrame, &got, &packet);

decoded++;

gettimeofday(&elapsed2,NULL);

dusecs = (elapsed2.tv_sec -elapsed1.tv_sec)*1000000 + (elapsed2.tv_usec - elapsed1.tv_usec);

usecs1 += dusecs;

if (got)

{

printf("[Audio: ]B raw data, decoding time: %d]",pFrame->linesize[0], dusecs);

if (usefo)

{

fwrite(pFrame->data[0], pFrame->linesize[0], 1, fpo2);

fflush(fpo2);

}

if (opt.bplay)

{

play_pcm(&dsp,pFrame->data[0], pFrame->linesize[0]);

}

if(!opt.nodec && pCodecCtx)

{

avcodec_close(pCodecCtx);

}

printf("\n%d frames parsed, average %.2f us perframe\n", nframe, usecs2/nframe);

printf("%d frames decoded, average %.2f us perframe\n", decoded, usecs1/decoded);

fail:

if(pCtx)

{

avformat_close_input(&pCtx);

}

if(fpo1)

{

fclose(fpo1);

}

if(fpo2)

{

fclose(fpo2);

}

if(!pFrame)

{

av_free(pFrame);

}

if(!usefo && (dsp.audio_fd != -1))

{

close(dsp.audio_fd);

}

return0;

}

這一小段代碼可以實現的功能包括：

· 打開一個多媒體文件並獲取基本的媒體信息。

· 獲取編碼器句柄。

· 根據給定的時間標籤進行一個跳轉。

· 讀取數據幀。

· 解碼音頻幀或者視頻幀。

· 關閉多媒體文件。

這些功能足以支持一個功能強大的多媒體播放器，因爲最複雜的解複用、解碼、數據分析過程已經在FFMpeg內部實現了，需要關注的僅剩同步問題。

用戶接口

數據結構

基本概念

編解碼器、數據幀、媒體流和容器是數字媒體處理系統的四個基本概念。

首先需要統一術語：

· 容器／文件（Container/File）：即特定格式的多媒體文件。

· 媒體流（Stream）：指時間軸上的一段連續數據，如一段聲音數據，一段視頻數據或一段字幕數據，可以是壓縮的，也可以是非壓縮的，壓縮的數據需要關聯特定的編解碼器。

· 數據幀／數據包（Frame/Packet）：通常，一個媒體流由大量的數據幀組成，對於壓縮數據，幀對應着編解碼器的最小處理單元。通常，分屬於不同媒體流的數據幀交錯複用於容器之中，參見交錯。

· 編解碼器：編解碼器以幀爲單位實現壓縮數據和原始數據之間的相互轉換。

在FFMPEG中，使用AVFormatContext、AVStream、AVCodecContext、AVCodec及AVPacket等結構來抽象這些基本要素，它們的關係如下圖所示：

AVCodecContext

這是一個描述編解碼器上下文的數據結構，包含了衆多編解碼器需要的參數信息，如下列出了部分比較重要的域：

typedef structAVCodecContext {

......

uint8_t *extradata;

intextradata_size;

AVRational time_base;

intwidth, height;

......

intsample_rate; ///< samples per second

intchannels; ///< number of audiochannels

enumSampleFormat sample_fmt; ///< sampleformat

intframe_size;

intframe_number; ///< audio or videoframe number

......

charcodec_name[32];

enumAVMediaType codec_type;

enumCodecID codec_id;

unsignedint codec_tag;

......

inthas_b_frames;

intblock_align;

......

intbits_per_coded_sample;

......

} AVCodecContext;

如果是單純使用libavcodec，這部分信息需要調用者進行初始化；如果是使用整個FFMPEG庫，這部分信息在調用avformat_open_input和avformat_find_stream_info的過程中根據文件的頭信息及媒體流內的頭部信息完成初始化。其中幾個主要域的釋義如下：

1. extradata/extradata_size：這個buffer中存放了解碼器可能會用到的額外信息，在av_read_frame中填充。一般來說，首先，某種具體格式的demuxer在讀取格式頭信息的時候會填充extradata，其次，如果demuxer沒有做這個事情，比如可能在頭部壓根兒就沒有相關的編解碼信息，則相應的parser會繼續從已經解複用出來的媒體流中繼續尋找。在沒有找到任何額外信息的情況下，這個buffer指針爲空。

2. time_base：

3. width/height：視頻的寬和高。

4. sample_rate/channels：音頻的採樣率和信道數目。

5. sample_fmt：音頻的原始採樣格式。

6. codec_name/codec_type/codec_id/codec_tag：編解碼器的信息。

AVStream

該結構體描述一個媒體流，定義如下：

typedef structAVStream {

intindex;

intid;

AVCodecContext *codec;

AVRational r_frame_rate;

......

AVRational time_base;

......

int64_t start_time;

int64_t duration;

#if LIBAVFORMAT_VERSION_INT <(53<<16)

charlanguage[4];

#endif

enumAVStreamParseType need_parsing;

structAVCodecParserContext *parser;

......

AVIndexEntry *index_entries;

intnb_index_entries;

unsignedint index_entries_allocated_size;

int64_t nb_frames; ///< number of frames in this stream if known or 0

......

AVRational avg_frame_rate;

......

} AVStream;

主要域的釋義如下，其中大部分域的值可以由avformat_open_input根據文件頭的信息確定，缺少的信息需要通過調用avformat_find_stream_info讀幀及軟解碼進一步獲取：

1. index/id：index對應流的索引，這個數字是自動生成的，根據index可以從AVFormatContext::streams表中索引到該流；而id則是流的標識，依賴於具體的容器格式。比如對於MPEG TS格式，id就是pid。

2. time_base：流的時間基準，是一個有理數（AVRational），該流中媒體數據的pts和dts都將以這個時間基準爲粒度。通常，使用av_rescale/av_rescale_q可以實現不同時間基準的轉換。

3. start_time：流的起始時間，以流的時間基準爲單位，通常是該流中第一個幀的pts。

4. duration：流的總時間，以流的時間基準爲單位。

5. need_parsing：對該流parsing過程的控制域。

6. nb_frames：流內的幀數目。

7. r_frame_rate/framerate/avg_frame_rate：幀率相關。

8. codec：指向該流對應的AVCodecContext結構，調用avformat_open_input時生成。

9. parser：指向該流對應的AVCodecParserContext結構，調用avformat_find_stream_info時生成。

AVFormatContext

這個結構體描述了一個媒體文件或媒體流的構成和基本信息，定義如下：

typedef structAVFormatContext {

constAVClass *av_class;

structAVInputFormat *iformat;

structAVOutputFormat *oformat;

void*priv_data;

ByteIOContext *pb;

unsignedint nb_streams;

AVStream *streams[MAX_STREAMS];

charfilename[1024];

int64_ttimestamp;

#if LIBAVFORMAT_VERSION_INT <(53<<16)

chartitle[512];

charauthor[512];

charcopyright[512];

charcomment[512];

charalbum[512];

intyear;

inttrack;

chargenre[32];

#endif

intctx_flags;

structAVPacketList *packet_buffer;

int64_t start_time;

int64_t duration;

int64_t file_size;

intbit_rate;

AVStream *cur_st;

#if LIBAVFORMAT_VERSION_INT <(53<<16)

constuint8_t *cur_ptr_deprecated;

intcur_len_deprecated;

AVPacket cur_pkt_deprecated;

#endif

int64_t data_offset;

intindex_built;

intmux_rate;

unsignedint packet_size;

intpreload;

intmax_delay;

#define AVFMT_NOOUTPUTLOOP -1

#define AVFMT_INFINITEOUTPUTLOOP 0

intloop_output;

intflags;

#define AVFMT_FLAG_GENPTS 0x0001 ///< Generate missing pts evenif it requires parsing future frames.

#define AVFMT_FLAG_IGNIDX 0x0002 ///< Ignore index.

#define AVFMT_FLAG_NONBLOCK 0x0004 ///< Do not block when readingpackets from input.

#define AVFMT_FLAG_IGNDTS 0x0008 ///< Ignore DTS on frames thatcontain both DTS & PTS

#define AVFMT_FLAG_NOFILLIN 0x0010 ///< Do not infer any valuesfrom other values, just return what is stored in the container

#define AVFMT_FLAG_NOPARSE 0x0020 ///< Do not use AVParsers, youalso must set AVFMT_FLAG_NOFILLIN as the fillin code works on frames and noparsing -> no frames. Also seeking to frames can not work if parsing to findframe boundaries has been disabled

#define AVFMT_FLAG_RTP_HINT 0x0040 ///< Add RTP hinting to theoutput file

intloop_input;

unsignedint probesize;

intmax_analyze_duration;

constuint8_t *key;

intkeylen;

unsignedint nb_programs;

AVProgram **programs;

enumCodecID video_codec_id;

enumCodecID audio_codec_id;

enumCodecID subtitle_codec_id;

unsignedint max_index_size;

unsignedint max_picture_buffer;

unsignedint nb_chapters;

AVChapter **chapters;

intdebug;

#define FF_FDEBUG_TS 0x0001

structAVPacketList *raw_packet_buffer;

structAVPacketList *raw_packet_buffer_end;

structAVPacketList *packet_buffer_end;

AVMetadata *metadata;

#define RAW_PACKET_BUFFER_SIZE 2500000

intraw_packet_buffer_remaining_size;

int64_t start_time_realtime;

} AVFormatContext;

這是FFMpeg中最爲基本的一個結構，是其他所有結構的根，是一個多媒體文件或流的根本抽象。其中:

· nb_streams和streams所表示的AVStream結構指針數組包含了所有內嵌媒體流的描述；

· iformat和oformat指向對應的demuxer和muxer指針；

· pb則指向一個控制底層數據讀寫的ByteIOContext結構。

· start_time和duration是從streams數組的各個AVStream中推斷出的多媒體文件的起始時間和長度，以微秒爲單位。

通常，這個結構由avformat_open_input在內部創建並以缺省值初始化部分成員。但是，如果調用者希望自己創建該結構，則需要顯式爲該結構的一些成員置缺省值——如果沒有缺省值的話，會導致之後的動作產生異常。以下成員需要被關注：

· probesize

· mux_rate

· packet_size

· flags

· max_analyze_duration

· key

· max_index_size

· max_picture_buffer

· max_delay

AVPacket

AVPacket定義在avcodec.h中，如下：

typedef structAVPacket {

int64_t pts;

int64_t dts;

uint8_t *data;

int size;

int stream_index;

int flags;

int duration;

void (*destruct)(struct AVPacket *);

void *priv;

int64_t pos; ///< byte position instream, -1 if unknown

int64_t convergence_duration;

} AVPacket;

FFMPEG使用AVPacket來暫存解複用之後、解碼之前的媒體數據（一個音/視頻幀、一個字幕包等）及附加信息（解碼時間戳、顯示時間戳、時長等）。其中：

· dts表示解碼時間戳，pts表示顯示時間戳，它們的單位是所屬媒體流的時間基準。

· stream_index給出所屬媒體流的索引；

· data爲數據緩衝區指針，size爲長度；

· duration爲數據的時長，也是以所屬媒體流的時間基準爲單位；

· pos表示該數據在媒體流中的字節偏移量；

· destruct爲用於釋放數據緩衝區的函數指針；

· flags爲標誌域，其中，最低位置1表示該數據是一個關鍵幀。

AVPacket結構本身只是個容器，它使用data成員引用實際的數據緩衝區。這個緩衝區通常是由av_new_packet創建的，但也可能由FFMPEG的API創建（如av_read_frame）。當某個AVPacket結構的數據緩衝區不再被使用時，就需要通過調用av_free_packet釋放。av_free_packet調用的是結構體本身的destruct函數，它的值有兩種情況：1)av_destruct_packet_nofree或0；2)av_destruct_packet，其中，情況1)僅僅是將data和size的值清0而已，情況2)纔會真正地釋放緩衝區。

FFMPEG內部使用AVPacket結構建立緩衝區裝載數據，同時提供destruct函數，如果FFMPEG打算自己維護緩衝區，則將destruct設爲av_destruct_packet_nofree，用戶調用av_free_packet清理緩衝區時並不能夠將其釋放；如果FFMPEG打算將該緩衝區徹底交給調用者，則將destruct設爲av_destruct_packet，表示它能夠被釋放。安全起見，如果用戶希望自由地使用一個FFMPEG內部創建的AVPacket結構，最好調用av_dup_packet進行緩衝區的克隆，將其轉化爲緩衝區能夠被釋放的AVPacket，以免對緩衝區的不當佔用造成異常錯誤。av_dup_packet會爲destruct指針爲av_destruct_packet_nofree的AVPacket新建一個緩衝區，然後將原緩衝區的數據拷貝至新緩衝區，置data的值爲新緩衝區的地址，同時設destruct指針爲av_destruct_packet。

時間信息

時間信息用於實現多媒體同步。

同步的目的在於展示多媒體信息時，能夠保持媒體對象之間固有的時間關係。同步有兩類，一類是流內同步，其主要任務是保證單個媒體流內的時間關係，以滿足感知要求，如按照規定的幀率播放一段視頻；另一類是流間同步，主要任務是保證不同媒體流之間的時間關係，如音頻和視頻之間的關係（lipsync）。

對於固定速率的媒體，如固定幀率的視頻或固定比特率的音頻，可以將時間信息（幀率或比特率）置於文件首部（header），如AVI的hdrl List、MP4的moov box，還有一種相對複雜的方案是將時間信息嵌入媒體流的內部，如MPEG TS和Real video，這種方案可以處理變速率的媒體，亦可有效避免同步過程中的時間漂移。

FFMPEG會爲每一個數據包打上時間標籤，以更有效地支持上層應用的同步機制。時間標籤有兩種，一種是DTS，稱爲解碼時間標籤，另一種是PTS，稱爲顯示時間標籤。對於聲音來說，這兩個時間標籤是相同的，但對於某些視頻編碼格式，由於採用了雙向預測技術，會造成DTS和PTS的不一致。

無雙向預測幀的情況：

圖像類型: I P P P P P P... I P P

DTS: 0 1 2 3 4 5 6... 100 101 102

PTS: 0 1 2 3 4 5 6... 100 101 102

有雙向預測幀的情況：

圖像類型: I P B B P B B... I P B

DTS: 0 1 2 3 4 5 6... 100 101 102

PTS: 0 3 1 2 6 4 5... 100 104 102

對於存在雙向預測幀的情況，通常要求解碼器對圖像重排序，以保證輸出的圖像順序爲顯示順序：

解碼器輸入：I P B B P B B

(DTS) 0 1 2 3 4 5 6

(PTS) 0 3 1 2 6 4 5

解碼器輸出：X I B B P B B P

(PTS) X 0 1 2 3 4 5 6

時間信息的獲取：

通過調用avformat_find_stream_info，多媒體應用可以從AVFormatContext對象中拿到媒體文件的時間信息：主要是總時間長度和開始時間，此外還有與時間信息相關的比特率和文件大小。其中時間信息的單位是AV_TIME_BASE：微秒。

typedef structAVFormatContext {

......

int64_t start_time;

int64_t duration;

int64_tfile_size;

intbit_rate;

......

} AVFormatContext;

以上4個成員變量都是隻讀的，基於FFMpeg的中間件需要將其封裝到某個接口中，如：

LONG GetDuration(IntfX*);

LONG GetStartTime(IntfX*);

LONG GetFileSize(IntfX*);

LONG GetBitRate(IntfX*);

APIs

avformat_open_input

int avformat_open_input(AVFormatContext**ic_ptr, const char *filename, AVInputFormat *fmt, AVDictionary **options);

avformat_open_input完成兩個任務：

1. 打開一個文件或URL，基於字節流的底層輸入模塊得到初始化。

2. 解析多媒體文件或多媒體流的頭信息，創建AVFormatContext結構並填充其中的關鍵字段，依次爲各個原始流建立AVStream結構。

一個多媒體文件或多媒體流與其包含的原始流的關係如下：

多媒體文件/多媒體流 (movie.mkv)

原始流 1 (h.264 video)

原始流 2 (aac audio for Chinese)

原始流 3 (aac audio for english)

原始流 4 (Chinese Subtitle)

原始流 5 (English Subtitle)

...

關於輸入參數：

· ic_ptr，這是一個指向指針的指針，用於返回avformat_open_input內部構造的一個AVFormatContext結構體。

· filename，指定文件名。

· fmt，用於顯式指定輸入文件的格式，如果設爲空則自動判斷其輸入格式。

· options

這個函數通過解析多媒體文件或流的頭信息及其他輔助數據，能夠獲取足夠多的關於文件、流和編解碼器的信息，但由於任何一種多媒體格式提供的信息都是有限的，而且不同的多媒體內容製作軟件對頭信息的設置不盡相同，此外這些軟件在產生多媒體內容時難免會引入一些錯誤，因此這個函數並不保證能夠獲取所有需要的信息，在這種情況下，則需要考慮另一個函數：avformat_find_stream_info。

avformat_find_stream_info

int avformat_find_stream_info(AVFormatContext *ic, AVDictionary **options);

這個函數主要用於獲取必要的編解碼器參數，設置到ic→streams[i]→codec中。

首先必須得到各媒體流對應編解碼器的類型和id，這是兩個定義在avutil.h和avcodec.h中的枚舉：

/* Media type of a stream's codec; only AVMEDIA_TYPE_UNKNOWN has an explicit value (-1). */
enum AVMediaType {

AVMEDIA_TYPE_UNKNOWN = -1,

AVMEDIA_TYPE_VIDEO,

AVMEDIA_TYPE_AUDIO,

AVMEDIA_TYPE_DATA,

AVMEDIA_TYPE_SUBTITLE,

AVMEDIA_TYPE_ATTACHMENT,

AVMEDIA_TYPE_NB /* NOTE(review): presumably a count sentinel rather than a real media type — confirm */

};

/* Codec identifiers (excerpt: the "..." line marks the entries the article leaves out). */
enum CodecID {

CODEC_ID_NONE,

CODEC_ID_MPEG1VIDEO,

CODEC_ID_MPEG2VIDEO, ///< preferred ID for MPEG-1/2 video decoding

CODEC_ID_MPEG2VIDEO_XVMC,

CODEC_ID_H261,

CODEC_ID_H263,

...

};

通常，如果某種媒體格式具備完備而正確的頭信息，調用avformat_open_input即可以得到這兩個參數，但若是因某種原因avformat_open_input無法獲取它們，這一任務將由avformat_find_stream_info完成。

其次還要獲取各媒體流對應編解碼器的時間基準。

此外，對於音頻編解碼器，還需要得到：

1. 採樣率，

2. 聲道數，

3. 位寬，

4. 幀長度（對於某些編解碼器是必要的），

對於視頻編解碼器，則是：

1. 圖像大小，

2. 色彩空間及格式，

av_read_frame

int av_read_frame(AVFormatContext *s, AVPacket *pkt);

這個函數用於從多媒體文件或多媒體流中讀取媒體數據，獲取的數據由AVPacket結構pkt來存放。對於音頻數據，如果是固定比特率，則pkt中裝載着一個或多個音頻幀；如果是可變比特率，則pkt中裝載有一個音頻幀。對於視頻數據，pkt中裝載有一個視頻幀。需要注意的是：再次調用本函數之前，必須使用av_free_packet釋放pkt所佔用的資源。

通過pkt→stream_index可以查到獲取的媒體數據的類型，從而將數據送交相應的解碼器進行後續處理。

av_seek_frame

int av_seek_frame(AVFormatContext *s, int stream_index, int64_t timestamp, int flags);

這個函數通過改變媒體文件的讀寫指針來實現對媒體文件的隨機訪問，支持以下三種方式：

· 基於時間的隨機訪問：具體而言就是將媒體文件讀寫指針定位到某個給定的時間點上，則之後調用av_read_frame時能夠讀到時間標籤等於給定時間點的媒體數據，通常用於實現媒體播放器的快進、快退等功能。

· 基於文件偏移的隨機訪問：相當於普通文件的seek函數，timestamp也成爲文件的偏移量。

· 基於幀號的隨機訪問：timestamp爲要訪問的媒體數據的幀號。

關於參數：

· s：是個AVFormatContext指針，就是avformat_open_input返回的那個結構。

· stream_index：指定媒體流，如果是基於時間的隨機訪問，則第三個參數timestamp將以此媒體流的時間基準爲單位；如果設爲負數，則相當於不指定具體的媒體流，FFMPEG會按照特定的算法尋找缺省的媒體流，此時，timestamp的單位爲AV_TIME_BASE（微秒）。

· timestamp：時間標籤，單位取決於其他參數。

· flags：定位方式，AVSEEK_FLAG_BYTE表示基於字節偏移，AVSEEK_FLAG_FRAME表示基於幀號，其它表示基於時間。

av_close_input_file

void av_close_input_file(AVFormatContext *s);

關閉一個媒體文件：釋放資源，關閉物理IO。

avcodec_find_decoder

AVCodec *avcodec_find_decoder(enum CodecID id);

AVCodec *avcodec_find_decoder_by_name(const char *name);

根據給定的codecid或解碼器名稱從系統中搜尋並返回一個AVCodec結構的指針。

avcodec_open

int avcodec_open(AVCodecContext *avctx, AVCodec *codec);

此函數根據輸入的AVCodec指針具體化AVCodecContext結構。在調用該函數之前，需要首先調用avcodec_alloc_context分配一個AVCodecContext結構，或調用avformat_open_input獲取媒體文件中對應媒體流的AVCodecContext結構；此外還需要通過avcodec_find_decoder獲取AVCodec結構。

這一函數還將初始化對應的解碼器。

avcodec_decode_video2

int avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture, int *got_picture_ptr, AVPacket *avpkt);

解碼一個視頻幀。got_picture_ptr指示是否有解碼數據輸出。

輸入數據在AVPacket結構中，輸出數據在AVFrame結構中。AVFrame是定義在avcodec.h中的一個數據結構：

typedef structAVFrame {

FF_COMMON_FRAME

} AVFrame;

FF_COMMON_FRAME定義了諸多數據域，大部分由FFMpeg內部使用，對於用戶來說，比較重要的主要包括：

#defineFF_COMMON_FRAME \

......

uint8_t *data[4];\

intlinesize[4];\

intkey_frame;\

intpict_type;\

int64_t pts;\

intreference;\

......

FFMpeg內部以planar的方式存儲原始圖像數據，即將圖像像素分爲多個平面（R/G/B或Y/U/V），data數組內的指針分別指向四個像素平面的起始位置，linesize數組則存放各個平面緩衝區的行寬：

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

+++data[0]->#################################++++++++++++

++++++++++++###########picture data##########++++++++++++

++++++++++++#################################++++++++++++

........................

++++++++++++#################################++++++++++++

|<--------------------linesize[0]---------------------->|

此外，key_frame標識該圖像是否是關鍵幀；pict_type表示該圖像的編碼類型：I(1)/P(2)/B(3)……；pts是以time_base爲單位的時間標籤，對於部分解碼器如H.261、H.263和MPEG4，可以從頭信息中獲取；reference表示該圖像是否被用作參考。

avcodec_decode_audio4

int avcodec_decode_audio4(AVCodecContext *avctx, AVFrame *frame, int *got_frame_ptr, AVPacket *avpkt);

解碼一個音頻幀。輸入數據在AVPacket結構中，輸出數據在frame中，got_frame_ptr表示是否有數據輸出。

avcodec_close

int avcodec_close(AVCodecContext *avctx);

關閉解碼器，釋放avcodec_open中分配的資源。

FFMPEG 深入淺出

深入淺出FFMPEG

數字媒體處理的基本流程

認識FFMPEG

一個簡單的測試程序

用戶接口

數據結構

基本概念

AVCodecContext

AVStream

AVFormatContext

AVPacket

時間信息

APIs

avformat_open_input

avformat_find_stream_info

av_read_frame

av_seek_frame

av_close_input_file

avcodec_find_decoder

avcodec_open

avcodec_decode_video2

avcodec_decode_audio4

avcodec_close

python gdal 安裝使用（Windows， python 3.6.8）

FFMPEG 深入淺出

最簡單的基於FFmpeg的AVfilter例子（水印疊加）

av_read_frame的過程

最簡單的基於FFmpeg的封裝格式處理：視音頻複用器（muxer）

ffmpeg函數介紹

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結