http://blog.csdn.net/linzhiji/article/details/5840031

注释:

      1。3gp和MP4中的AAC的私有数据保存在esds的0x05标签的数据,

       结构为 05 + 长度 + 内容。

      将长度赋值给 extradatasize

      将内容赋值给 extradata

      长度的计算函数在ffmpeg中的static int mp4_read_descr_len(ByteIOContext *pb)

      2。avc/h264的extradata和extradatasize信息在avcc atom中,将avcc atom去掉type和长度(8个字节)后的长度赋予extradatasize,内容赋值给extradata.

MP4文件格式分为头部和数据两部分,头部是由许多被称作Atom的结构单元嵌套或排列而成,数据部分则完全为实际数据不包含元信息,因此具体解码时音视频帧的位置和大小都要在头部获取。详细内容见以下链接:
http://wqyuwss.52rd.net 
这里总结下音频解码信息获取的一些经验,当然详细内容需要查看quick time file format的文档。
MP4的音频解码信息保存在如下嵌套的Atom中,{moov{trak{mdia{minf{smhd, stbl{stsd}}}}}}(smhd与stbl同为minf的子Atom,含有smhd的trak即为音频trak)
stsd可能包括多个音频信息的描述,结构如下:

/*
 * Layout of one audio description entry inside the 'stsd' atom.
 * The four trailing int fields are only present when version == 1.
 */
typedef struct stsdtable
{
    unsigned int size;// size of this entry (atom size)
    char format[4];// audio coding format, see the format list below
    int res1;
    int ref;
    short version;// version
    short pad1;
    int pad2;
    short channels;// number of channels
    short bitspersample;
    short compress_id;
    short res2;
    short samplerate1;// sample rate
    short samplerate2;
    // the following fields exist only if (version == 1) {

        int sampleperpacket;
        int bytesperpacket;
        int bytesperframe;
        int bytespersample;
    // }

} stsdtable;

其中format对应音频编码格式:
PCM_S32BE,  in32
PCM_S32LE,  in32
PCM_S24BE,  in24
PCM_S24LE,  in24
PCM_S16BE,  twos // 16 bits //
PCM_S16LE,  sowt //  
PCM_S16LE,  lpcm
PCM_F32BE,  fl32
PCM_F64BE,  fl64
PCM_S8,     sowt
PCM_U8,     raw  // 8 bits unsigned 
PCM_U8,     NONE // uncompressed 
PCM_MULAW,  ulaw //
PCM_ALAW,   alaw //
ADPCM_IMA_QT, ima4 // IMA-4 ADPCM //
MACE3,      MAC3 // Macintosh Audio Compression and Expansion 3:1 ///
MACE6,      MAC6 // Macintosh Audio Compression and Expansion 6:1 //
MP3,        .mp3 // MPEG layer 3; sample files at http://www.3ivx.com/showcase.html use this tag //
MP3,        0x6D730055  // MPEG layer 3 //
OGG_VORBIS, OggS //// sample files at http://heroinewarrior.com/xmovie.php3 use this tag //
AAC,        mp4a // MPEG-4 AAC //
AC3,        ac-3 // ETSI TS 102 366 Annex F //
AMR_NB,     samr // AMR-NB 3gp //
AMR_WB,     sawb // AMR-WB 3gp//
GSM,        agsm
ALAC,       alac // Apple Lossless //
QCELP,      Qclp
QCELP,      sqcp // ISO Media fourcc //
QDM2,       QDM2 // QDM2 //
DVAUDIO,    vdva
DVAUDIO,    dvca
WMAV2,      WMA2
这个获取比较简单,下面是解码私有数据的获取:
这些解码私有数据也保存在Atom中,通常在上面结构体的后面,有esds、frma、mp4a、wave。AAC的私有数据保存在esds的0x05标签的数据,QDM2的则是”wave”Atom的数据部分(以下按顺序分析):
   4字节 长度
   4字节 “esds” or “m4ds” 标志
   4字节 版本标识

   1字节 ES描述类型标签 0x03
   –3字节 扩展描述类型标签 可能没有
   1字节 描述类型长度 
   2字节 ES ID
   1字节 流优先级

   1字节 解码配置描述类型标签 0x04
   –3字节 扩展描述类型标签 可能没有
   1字节 描述类型长度 
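   1字节 描述对象ID(objectTypeIndication,例如0x40表示MPEG-4音频)
   1字节 流类型(高6位)+ upStream标志(1位)+ 保留位(1位)
   3字节 解码缓冲区大小(bufferSizeDB)
   4字节 最大码率(maxBitrate)
   4字节 平均码率(avgBitrate)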

   1字节 解码特定信息(DecoderSpecificInfo)描述类型标签 0x05
   –3字节 扩展描述类型标签 可能没有
   1字节 长度(其后紧跟该长度的私有数据,即extradata)

   1字节 0x06
   0x06不再分析
下面是一个例子:
长度 标签 
00015218h: 00 00 00 10 73 6D 68 64 00 00 00 00 00 00 00 00 ; ....smhd........
00015228h: 00 00 00 24 64 69 6E 66 00 00 00 1C 64 72 65 66 ; ...$dinf....dref
00015238h: 00 00 00 00 00 00 00 01 00 00 00 0C 75 72 6C 20 ; ............url 
00015248h: 00 00 00 01 00 02 C0 97 73 74 62 6C 00 00 00 5B ; ........stbl...[
00015258h: 73 74 73 64 00 00 00 00 00 00 00 01 00 00 00 4B ; stsd...........K
00015268h: 6D 70 34 61 00 00 00 00 00 00 00 01 00 00 00 00 ; mp4a............
00015278h: 00 00 00 00 00 01 00 10 00 00 00 00 7D 00 00 00 ; ............}...
00015288h: 00 00 00 27 65 73 64 73 00 00 00 00 03 19 00 00 ; ...'esds........
00015298h: 00 04 11 40 15 00 00 D2 00 00 BB 88 00 00 7D 00 ; ...@..........}.
000152a8h: 05 02 12 88 06 01 02                            ; .......

0x12 0x88即私有数据(对应ffmpeg中AVCodecContext.extradata)
下面是mp4音频部分分析的代码:

//MP4Analyze.h

/*
 * Byte type used throughout the parser.
 * Taken from the standard header rather than from a `#define`: a macro named
 * uint8_t rewrites the typedef inside <stdint.h> whenever that header is
 * included afterwards (for example indirectly through a system header),
 * which breaks the build.
 */
#include <stdint.h>

/****** atom tags ******/
/*
 * Four-character atom type codes; each array also holds the trailing NUL of
 * its string literal, so comparisons must be limited to 4 bytes.
 *
 * `static` gives every translation unit that includes this header its own
 * copy, so including the header from more than one source file no longer
 * produces duplicate-symbol link errors.  The arrays are intentionally not
 * `const` because seek_tag() takes its tag through a non-const parameter.
 */
static uint8_t moov[] = "moov";
static uint8_t trak[] = "trak";
static uint8_t mdia[] = "mdia";
static uint8_t minf[] = "minf";
static uint8_t stbl[] = "stbl";
static uint8_t stsd[] = "stsd";
static uint8_t stsc[] = "stsc";
static uint8_t stsz[] = "stsz";
static uint8_t stco[] = "stco";
static uint8_t ftyp[] = "ftyp";
static uint8_t mdat[] = "mdat";

/*
 * Leading fields of an atom that carries an entry table.
 * get_audio_info() overlays this struct on the raw bytes of an 'stsd' atom
 * and reads `size` and `num_of_entries` through ntohl(), so when used that
 * way the fields hold big-endian (network order) values.
 */
typedef struct Atom
{
    unsigned int size;            /* atom size field */
    uint8_t tag[4];               /* four-character atom type */
    int ver_flag;                 /* presumably version + flags -- TODO confirm */
    unsigned int num_of_entries;  /* number of table entries that follow */
    unsigned int pos;             /* not referenced by the code in this file */
    uint8_t *data;                /* not referenced by the code in this file */
} Atom;

/**** audio format tags ****/
/*
 * Four-character codes of audio sample formats and of the atoms that carry
 * decoder private data.  As with the atom tags, string-initialised arrays
 * include a trailing NUL, so compare only the first 4 bytes.
 *
 * `static` keeps one private copy per translation unit so that including
 * this header from several source files does not cause duplicate-symbol
 * link errors.  The arrays stay non-const for use with seek_tag().
 */
static uint8_t kmp3[] = {0x6D,0x73,0x00,0x55}; /* MP3 stored under the tag 0x6D730055 */
static uint8_t fmp3[] = ".mp3";
static uint8_t raw[] = "raw ";

static uint8_t wave[] = "wave";
static uint8_t mp4a[] = "mp4a";
static uint8_t enca[] = "enca"; /* encrypted to ISO/IEC 14496-12 or 3GPP standards */

/*
 * AMR narrowband.  The four-character code is "samr"; the previous contents
 * "smar" had two letters transposed and could never match a real file.  The
 * variable keeps its historical name so existing references still compile.
 */
static uint8_t smar[] = "samr";

static uint8_t sawb[] = "sawb"; /* AMR wideband */

static uint8_t m4ds[] = "m4ds"; /* alternative tag of the 'esds' descriptor atom */

static uint8_t esds[] = "esds";

/*
 * Original-format atom.  The four-character code is "frma"; the previous
 * contents "fram" were a transposition.  The variable name is unchanged.
 */
static uint8_t fram[] = "frma";

/*** We may not need these ***/
/*
 * Pack four characters into one 32-bit tag value: `a` lands in the least
 * significant byte and `d` in the most significant one.
 * Every argument is parenthesised, and `d` is converted to unsigned before
 * the shift so that moving it into bits 24..31 cannot overflow a signed int.
 */
#define MKTAG(a,b,c,d) ((a) | ((b) << 8) | ((c) << 16) | ((unsigned)(d) << 24))
/*
 * Pairs a codec identifier with a tag value.  The commented-out
 * codec_get_tag() helper in MP4Analyze.cpp walks an array of these to find
 * the tag that belongs to a given id.
 */
typedef struct AVCodecTag {
    int id;             /* codec identifier */
    unsigned int tag;   /* tag value associated with `id` */
} AVCodecTag;

/*
 * Layout of one audio description entry inside the 'stsd' atom.
 * get_audio_info() copies entries into this struct straight from the file
 * bytes, so the multi-byte fields hold network-order values and must be
 * converted with ntohl()/ntohs() before use.
 */
typedef struct stsdtable
{
    unsigned int size;      /* size of this entry in bytes */
    char format[4];         /* audio coding format; 4 characters, not NUL-terminated */
    int res1;
    int ref;
    short version;          /* entry version; 0 means the trailing four ints are absent */
    short pad1;
    int pad2;
    short channels;         /* number of channels */
    short bitspersample;
    short compress_id;
    short res2;
    short samplerate1;      /* sample rate; printed as the part before the '.' */
    short samplerate2;      /* printed as the part after the '.' */
    // the following fields are skipped by get_audio_info() when version == 0 {

        int sampleperpacket;
        int bytesperpacket;
        int bytesperframe;
        int bytespersample;
    // }

} stsdtable;

/***** result is stored here ******/
/*
 * One sample as collected by get_packet_offset(), which inserts these into
 * a map keyed by the sample's offset.
 */
typedef struct sampletable
{
    unsigned int size;      /* sample size, read from the 'stsz' table */
    unsigned int id_of_sd;  /* third 4-byte field of the owning 'stsc' entry; presumably the sample description id */
} sampletable;
//MP4Analyze.cpp

#include "MP4Analyze.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iostream>
#include <map>
#include <string>
#include <vector>
#ifdef WIN32
#include <winsock2.h> 
#pragma comment(lib, "Ws2_32.lib")
#pragma warning (disable:4786)
#endif

#ifdef __GNUG__
#include <netinet/in.h>
#endif
using namespace std;

/**
*** mp4存在宽度为8字节的wide atom tag,需要注意,这里暂未考虑
**/

/*
* check if a mov/mp4/3gp type
*/

int check_format(uint8_t *data, int size)
{
    if(strncmp((char*)moov,(char*)(data+4),4)==||
        strncmp((char*)ftyp,(char*)(data+4),4)==||strncmp((char*)mdat,(char*)(data+4),4)==)
        return 0;
    return -1;
}

/*
 * Decode a big-endian unsigned integer.
 *
 * data : first (most significant) byte of the value
 * size : number of bytes to read; nothing is read when size <= 0
 *
 * Returns the decoded value.  Bytes beyond the width of unsigned int push
 * the earlier ones out of the top of the result.
 */
unsigned int get_size(const uint8_t *data,int size)
{
    unsigned int value = 0;
    const uint8_t *cursor = data;
    int remaining = size;

    while(remaining > 0)
    {
        /* the low byte is zero after the shift, so OR-ing equals adding */
        value = (value << 8) | *cursor;
        ++cursor;
        --remaining;
    }
    return value;
}
/* if found,return the offset from the data[0]*/
int seek_tag(uint8_t tag[],uint8_t *data, unsigned int size1,uint8_t **pos,unsigned int *size2)
{
    if(data == NULL || size1 == 0)
        return -1;
    unsigned int tag_size = get_size(data,4);
    if(tag_size >size1 + 8)
        return -1;
    unsigned int tmp = 0;
    while(strncmp((char*)data+4,(char*)tag,4) != 0)
    {
    //    printf("%s/n",data+4);

        if(tag_size==0)
            return -1;
        if(tag_size < size1 + 8)
        {
            data += tag_size;
            tmp += tag_size;
        }
        else
            return -1;
        tag_size = get_size(data,4);
    }
    printf("find :%c%c%c%c/n",tag[0],tag[1],tag[2],tag[3]);
    if(tmp + tag_size > size1 )
     printf("warning: the atom may be not complete!/n");
    *pos = data+8;
    *size2 = tag_size -8;
    return tmp;
}
/*** elementary stream descriptor analyse ***/
/*
unsigned int codec_get_tag(const AVCodecTag *tags, int id)
{
    while (tags->id != CODEC_ID_NONE) {
        if (tags->id == id)
            return tags->tag;
        tags++;
    }
    return 0;
}
/* may not need analyse
int esds_analyze(uint8_t *data, unsigned int size)
{
    return 0;
}
*/

/* version == 2 ??? refer to ffmpeg source mov.c line 943
if (format == MKTAG('l','p','c','m'))
        st->codec->codec_id = mov_get_lpcm_codec_id(st->codec->bits_per_coded_sample, flags);
*/

vector<stsdtable>& get_audio_info(uint8_t *data, unsigned int size, vector<stsdtable>& stable)//stsd

{
    uint8_t * datapos = data;
    Atom *stsd_audio =(Atom *)data;
    int tmp_size = 16;

    printf("size : %u/n",ntohl(stsd_audio->size));
    printf("num_entr: %u/n",ntohl(stsd_audio->num_of_entries));

    for(int i=0; i < ntohl(stsd_audio->num_of_entries); ++i)
    {
        if(tmp_size > size)//注意

            return stable;
        datapos += tmp_size;
        stsdtable * audio_entry = (stsdtable *)(datapos);
        stable.push_back(*audio_entry);//这里存入的是网络序的数据,使用时需进行转换

        tmp_size += ntohl(audio_entry->size);

     /***************/
        printf("--tablesize: %d/n",ntohl(audio_entry->size));
        printf("--format : %s/n",audio_entry->format);
        printf("--version : %d/n",ntohs(audio_entry->version));
        printf("--channels: %d/n",ntohs(audio_entry->channels));
        printf("--bitpersam: %d/n",ntohs(audio_entry->bitspersample));
        printf("--IDcompress: %d/n",ntohs(audio_entry->compress_id));    
        printf("--samplerate: %d.%d/n",ntohs(audio_entry->samplerate1),ntohs(audio_entry->samplerate2));
        /**************/

     tmp_size = sizeof(stsdtable);
        if(ntohs(audio_entry->version)==0)
        {
            tmp_size -= 16;
        }
        datapos += tmp_size;
        //if(ntohs(audio_entry->compress_id)==-2)//此处尚需考证

        if(ntohl(audio_entry->size) > sizeof(stsdtable))
        {
            printf("----atom size:%d/n",get_size(datapos,4));
            printf("----atom name:%c%c%c%c/n",datapos[4],datapos[5],datapos[6],datapos[7]);
            if(strncmp((char*)datapos,(char*)esds,4)==0)
            {
                //handle esds

            }
        }
    }
    return stable;
}
map<unsigned int,sampletable> & get_packet_offset(uint8_t *STBL[], map<unsigned int,sampletable>& table)
{
    //table.insert(pair<long,sampletable>(1,sample));

    unsigned int num_sam_to_chunk = get_size(STBL[0]-4,4);//stsc

    unsigned int num_sample = get_size(STBL[1]-4,4);//stsz

    unsigned int num_chunk = get_size(STBL[2]-4,4);//stco

    unsigned int chunk_index = 0;
    unsigned int next_chunk_index = 0;
    uint8_t *cur_sam_to_chunk = STBL[0];
    uint8_t *cur_sam_size = STBL[1];
    uint8_t *cur_chunk_offset = STBL[2];
    sampletable sample;
    printf("number of stsc entries:%d /nnumber of sample size:%d /nnumber of chunk offset:%d/n",num_sam_to_chunk,num_sample,num_chunk);
    for(unsigned int i = 0; i < num_sam_to_chunk; ++i)//对所有的entries

    {
        chunk_index = get_size(cur_sam_to_chunk,4);
        next_chunk_index = get_size(cur_sam_to_chunk+12,4);
        sample.id_of_sd = get_size(cur_sam_to_chunk+8,4);
        if(== num_sam_to_chunk -1)//最后一个

        {
            next_chunk_index = num_chunk+1;
        } 
        printf("chunk_index:(%d---%d)/n",chunk_index,next_chunk_index);
        for(unsigned int k=chunk_index; k < next_chunk_index; ++k)//当前chunk序号到下一个chunk序号之间的chunk

        {//处理所有重复的chunk

            printf("chunk_index:%d sample num:%d/n",chunk_index,get_size(cur_sam_to_chunk+4,4));
            unsigned int offset = get_size(cur_chunk_offset+(chunk_index-1)*4,4);
            for(unsigned int j=0; j < get_size(cur_sam_to_chunk+4,4); ++j)//chunk内地sample数目

            {//处理该chunk中的sample

                sample.size = get_size(cur_sam_size,4);    
                printf("--sample offset:%d %x size:%d/n",offset,offset,sample.size);
                table.insert(pair<unsigned int,sampletable>(offset,sample));
                offset = offset + sample.size;
                cur_sam_size += 4;
            }
            system("pause");
            chunk_index++;
        }
        cur_sam_to_chunk += 12;
    }
    return table;
}

int seek_audio_atom( uint8_t *data1, unsigned int size1)
{
    uint8_t tag[] = "mdiaminfsmhd";
    uint8_t *datapos;
    unsigned int tag_size;
    uint8_t *data;
    unsigned int size;
    int offset_of_atom = 0;
    if((offset_of_atom = seek_tag(moov, data1, size1, &data, &size)) == -1)
        return -1;
    if(offset_of_atom + size >size1)
    { //some handles

        printf("moov atom is not complete,need more data");
    }
    data1 = data;
    size1 = size;
    uint8_t *nexttrak = data;
    unsigned int traksize = size;
    int i=0;
    while(1)
    {
        printf("-----/n");
        if(seek_tag(trak, nexttrak, traksize, &datapos, &tag_size) != -1)
        {
            nexttrak = datapos + tag_size;
            if(size1 < (nexttrak - data1))
                return -1;
            traksize = size1 - (nexttrak - data1);
            data = datapos;
            size = tag_size;
        } 
        else
        {
            return -1;
        }
        i=0;
        while(i<3)
        {
            if(seek_tag(tag+i*4, data, size, &datapos, &tag_size) != -1)
            {
                if(i==2)
                 break;
                data = datapos;
                size = tag_size;
                ++i;
            }
            else
            {
                break;
            }
        }
        if(strncmp("smhd",(char*)(datapos-4),4) == 0)
        {
            if(seek_tag(stbl, data, size, &datapos, &tag_size)!= -1)
            {
                printf("—find audio stbl—!/n");
                data = datapos;
                size = tag_size;

                if(seek_tag(stsd, data, size, &datapos, &tag_size) != -1)
                {
                    vector<stsdtable> stable; //音频信息

                    get_audio_info(datapos-8, tag_size,stable);
                }

                uint8_t *STBL[3] ={NULL,NULL,NULL};//

                uint8_t *datapos1;
                unsigned int tag_size1;//

                if(seek_tag(stsc, data, size, &datapos1, &tag_size1) != -1)
                {
                    STBL[0] = datapos1 + 8;
                }
                uint8_t *datapos2;
                unsigned int tag_size2;
                if(seek_tag(stsz, data, size, &datapos2, &tag_size2) != -1)
                {
                    STBL[1] = datapos2 + 12;
                }
                uint8_t *datapos3;
                unsigned int tag_size3;
                if(seek_tag(stco, data, size, &datapos3, &tag_size3) != -1)
                {
                    STBL[2] = datapos3 + 8;
                }
                if(STBL[0] && STBL[1] && STBL[2] )
                {
                    map<unsigned int,sampletable> postable;//音频帧信息

                    get_packet_offset(STBL,postable);
                }
            }
            return 0;
        }
    }
    return -1;
}
int main(char arg, char *argv[])
{
    FILE *mp4;
    cout<<"please input the file name :"<<endl;
    string filename;
    cin>>filename;
    mp4 = fopen(filename.c_str(),"rb");
    uint8_t buffer[300000];
    fread(buffer,1,300000,mp4);

    seek_audio_atom((uint8_t*)buffer,300000);

    fclose(mp4);
    return 0;
}