x264去方塊濾波函數解析（二）

上一篇介紹了實際進行濾波的函數，本篇主要介紹，去方塊濾波這邊的函數調用關係。先看幾個定義：

//! 兩個函數指針，第一個是bS=1~3時調用的，第二個是bS=4時調用的。
typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
typedef void (*x264_deblock_intra_t)( uint8_t *pix, int stride, int alpha, int beta );
typedef struct
{
    x264_deblock_inter_t deblock_v_luma;
    x264_deblock_inter_t deblock_h_luma;
    x264_deblock_inter_t deblock_v_chroma;
    x264_deblock_inter_t deblock_h_chroma;
    x264_deblock_intra_t deblock_v_luma_intra;
    x264_deblock_intra_t deblock_h_luma_intra;
    x264_deblock_intra_t deblock_v_chroma_intra;
    x264_deblock_intra_t deblock_h_chroma_intra;
} x264_deblock_function_t; //!< 定義存放一組去方塊濾波的函數的結構體

接下來是去方塊濾波的初始化工作：

void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
{
	//!< 爲函數指針指定適當的函數入口地址
    pf->deblock_v_luma = deblock_v_luma_c;
    pf->deblock_h_luma = deblock_h_luma_c;
    pf->deblock_v_chroma = deblock_v_chroma_c;
    pf->deblock_h_chroma = deblock_h_chroma_c;
    pf->deblock_v_luma_intra = deblock_v_luma_intra_c;
    pf->deblock_h_luma_intra = deblock_h_luma_intra_c;
    pf->deblock_v_chroma_intra = deblock_v_chroma_intra_c;
    pf->deblock_h_chroma_intra = deblock_h_chroma_intra_c;

#ifdef HAVE_MMX
    if( cpu&X264_CPU_MMXEXT )
    {
        pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext;
        pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
        pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
        pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
#ifdef ARCH_X86
        pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
        pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
        pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
        pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
#endif
        if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) )
        {
            pf->deblock_v_luma = x264_deblock_v_luma_sse2;
            pf->deblock_h_luma = x264_deblock_h_luma_sse2;
            pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
            pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
        }
    }
#endif

#ifdef ARCH_PPC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf->deblock_v_luma = x264_deblock_v_luma_altivec;
        pf->deblock_h_luma = x264_deblock_h_luma_altivec;
   }
#endif // ARCH_PPC
}

在代碼中，這個函數可以認爲是去方塊濾波的頂層函數了：

static void x264_fdec_filter_row( x264_t *h, int mb_y )
{
    /* mb_y is the mb to be encoded next, not the mb to be filtered here */
    int b_hpel = h->fdec->b_kept_as_ref;
    int b_deblock = !h->sh.i_disable_deblocking_filter_idc; //!< 標記是否進行deblock
    int b_end = mb_y == h->sps->i_mb_height; //!< 標記是否爲最後一行
    int min_y = mb_y - (1 << h->sh.b_mbaff);
    int max_y = b_end ? h->sps->i_mb_height : mb_y;
    b_deblock &= b_hpel || h->param.psz_dump_yuv; //!< psz_dump_yuv--filename for reconstructed frames
    if( mb_y & h->sh.b_mbaff )
        return;
    if( min_y < 0 )
        return;

    if( !b_end ) //!< 不是最後一行
    {
        int i, j;
        for( j=0; j<=h->sh.b_mbaff; j++ )
            for( i=0; i<3; i++ )
            {
                memcpy( h->mb.intra_border_backup[j][i],
                        h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i],
                        h->sps->i_mb_width*16 >> !!i ); //!< 拷貝上一行的像素--bottom pixels of the previous mb row
            }
    }

    if( b_deblock ) //!< 進行deblock
    {
        int y;
        for( y = min_y; y < max_y; y += (1 << h->sh.b_mbaff) ) //!< 遍歷整一行
            x264_frame_deblock_row( h, y ); //!< 該函數將被調用用於去方塊濾波！！！
    }

    if( b_hpel )
    {
        x264_frame_expand_border( h, h->fdec, min_y, b_end );
        if( h->param.analyse.i_subpel_refine )
        {
            x264_frame_filter( h, h->fdec, min_y, b_end );
            x264_frame_expand_border_filtered( h, h->fdec, min_y, b_end );
        }
    }

    if( h->param.i_threads > 1 && h->fdec->b_kept_as_ref ) //!< 多線程且該幀被用作參考幀
    {
        x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
    }

    min_y = X264_MAX( min_y*16-8, 0 );
    max_y = b_end ? h->param.i_height : mb_y*16-8;

    if( h->param.analyse.b_psnr )
    {
        int i;
        for( i=0; i<3; i++ )
            h->stat.frame.i_ssd[i] +=
                x264_pixel_ssd_wxh( &h->pixf,
                    h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
                    h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
                    h->param.i_width >> !!i, (max_y-min_y) >> !!i );
    }

    if( h->param.analyse.b_ssim )
    {
        x264_emms();
        /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
         * and overlap by 4 */
        min_y += min_y == 0 ? 2 : -6;
        h->stat.frame.f_ssim +=
            x264_pixel_ssim_wxh( &h->pixf,
                h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
                h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
                h->param.i_width-2, max_y-min_y, h->scratch_buffer );
    }
}

註釋忽略了與濾波無關的部分，該函數將調用這個函數進行濾波工作：

void x264_frame_deblock_row( x264_t *h, int mb_y )
{
    const int s8x8 = 2 * h->mb.i_mb_stride; //!< 以8x8塊爲單位的每一行的跨度
    const int s4x4 = 4 * h->mb.i_mb_stride; //!< 以4x4塊爲單位的每一行的跨度 
    const int b_interlaced = h->sh.b_mbaff;
    const int mvy_limit = 4 >> b_interlaced;
    const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
    int mb_x;
    int stridey   = h->fdec->i_stride[0];
    int stride2y  = stridey << b_interlaced;
    int strideuv  = h->fdec->i_stride[1];
    int stride2uv = strideuv << b_interlaced;

    if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
        munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, munge_cavlc_nnz_row );

    for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
    {
        const int mb_xy  = mb_y * h->mb.i_mb_stride + mb_x; //!< 宏塊序號
        const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x; //!< 8x8塊序號
        const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x; //!< 4x4塊序號
        const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
        const int i_qp = h->mb.qp[mb_xy];
        int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4; //!< 需要進行濾波的邊界數
        int no_sub8x8 = h->mb.type[mb_xy] != P_8x8 || !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
        uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x; //!< Y像素值
        uint8_t *pixu = h->fdec->plane[1] +  8*mb_y*strideuv +  8*mb_x; //!< U像素值
        uint8_t *pixv = h->fdec->plane[2] +  8*mb_y*strideuv +  8*mb_x; //!< V像素值
        if( b_interlaced && (mb_y&1) )
        {
            pixy -= 15*stridey;
            pixu -=  7*strideuv;
            pixv -=  7*strideuv;
        }

        x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );

        if( i_qp <= qp_thresh )
            i_edge_end = 1;

        #define FILTER_DIR(intra, i_dir)\
        {\
            /* Y plane */\
            i_qpn= h->mb.qp[mbn_xy];\
            if( i_dir == 0 )\
            {\
                /* vertical edge */\
                deblock_edge##intra( h, pixy + 4*i_edge, NULL,\
                              stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
                              h->loopf.deblock_h_luma##intra );\
                if( !(i_edge & 1) )\
                {\
                    /* U/V planes */\
                    int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
                    deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\
                                  stride2uv, bS, i_qpc, 1,\
                                  h->loopf.deblock_h_chroma##intra );\
                }\
            }\
            else\
            {\
                /* horizontal edge */\
                deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\
                              stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\
                              h->loopf.deblock_v_luma##intra );\
                /* U/V planes */\
                if( !(i_edge & 1) )\
                {\
                    int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\
                    deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\
                                  stride2uv, bS, i_qpc, 1,\
                                  h->loopf.deblock_v_chroma##intra );\
                }\
            }\
        }
		//! i_edge標記是否爲宏塊最左邊的邊界或者是最頂部的邊界
		//! i_dir標記邊界類型：0--垂直邊界；1--水平邊界
		//! 得到bS的值
        #define DEBLOCK_STRENGTH(i_dir)\
        {\
            /* *** Get bS for each 4px for the current edge *** */\
            if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
                *(uint32_t*)bS = 0x03030303;\
            else\
            {\
                *(uint32_t*)bS = 0x00000000;\
                for( i = 0; i < 4; i++ )\
                {\
                    int x  = i_dir == 0 ? i_edge : i;\
                    int y  = i_dir == 0 ? i      : i_edge;\
                    int xn = i_dir == 0 ? (x - 1)&0x03 : x;\
                    int yn = i_dir == 0 ? y : (y - 1)&0x03;\
                    if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
                        h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
                        bS[i] = 2;\
                    else if(!(i_edge&no_sub8x8))\
                    {\
                        if((i&no_sub8x8) && bS[i-1] != 2)\
                            bS[i] = bS[i-1];\
                        else\
                        {\
                            /* FIXME: A given frame may occupy more than one position in\
                             * the reference list. So we should compare the frame numbers,\
                             * not the indices in the ref list.\
                             * No harm yet, as we don't generate that case.*/\
                            int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
                            int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
                            int i4p= mb_4x4+x+y*s4x4;\
                            int i4q= mbn_4x4+xn+yn*s4x4;\
                            if((h->mb.ref[0][i8p] != h->mb.ref[0][i8q] ||\
                                abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
                                abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
                               (h->sh.i_type == SLICE_TYPE_B &&\
                               (h->mb.ref[1][i8p] != h->mb.ref[1][i8q] ||\
                                abs( h->mb.mv[1][i4p][0] - h->mb.mv[1][i4q][0] ) >= 4 ||\
                                abs( h->mb.mv[1][i4p][1] - h->mb.mv[1][i4q][1] ) >= mvy_limit )))\
                            {\
                                bS[i] = 1;\
                            }\
                        }\
                    }\
                }\
            }\
        }

        /* i_dir == 0 -> vertical edge
         * i_dir == 1 -> horizontal edge */
        #define DEBLOCK_DIR(i_dir)\
        {\
            int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
            int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
            DECLARE_ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
            if( i_edge )\
                i_edge+= b_8x8_transform;\
            else\
            {\
                mbn_xy  = i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride;\
                mbn_8x8 = i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8;\
                mbn_4x4 = i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4;\
                if( b_interlaced && i_dir == 1 )\
                {\
                    mbn_xy -= h->mb.i_mb_stride;\
                    mbn_8x8 -= 2 * s8x8;\
                    mbn_4x4 -= 4 * s4x4;\
                }\
                else if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
                {\
                    FILTER_DIR( _intra, i_dir );\
                    goto end##i_dir;\
                }\
                DEBLOCK_STRENGTH(i_dir);\
                if( *(uint32_t*)bS )\
                    FILTER_DIR( , i_dir);\
                end##i_dir:\
                i_edge += b_8x8_transform+1;\
            }\
            mbn_xy  = mb_xy;\
            mbn_8x8 = mb_8x8;\
            mbn_4x4 = mb_4x4;\
            for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
            {\
                DEBLOCK_STRENGTH(i_dir);\
                if( *(uint32_t*)bS )\
                    FILTER_DIR( , i_dir);\
            }\
        }

        DEBLOCK_DIR(0); //!< 垂直邊界濾波
         DEBLOCK_DIR(1); //!< 水平邊界濾波
    }

    if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
        munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, restore_cavlc_nnz_row );
}

函數中間有一大段都是用宏來寫的，個人建議剛開始看時大概根據宏名判斷它的功能就行，直接跳到最後兩個DEBLOCK_DIR的宏，通過設置斷點的方式對它們進行調試，從而先熟悉濾波的實際工作過程，等之後再回過頭來分析這幾個宏的具體實現，畢竟這幾個宏所作的工作主要是確定bS值，再選擇合適的濾波函數進行實際的濾波。