Search::estIntraPredQT()

粗粒度率失真計算：

rdcost = satd(fenc, pred) + lambda * IPM_bits，其中satd（下方程式碼實際使用的是sa8d）在一定程度上反映了殘差在頻域的能量，彌補了IPM_bits沒有計入殘差係數等bits開銷的不足。該方法的計算/時間開銷小，因爲不需要進行變換/量化/反量化/反變換等過程；但其結果只具有一定代表性，所得到的最優未必是嚴格意義上的最優，只是較有可能接近最優。
細粒度率失真計算：

rdcost = sse(fenc, recon) + lambda * all_bits，該方法嚴格計算了原始幀和重建幀之間的distortion，並執行了整個編碼流程，包括變換/量化/反量化/反變換等等，是真正意義上的拉格朗日率失真代價計算，其計算的最優就是嚴格意義上的最優，但是計算成本大。
X265在分析最優幀內預測方向中，採用了兩種結合的方式，先使用粗粒度計算方式得到一些可能的最優幀內預測方向備選集，然後在這些備選集中用細粒度計算方式得到最後嚴格意義上的最優幀內預測方向
/*
	Search::estIntraPredQT

	Chooses a luma intra prediction mode for every PU of the current CU and
	returns the accumulated distortion (sum of icosts.distortion over all PUs).

	Parameters:
		intraMode  - mode being evaluated; supplies the CU (intraMode.cu) and the
		             fenc / pred / recon Yuv buffers
		cuGeom     - geometry of the CU (depth, log2CUSize, numPartitions, absPartIdx)
		depthRange - forwarded unchanged to codeIntraLumaQT
	Returns:
		totalDistortion for the whole CU

	Outline:
		1. Derive depth, initTuDepth, TU size, number of PUs, etc.
		2. Decide whether the transform-skip path (codeIntraLumaTSkip) is used
		3. For every PU of the CU:
			1. Pick the prediction mode (bmode)
				- if cu.m_lumaIntraDir[puIdx] is not ALL_IDX, use that mode directly
				- otherwise search for it:
					1. collect neighbour availability (initIntraNeighbors)
					2. prepare the neighbour reference samples (initAdiPattern)
					3. get the MPM information and rbits, the bit cost used for
					   modes that are not an MPM (getIntraRemModeBits)
					4. DC: predict, get mode bits, distortion = sa8d(fenc, pred),
					   modeCosts[DC] = calcRdSADCost(distortion, bits); this seeds bcost
					5. PLANAR: same, using intraNeighbourBuf[1] when 8 <= tuSize <= 32
					   and intraNeighbourBuf[0] otherwise; bcost is lowered if cheaper
					6. angular modes 2..34:
						- if the intra_pred_allangs primitive is available:
							1. transpose fenc into m_fencTransposed
							2. produce all 33 angular predictions in one call
							3. for each mode: modes 2..17 are measured with sa8d against
							   the transposed fenc, modes 18..34 against fenc itself
						- otherwise predict each mode individually and measure sa8d
						  against fenc
					7. keep at most maxCandCount candidates whose cost is below
					   1.25 * bcost, or whose mode equals mpmModes[0]
					8. for each candidate: reload the entropy coder state, set the
					   mode, run codeIntraLumaTSkip / codeIntraLumaQT (the latter with
					   its split flag set to false) and keep the mode with the lowest
					   icosts.rdcost as bmode
			2. Set bmode on the PU and reload the entropy coder state
			3. Run codeIntraLumaTSkip / codeIntraLumaQT again for bmode (the latter
			   with its split flag set to true)
			4. Add icosts.distortion to totalDistortion
			5. Call extractIntraResultQT for this PU
			6. If this is not the last PU, copy its reconstruction into the recon
			   picture so the next PU can predict from it
		4. If the CU has more than one PU, OR the four quarter CBFs into cu.m_cbf[0][0]
		5. Reload the entropy coder state and return totalDistortion
*/
sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2])
{
    CUData& cu = intraMode.cu;

    // Source (original), prediction and reconstruction buffers of this mode
    const Yuv* fencYuv = intraMode.fencYuv;
	Yuv* predYuv = &intraMode.predYuv;
	Yuv* reconYuv = &intraMode.reconYuv;

    uint32_t depth        = cuGeom.depth;					// CU depth
    uint32_t initTuDepth  = cu.m_partSize[0] != SIZE_2Nx2N;	// initial TU depth: 0 for SIZE_2Nx2N, 1 otherwise
    uint32_t numPU        = 1 << (2 * initTuDepth);			// number of PUs: 1 when initTuDepth is 0, 4 when it is 1
    uint32_t log2TrSize   = cuGeom.log2CUSize - initTuDepth;// log2 of the TU size
    uint32_t tuSize       = 1 << log2TrSize;				// TU size in pixels
    uint32_t qNumParts    = cuGeom.numPartitions >> 2;		// quarter of the CU's partitions; absPartIdx advances by this per PU
    uint32_t sizeIdx      = log2TrSize - 2;					// index into primitives.cu[] for this TU size
    uint32_t absPartIdx   = 0;
    sse_t totalDistortion = 0;

    // Use the transform-skip path only when the PPS enables it, the CU is not
    // transquant-bypassed and the partition is not SIZE_2Nx2N
    int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;

    // loop over partitions (one iteration per PU)
    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
    {
        uint32_t bmode = 0;

        // A mode other than ALL_IDX was already assigned to this PU: use it and skip the search
        if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX)
            bmode = intraMode.cu.m_lumaIntraDir[puIdx];
        // Otherwise search for the best prediction mode
        else
        {
            uint64_t candCostList[MAX_RD_INTRA_MODES];
            uint32_t rdModeList[MAX_RD_INTRA_MODES];
            uint64_t bcost;
            int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);

            {
                ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);

                // Reference sample smoothing
                IntraNeighbors intraNeighbors;
                // Collect availability information about the neighbouring reference samples
                initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
                // Prepare the neighbour sample buffers (presumably fills intraNeighbourBuf[0]
                // with padded samples and [1] with the smoothed version - confirm in initAdiPattern)
				initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);

                // determine set of modes to be tested (using prediction signal only)
                // Source pixels of this PU and the stride used for source and prediction
                const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
                uint32_t stride = predYuv->m_size;

                int scaleTuSize = tuSize;
                int scaleStride = stride;
                int costShift = 0;

                // Loads intra-direction-mode state from m_rqt[depth].cur into the entropy coder
                // NOTE(review): exact contents are not visible here - inferred from the call name
                m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);

                /* there are three cost tiers for intra modes:
                *  pred[0]          - mode probable, least cost
                *  pred[1], pred[2] - less probable, slightly more cost
                *  non-mpm modes    - all cost the same (rbits) */
                uint64_t mpms;			// bitmask of MPM modes; bit `mode` is tested below via (1 << mode)
                uint32_t mpmModes[3];	// the three most-probable-mode candidates
                // Get the MPM information and rbits, the bit cost used for any mode that is not an MPM
                uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);

                // sa8d comparison primitive for this block size
                pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
               
                // Cost of each of the 35 intra prediction modes
				uint64_t modeCosts[35];

                /* DC prediction: compute its bits, sa8d distortion and cost; the cost seeds bcost */
                primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
                // Mode signalling bits only: MPM cost if DC is an MPM, rbits otherwise
				uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
                // sa8d distortion between source and prediction (the variable is named sad but holds sa8d)
				uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
                // Combine distortion and bits into a cost
				modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);

                /* PLANAR prediction: compute its bits, sa8d distortion and cost; lower bcost if cheaper */
                // Use intraNeighbourBuf[1] when 8 <= tuSize <= 32, intraNeighbourBuf[0] otherwise
                // (presumably [1] holds the smoothed reference samples - TODO confirm)
                pixel* planar = intraNeighbourBuf[0];
                if (tuSize >= 8 && tuSize <= 32)
                    planar = intraNeighbourBuf[1];
                // PLANAR prediction
                primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
                // Mode signalling bits
				bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
                // Distortion
				sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
                // Cost
				modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
                // Keep the lowest cost seen so far
				COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);

                /* Angular modes 2..34: compute bits, sa8d distortion and cost for each, updating bcost.
                   intra_pred_allangs produces all 33 angular predictions in a single call. */
                // Batched path: the all-angles primitive is available for this size
                if (primitives.cu[sizeIdx].intra_pred_allangs)
                {
                    /* Transpose the source block into m_fencTransposed. Modes 2..17 are compared
                       against the transposed source below (presumably because the all-angles
                       primitive emits those predictions transposed - confirm against the primitive). */
                    primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
                    // Predict all angular modes; results are stored consecutively in m_intraPredAngs,
                    // scaleTuSize * scaleTuSize pixels per mode starting with mode 2
					primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
                    // Walk angular modes 2..34
					for (int mode = 2; mode < 35; mode++)
                    {
                        // Mode signalling bits
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
                        // Modes 2..17: sa8d against the transposed source
						if (mode < 18)
                            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
                        // Modes 18..34: sa8d against the original source
						else
                            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
                        // Cost
						modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                        // Keep the lowest cost seen so far
                        COPY1_IF_LT(bcost, modeCosts[mode]);
                    }
                }
                // Fallback path: predict each angular mode individually
                else
                {
                    // Walk angular modes 2..34
                    for (int mode = 2; mode < 35; mode++)
                    {
                        // Mode signalling bits
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
                        // Select neighbour buffer 0 or 1 from the per-mode filter flags and the TU size
						int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
                        // Predict in direction `mode`
						primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
                        // sa8d distortion
						sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
                        // Cost
						modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                        // Keep the lowest cost seen so far
						COPY1_IF_LT(bcost, modeCosts[mode]);
                    }
                }
                /* At this point bcost and modeCosts[35] hold the estimates
                       cost = calcRdSADCost(sa8d, mode_bits)
                   which ignore residual coding. They are meaningful but not exact,
                   so they are only used below to narrow the set of modes that get
                   the full rate-distortion evaluation. */


                /* Find the top maxCandCount candidate modes with cost within 25% of best
                * or among the most probable modes. maxCandCount is derived from the
                * rdLevel and depth. In general we want to try more modes at slower RD
                * levels and at higher depths */

                // Mark every candidate slot as unused
                for (int i = 0; i < maxCandCount; i++)
                    candCostList[i] = MAX_INT64;

                // Threshold: 1.25 times the best estimated cost
                uint64_t paddedBcost = bcost + (bcost >> 2); // 1.25 * bcost

                // Offer every mode that passes the filter to updateCandList, which maintains
                // up to maxCandCount entries in rdModeList / candCostList
                for (int mode = 0; mode < 35; mode++)
                    // Passes if its estimated cost is below the threshold or it equals mpmModes[0]
                    if ((modeCosts[mode] < paddedBcost) || ((uint32_t)mode == mpmModes[0])) 
                        /* choose for R-D analysis only if this mode passes cost threshold or matches MPM[0] */
                        updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
            }

            /* measure best candidates using simple RDO (no TU splits) */
            bcost = MAX_INT64;

            // Evaluate every candidate with the full rate-distortion cost
            for (int i = 0; i < maxCandCount; i++)
            {
                // A slot still holding MAX_INT64 was never filled; stop here
                // (assumes updateCandList keeps filled slots ahead of unused ones - TODO confirm)
                if (candCostList[i] == MAX_INT64)
                    break;

                ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

                // Reload the entropy coder state so each candidate starts from the same point
                m_entropyCoder.load(m_rqt[depth].cur);
                // Set the candidate prediction mode on this PU
                cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);

                Cost icosts;

                /* Code the PU with this mode and collect its costs in icosts.
                   codeIntraLumaQT is called with its fifth argument set to false here,
                   matching the "no TU splits" note above. */
                if (checkTransformSkip)
                    codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
                else
                    codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
                
                // Keep the candidate with the lowest rdcost as bmode
				COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
            }
            /*
                bmode now holds the candidate with the lowest rdcost, and bcost that cost
            */
        }

        ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

        /* remeasure best mode, allowing TU splits */
        // Set the chosen mode on this PU again (the candidate loop may have left another mode set)
        cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
        // Reload the entropy coder state
        m_entropyCoder.load(m_rqt[depth].cur);

        // Code the PU once more with the chosen mode
        Cost icosts;
        // This time codeIntraLumaQT is called with its fifth argument set to true
        if (checkTransformSkip)
            codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
        else
            codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
        
        // Accumulate this PU's distortion
		totalDistortion += icosts.distortion;

        // Extract the result for this PU into cu / reconYuv
        // (presumably the chosen coefficients and reconstruction - TODO confirm in extractIntraResultQT)
        extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);

        // set reconstruction for next intra prediction blocks
        // Not the last PU: copy its reconstructed luma into the recon picture for the next PU
        if (puIdx != numPU - 1)
        {
            /* This has important implications for parallelism and RDO.  It is writing intermediate results into the
             * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
             * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
             * that the contexts should be tracked through each PU */
            PicYuv*  reconPic = m_frame->m_reconPic;
            pixel*   dst       = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
            uint32_t dststride = reconPic->m_stride;
            const pixel*   src = reconYuv->getLumaAddr(absPartIdx);
            uint32_t srcstride = reconYuv->m_size;
            primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);
        }
    }// end of for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)

    // The CU was split into several (four) PUs
    if (numPU > 1)
    {
        uint32_t combCbfY = 0;
        // OR together the luma CBF of the four quarters
        for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
            combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
        // Merge the combined value into the luma CBF of partition 0
        cu.m_cbf[0][0] |= combCbfY;
    }

    // TODO: remove this; reloads the entropy coder state from m_rqt[depth].cur
    m_entropyCoder.load(m_rqt[depth].cur);

    return totalDistortion;
}
Search::estIntraPredQT()

粗粒度率失真計算：

細粒度率失真計算：

Search::checkIntra()

Search::codeIntraLumaQT()

Search::estIntraPredQT()

Predict::initAdiPattern()

Analysis::checkBidir2Nx2N()

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結