CostEstimateGroup::estimateCUCost()

/*
    Estimates the low-resolution cost of the CU at (cuX, cuY) in frame b, using
    frame p0 as the forward (list 0) reference and frame p1 as the backward
    (list 1) reference. All motion-estimation work is done through tld.me.
    The final per-CU cost is the best of the candidates evaluated below
    (list 0, list 1, bidirectional, and - for non-bidirectional CUs - intra).

    Parameters:
        tld       - lookahead thread-local data; its motion estimator (tld.me) is used
        cuX, cuY  - CU column / row in the CU grid
        p0, p1, b - indices into m_frames of the forward reference, the backward
                    reference and the frame being analysed
        bDoSearch - per list: true to run a motion search, false to reuse the
                    cost already stored in fenc
        lastRow   - true when no row below this CU may be used for MV candidates
        slice     - negative: accumulate into fenc's frame totals;
                    otherwise: accumulate into m_slice[slice]
        hme       - true to work on the lowerRes planes / 4x4 grid; in that mode
                    the function returns right after the per-list searches

    Outline:
        1.  Fetch fref0 (p0), fref1 (p1) and fenc (b) from m_frames.
        2.  Pick widthInCU / heightInCU (m_4x4* when hme, m_8x8* otherwise).
        3.  Compute the linear CU index cuXY.
        4.  cuSize = X265_LOWRES_CU_SIZE.
        5.  Compute the pixel offset pelOffset of the CU inside the plane.
        6.  Configure tld.me with the source block (setSourcePU).
        7.  Build the MV search bounds [mvmin, mvmax].
        8.  For each list (list 0, plus list 1 when b < p1):
            1. Bind fencCost to the stored per-CU MV cost.
            2. If bDoSearch[i] is false, reuse the stored cost and continue.
            3. Locate the stored MV for this CU (fencMV).
            4. Build the candidate set mvc (at most 5 entries) from already
               stored neighbours: right, below, below-left, below-right, and -
               non-hme only - the lowerRes MV scaled by 2.
            5. Motion-compensate each candidate, measure its SATD and keep the
               cheapest one as mvp.
            6. Run motionEstimate() around mvp inside [mvmin, mvmax]; the result
               is written to fencCost / *fencMV.
            7. Update the best cost (bcost) and the list it came from (listused).
        9.  When hme is set, return here.
        10. For bidirectional CUs (b < p1): evaluate the average of the two
            motion-compensated predictions and the average of the two co-located
            blocks, then add lowresPenalty.
        11. Otherwise: add lowresPenalty, then compare against fenc->intraCost.
        12. Decide whether this CU counts towards the frame score (edge CUs do not).
        13. Derive the AQ-scaled cost bcostAq.
        14. For frame-score CUs, add bcost / bcostAq to the frame or slice totals.
        15. Add bcostAq to the row total.
        16. Store the CU's cost and listused in fenc->lowresCosts.
*/
void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme)
{
    // Forward reference frame (p0), backward reference frame (p1) and the frame being analysed (b)
    Lowres *fref0 = m_frames[p0];
    Lowres *fref1 = m_frames[p1];
    Lowres *fenc  = m_frames[b];

    // Use the weighted forward reference only when it is flagged as weighted and hme is off;
    // otherwise use the plain forward reference
    ReferencePlanes *wfref0 = (fenc->weightedRef[b - p0].isWeighted && !hme) ? &fenc->weightedRef[b - p0] : fref0;

    // CU grid dimensions: the 4x4 grid when hme, the 8x8 grid otherwise
    const int widthInCU = hme ? m_lookahead.m_4x4Width : m_lookahead.m_8x8Width;
    const int heightInCU = hme ? m_lookahead.m_4x4Height : m_lookahead.m_8x8Height;
    // Bidirectional when the backward reference lies after b (p1 > b)
	const int bBidir = (b < p1);
    // Linear index of this CU in the CU grid
    const int cuXY = cuX + cuY * widthInCU;
    // Index used below to read the lowerRes MV / cost arrays in non-hme mode
    // NOTE(review): presumably the co-located CU in the half-size grid - confirm
    const int cuXY_4x4 = (cuX / 2) + (cuY / 2) * widthInCU / 2;
    // Low-resolution CU size
    const int cuSize = X265_LOWRES_CU_SIZE;
    // Pixel offset of this CU inside the plane (half the luma stride is used when hme)
    const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * (hme ? fenc->lumaStride/2 : fenc->lumaStride);

    // Configure the motion estimator with the source block, but only when something will
    // actually be evaluated (bidirectional CU, or a search requested for either list).
    // hme reads the lowerRes plane with half the stride; otherwise the lowres plane is used.
    if ((bBidir || bDoSearch[0] || bDoSearch[1]) && hme)
        tld.me.setSourcePU(fenc->lowerResPlane[0], fenc->lumaStride / 2, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);
	else if((bBidir || bDoSearch[0] || bDoSearch[1]) && !hme)
        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);


    /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
    int lowresPenalty = 4;
    // Distance from b to each reference; used to index the per-distance MV / cost arrays
    int listDist[2] = { b - p0, p1 - b};

    MV mvmin, mvmax;
    int bcost = tld.me.COST_MAX;
    int listused = 0;

    // TODO: restrict to slices boundaries
    // establish search bounds that don't cross extended frame boundaries
    // (the CU grid extent plus a margin of 8 on every side)
    mvmin.x = (int32_t)(-cuX * cuSize - 8);
    mvmin.y = (int32_t)(-cuY * cuSize - 8);
    mvmax.x = (int32_t)((widthInCU - cuX - 1) * cuSize + 8);
    mvmax.y = (int32_t)((heightInCU - cuY - 1) * cuSize + 8);

    // Iterate over the prediction lists: list 0 only, or list 0 and list 1 when bBidir
    for (int i = 0; i < 1 + bBidir; i++)
    {
        // Reference to the stored MV cost of this CU for list i at this reference distance
        int& fencCost = hme ? fenc->lowerResMvCosts[i][listDist[i]][cuXY] : fenc->lowresMvCosts[i][listDist[i]][cuXY];
        int skipCost = INT_MAX;

        // No search requested for this list: reuse the stored cost
        // (presumably computed by an earlier call - confirm against caller)
        if (!bDoSearch[i])
        {
            COPY2_IF_LT(bcost, fencCost, listused, i + 1);
            continue;
        }

        int numc = 0;
        MV mvc[5], mvp;
        // Stored low-resolution MV of this CU; neighbours are reached by indexing off it
        MV* fencMV = hme ? &fenc->lowerResMvs[i][listDist[i]][cuXY] : &fenc->lowresMvs[i][listDist[i]][cuXY];
        ReferencePlanes* fref = i ? fref1 : wfref0;

        /* Reverse-order MV prediction
            Build the MV candidate set mvc from the right and lower neighbours,
            i.e. mirrored relative to the usual left / above predictor positions.
            NOTE(review): this presumably relies on the caller visiting CUs in
            reverse order so those neighbours are already estimated - confirm.
            Open question from the original author: why is reverse order needed
            instead of the normal order? */
#define MVC(mv) mvc[numc++] = mv;
        // Not in the last column: add the right neighbour's MV
        if (cuX < widthInCU - 1)
            MVC(fencMV[1]);
        // Row below is available
        if (!lastRow)
        {
            // Add the MV of the CU directly below
            MVC(fencMV[widthInCU]);
            // Not in the first column
            if (cuX > 0)
                // Add the below-left CU's MV
                MVC(fencMV[widthInCU - 1]);
            // Not in the last column
            if (cuX < widthInCU - 1)
                // Add the below-right CU's MV
                MVC(fencMV[widthInCU + 1]);
        }

        // Non-hme only: when lowerRes MVs exist and the lowerRes cost at cuXY_4x4 is
        // positive, add that lowerRes MV scaled by 2 as a further candidate
        if (fenc->lowerResMvs[0][0] && !hme && fenc->lowerResMvCosts[i][listDist[i]][cuXY_4x4] > 0)
        {
            MVC((fenc->lowerResMvs[i][listDist[i]][cuXY_4x4]) * 2);
        }
#undef MVC

        // No candidates: start the search from the zero vector
        if (!numc)
            mvp = 0;
        // At least one candidate: pick the cheapest one as mvp
        else
        {
            ALIGN_VAR_32(pixel, subpelbuf[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
            int mvpcost = MotionEstimate::COST_MAX;

            /* measure SATD cost of each neighbor MV (estimating merge analysis)
             * and use the lowest cost MV as MVP (estimating AMVP). Since all
             * mvc[] candidates are measured here, none are passed to motionEstimate */
            // Walk every candidate MV
            for (int idx = 0; idx < numc; idx++)
            {
                intptr_t stride = X265_LOWRES_CU_SIZE;
                // Motion-compensate the reference with this candidate MV
                pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride, hme);
                // SATD of the prediction against the source block
				int cost = tld.me.bufSATD(src, stride);
                // Keep the cheapest candidate and its cost
				COPY2_IF_LT(mvpcost, cost, mvp, mvc[idx]);
               
                /* Except for mv0 case, everything else is likely to have enough residual to not trigger the skip. */
                // If the best mvp so far is the zero vector and the CU is bidirectional,
                // record the current candidate's cost as the potential skip cost
				if (!mvp.notZero() && bBidir)
                    skipCost = cost;
            }
        }

        /* ME will never return a cost larger than the cost @MVP, so we do not
         * have to check that ME cost is more than the estimated merge cost
         * (the search range includes mvp itself) */
        // Run motion estimation around mvp; the cost goes to fencCost and the MV to *fencMV.
        // The hme variant additionally passes the reference's lowerRes plane.
        if(!hme)
            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);
        else
            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices, fref->lowerResPlane[0]);
        
        // Bidirectional only: if skipCost is below 64 and also below the ME cost,
        // treat the CU as skip - take skipCost and reset the stored MV to zero
		if (skipCost < 64 && skipCost < fencCost && bBidir)
        {
            fencCost = skipCost;
            *fencMV = 0;
        }

        // Update the best cost and remember which list produced it
        // listused = 0  intra
        //          = 1  forward (list 0)
        //          = 2  backward (list 1)
        //          = 3  bidirectional
        COPY2_IF_LT(bcost, fencCost, listused, i + 1);
    }	// end of for (int i = 0; i < 1 + bBidir; i++)

    // hme pass stops here: none of the bidir / intra / accumulation steps below run
    if (hme)
        return;

    // Bidirectional CU: also evaluate the two bidirectional candidates
    if (bBidir) /* B, also consider bidir */
    {
        /* NOTE: the wfref0 (weightp) is not used for BIDIR */

        /* avg(l0-mv, l1-mv) candidate */
        ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
        ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
        intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
        pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0, 0);
        pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1, 0);
        ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
        // Average the forward and backward motion-compensated predictions into ref
		primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
        // SATD of the averaged (bidirectional) prediction
		int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
        // Keep it if it beats the best cost so far
        COPY2_IF_LT(bcost, bicost, listused, 3);

        /* co-located candidate */
        // Co-located block in the forward reference
        src0 = fref0->lowresPlane[0] + pelOffset;
        // Co-located block in the backward reference
        src1 = fref1->lowresPlane[0] + pelOffset;
        // Average the two co-located blocks into ref
        primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
        // SATD of the averaged co-located prediction
		bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
        // Keep it if it beats the best cost so far
		COPY2_IF_LT(bcost, bicost, listused, 3);

        // Add lowresPenalty to the best inter cost
        bcost += lowresPenalty;
    }
    // Non-bidirectional CU: intra is allowed, so compare the inter cost against intra
    else /* P, also consider intra */
    {
        // Add lowresPenalty to the inter cost before the comparison
        bcost += lowresPenalty;

        // If the stored intra cost is lower than the penalised inter cost, choose intra
        if (fenc->intraCost[cuXY] < bcost)
        {
            bcost = fenc->intraCost[cuXY];
            listused = 0;	// listused = 0 means intra
        }
    }

    /* do not include edge blocks in the frame cost estimates, they are not very accurate */
    // True for interior CUs, or for every CU when the grid is at most 2 CUs wide or high
    const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
                                cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
    // AQ-scaled cost: bcost * factor, rounded (+128) and shifted right by 8.
    // The 8x8 factor table is used when rc.qgSize == 8; falls back to bcost for
    // non-frame-score CUs or when fenc->invQscaleFactor is null.
	int bcostAq;
    if (m_lookahead.m_param->rc.qgSize == 8)
        bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor8x8[cuXY] + 128) >> 8) : bcost;
    else
        bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor[cuXY] +128) >> 8) : bcost;

    // Frame-score CUs only: add the cost and AQ cost to the frame totals (slice < 0)
    // or to the per-slice totals, and count the CU as intra when listused == 0 on a
    // non-bidirectional CU
    if (bFrameScoreCU)
    {
        if (slice < 0)
        {
            fenc->costEst[b - p0][p1 - b] += bcost;
            fenc->costEstAq[b - p0][p1 - b] += bcostAq;
            if (!listused && !bBidir)
                fenc->intraMbs[b - p0]++;
        }
        else
        {
            m_slice[slice].costEst += bcost;
            m_slice[slice].costEstAq += bcostAq;
            if (!listused && !bBidir)
                m_slice[slice].intraMbs++;
        }
    }

    // Add this CU's AQ cost to its row total (done for every CU, edge CUs included)
    fenc->rowSatds[b - p0][p1 - b][cuY] += bcostAq;
    // Store this CU's cost, clamped to LOWRES_COST_MASK, with listused packed in at LOWRES_COST_SHIFT
    fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
}
CostEstimateGroup::estimateCUCost()

985 碩士程序員，空窗 4 個月沒有 Offer！

一文搞懂 Spring 循環依賴

賽博鬥地主——使用大語言模型扮演Agent智能體玩牌類遊戲。

VScode右鍵打開(添加到右鍵)

記一次 .NET某工控視覺自動化系統卡死分析

LookaheadTLD::calcAdaptiveQuantFrame()

LookaheadTLD::lowresIntraEstimate()

CostEstimateGroup::estimateCUCost()

x265多線程-線程/線程池

x265多線程-event

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結