/*****************************************************************************
 * Copyright (C) 2013-2020 MulticoreWare, Inc
 *
 * Authors: Deepthi Nandakumar
 *          Steve Borho
 *          Min Chen
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "frame.h"
#include "framedata.h"
#include "picyuv.h"
#include "primitives.h"
#include "threading.h"

#include "analysis.h"
#include "rdcost.h"
#include "encoder.h"

using namespace X265_NS;

/* An explanation of rate distortion levels (--rd-level)
 *
 * rd-level 0 generates no recon per CU (NO RDO or Quant)
 *
 *   sa8d selection between merge / skip / inter / intra and split
 *   no recon pixels generated until CTU analysis is complete, requiring
 *   intra predictions to use source pixels
 *
 * rd-level 1 uses RDO for merge and skip, sa8d for all else
 *
 *   RDO selection between merge and skip
 *   sa8d selection between (merge/skip) / inter modes / intra and split
 *   intra prediction uses reconstructed pixels
 *
 * rd-level 2 uses RDO for merge/skip and split
 *
 *   RDO selection between merge and skip
 *   sa8d selection between (merge/skip) / inter modes / intra
 *   RDO split decisions
 *
 * rd-level 3 uses RDO for merge/skip/best inter/intra
 *
 *   RDO selection between merge and skip
 *   sa8d selection of best inter mode
 *   sa8d decisions include chroma residual cost
 *   RDO selection between (merge/skip) / best inter mode / intra / split
 *
 * rd-level 4 enables RDOQuant
 *   chroma residual cost included in satd decisions, including subpel refine
 *   (as a result of --subme 3 being used by preset slow)
 *
 * rd-levels 5 and 6 do RDO for each inter mode
 */

Analysis::Analysis()
{
    m_reuseInterDataCTU = NULL;
    m_reuseRef = NULL;
    m_bHD = false;
    m_modeFlag[0] = false;
    m_modeFlag[1] = false;
    m_checkMergeAndSkipOnly[0] = false;
    m_checkMergeAndSkipOnly[1] = false;
    m_evaluateInter = 0;
}

bool Analysis::create(ThreadLocalData *tld)
{
    m_tld = tld;
    m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;

    int costArrSize = 1;
    uint32_t maxDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
    for (uint32_t i = 1; i <= maxDQPDepth; i++)
        costArrSize += (1 << (i * 2));
    cacheCost = X265_MALLOC(uint64_t, costArrSize);

    int csp = m_param->internalCsp;
    uint32_t cuSize = m_param->maxCUSize;

    bool ok = true;
    for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++, cuSize >>= 1)
    {
        ModeDepth &md = m_modeDepth[depth];
        ok &= md.cuMemPool.create(depth, csp, MAX_PRED_TYPES, *m_param);
        ok &= md.fencYuv.create(cuSize, csp);
        if (ok)
        {
            for (int j = 0; j < MAX_PRED_TYPES; j++)
            {
                md.pred[j].cu.initialize(md.cuMemPool, depth, *m_param, j);
                ok &= md.pred[j].predYuv.create(cuSize, csp);
                ok &= md.pred[j].reconYuv.create(cuSize, csp);
                md.pred[j].fencYuv = &md.fencYuv;
            }
        }
    }
    if (m_param->sourceHeight >= 1080)
        m_bHD = true;

    return ok;
}

void Analysis::destroy()
{
    for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
    {
        m_modeDepth[i].cuMemPool.destroy();
        m_modeDepth[i].fencYuv.destroy();

        for (int j = 0; j < MAX_PRED_TYPES; j++)
        {
            m_modeDepth[i].pred[j].predYuv.destroy();
            m_modeDepth[i].pred[j].reconYuv.destroy();
        }
    }
    X265_FREE(cacheCost);
}

Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
{
    m_slice = ctu.m_slice;
    m_frame = &frame;
    m_bChromaSa8d = m_param->rdLevel >= 3;
    m_param = m_frame->m_param;

#if _DEBUG || CHECKED_BUILD
    invalidateContexts(0);
#endif

    int qp = setLambdaFromQP(ctu, m_slice->m_pps->bUseDQP ? calculateQpforCuSize(ctu, cuGeom) : m_slice->m_sliceQp);
    ctu.setQPSubParts((int8_t)qp, 0, 0);

    m_rqt[0].cur.load(initialContext);
    ctu.m_meanQP = initialContext.m_meanQP;
    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);

    if (m_param->bSsimRd)
        calculateNormFactor(ctu, qp);

    uint32_t numPartition = ctu.m_numPartitions;
    if (m_param->bCTUInfo && (*m_frame->m_ctuInfo + ctu.m_cuAddr))
    {
        x265_ctu_info_t* ctuTemp = *m_frame->m_ctuInfo + ctu.m_cuAddr;
        int32_t depthIdx = 0;
        uint32_t maxNum8x8Partitions = 64;
        uint8_t* depthInfoPtr = m_frame->m_addOnDepth[ctu.m_cuAddr];
        uint8_t* contentInfoPtr = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
        int* prevCtuInfoChangePtr = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
        do
        {
            uint8_t depth = (uint8_t)ctuTemp->ctuPartitions[depthIdx];
            uint8_t content = (uint8_t)(*((int32_t *)ctuTemp->ctuInfo + depthIdx));
            int prevCtuInfoChange = m_frame->m_prevCtuInfoChange[ctu.m_cuAddr * maxNum8x8Partitions + depthIdx];
            memset(depthInfoPtr, depth, sizeof(uint8_t) * numPartition >> 2 * depth);
            memset(contentInfoPtr, content, sizeof(uint8_t) * numPartition >> 2 * depth);
            memset(prevCtuInfoChangePtr, 0, sizeof(int) * numPartition >> 2 * depth);
            for (uint32_t l = 0; l < numPartition >> 2 * depth; l++)
                prevCtuInfoChangePtr[l] = prevCtuInfoChange;
            depthInfoPtr += ctu.m_numPartitions >> 2 * depth;
            contentInfoPtr += ctu.m_numPartitions >> 2 * depth;
            prevCtuInfoChangePtr += ctu.m_numPartitions >> 2 * depth;
            depthIdx++;
        } while (ctuTemp->ctuPartitions[depthIdx] != 0);

        m_additionalCtuInfo = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
        m_prevCtuInfoChange = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
        memcpy(ctu.m_cuDepth, m_frame->m_addOnDepth[ctu.m_cuAddr], sizeof(uint8_t) * numPartition);
        // Calculate log2CUSize from depth
        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
            ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
    }
    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && (m_slice->m_sliceType != I_SLICE))
    {
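        /* Multi-pass refine: point the m_reuse* pointers at this CTU's slice of
         * the motion data written by the earlier encode pass; ME can then seed
         * its search from the prior pass's MVs, MVP indices and reference picks */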
        int numPredDir = m_slice->isInterP() ? 1 : 2;
        m_reuseInterDataCTU = m_frame->m_analysisData.interData;
        for (int dir = 0; dir < numPredDir; dir++)
        {
            m_reuseMv[dir] = &m_reuseInterDataCTU->mv[dir][ctu.m_cuAddr * ctu.m_numPartitions];
            m_reuseMvpIdx[dir] = &m_reuseInterDataCTU->mvpIdx[dir][ctu.m_cuAddr * ctu.m_numPartitions];
        }
        m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * ctu.m_numPartitions];
        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
    }

    int reuseLevel = X265_MAX(m_param->analysisSaveReuseLevel, m_param->analysisLoadReuseLevel);
    if ((m_param->analysisSave || m_param->analysisLoad) && m_slice->m_sliceType != I_SLICE && reuseLevel > 1 && reuseLevel < 10)
    {
        int numPredDir = m_slice->isInterP() ? 1 : 2;
        m_reuseInterDataCTU = m_frame->m_analysisData.interData;
        if (((m_param->analysisSaveReuseLevel > 1) && (m_param->analysisSaveReuseLevel < 7)) ||
            ((m_param->analysisLoadReuseLevel > 1) && (m_param->analysisLoadReuseLevel < 7)))
            m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
        if (reuseLevel > 4)
        {
            m_reusePartSize = &m_reuseInterDataCTU->partSize[ctu.m_cuAddr * ctu.m_numPartitions];
            m_reuseMergeFlag = &m_reuseInterDataCTU->mergeFlag[ctu.m_cuAddr * ctu.m_numPartitions];
        }
        if (m_param->analysisSave && !m_param->analysisLoad)
            for (int i = 0; i < X265_MAX_PRED_MODE_PER_CTU * numPredDir; i++)
                m_reuseRef[i] = -1;
    }
    ProfileCUScope(ctu, totalCTUTime, totalCTUs);

#if ENABLE_SCC_EXT
    memset(m_ibc.m_BVs, 0, sizeof(m_ibc.m_BVs));
    memset(m_ibc.m_lastIntraBCMv, 0, sizeof(m_ibc.m_lastIntraBCMv));
    m_ibc.m_numBV16s = 0;
    m_ibc.m_numBVs = 0;
#endif
    if (m_slice->m_sliceType == I_SLICE || (m_param->bEnableSCC && (m_slice->m_numRefIdx[0] == 1) && m_slice->m_refPOCList[0][0] == m_slice->m_poc))
    {
        x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
        if (m_param->analysisLoadReuseLevel > 1)
        {
            memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
            memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
        }
#if ENABLE_SCC_EXT
        compressIntraCU(ctu, cuGeom, qp, &m_ibc);
#else
        compressIntraCU(ctu, cuGeom, qp);
#endif
    }
    else
    {
        bool bCopyAnalysis = ((m_param->analysisLoadReuseLevel == 10) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16));
        bool bCompressInterCUrd0_4 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel <= 4);
        bool bCompressInterCUrd5_6 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel >= 5 && m_param->rdLevel <= 6);
        bCopyAnalysis = bCopyAnalysis || bCompressInterCUrd0_4 || bCompressInterCUrd5_6;

        if (bCopyAnalysis)
        {
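            /* Full analysis reuse: adopt depth, prediction mode and partition
             * size for the whole CTU from the analysis load buffer instead of
             * re-deciding them here */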
            x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
            int posCTU = ctu.m_cuAddr * numPartition;
            memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
            for (int list = 0; list < m_slice->isInterB() + 1; list++)
                memcpy(ctu.m_skipFlag[list], &m_frame->m_analysisData.modeFlag[list][posCTU], sizeof(uint8_t) * numPartition);

            if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
            {
                x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
                memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
                memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
            }
            // Calculate log2CUSize from depth
            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
                ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
        }
        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
            ctu.m_cuPelX / m_param->maxCUSize >= frame.m_encData->m_pir.pirStartCol &&
            ctu.m_cuPelX / m_param->maxCUSize < frame.m_encData->m_pir.pirEndCol)
            compressIntraCU(ctu, cuGeom, qp);
        else if (!m_param->rdLevel)
        {
            /* In RD Level 0/1, copy source pixels into the reconstructed block so
             * they are available for intra predictions */
            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic[0], ctu.m_cuAddr, 0);

            compressInterCU_rd0_4(ctu, cuGeom, qp);

            /* generate residual for entire CTU at once and copy to reconPic */
            encodeResidue(ctu, cuGeom);
        }
        else if ((m_param->analysisLoadReuseLevel == 10 && (!(m_param->bAnalysisType == HEVC_INFO) || m_slice->m_sliceType != P_SLICE)) ||
                 ((m_param->bAnalysisType == AVC_INFO) && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16))
        {
            x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
            int posCTU = ctu.m_cuAddr * numPartition;
            memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
            if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
            {
                x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
                memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
                memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
            }
            // Calculate log2CUSize from depth
            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
                ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
            qprdRefine(ctu, cuGeom, qp, qp);
            return *m_modeDepth[0].bestMode;
        }
        else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
            compressInterCU_dist(ctu, cuGeom, qp);
        else if (m_param->rdLevel <= 4)
            compressInterCU_rd0_4(ctu, cuGeom, qp);
        else
#if ENABLE_SCC_EXT
            compressInterCU_rd5_6(ctu, cuGeom, qp, &m_ibc);
#else
            compressInterCU_rd5_6(ctu, cuGeom, qp);
#endif
    }

    if (m_param->bEnableRdRefine || m_param->bOptCUDeltaQP)
        qprdRefine(ctu, cuGeom, qp, qp);

    if (m_param->csvLogLevel >= 2)
        collectPUStatistics(ctu, cuGeom);

    return *m_modeDepth[0].bestMode;
}

void Analysis::collectPUStatistics(const CUData& ctu, const CUGeom& cuGeom)
{
    uint8_t depth = 0;
    uint8_t partSize = 0;
    for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
    {
        depth = ctu.m_cuDepth[absPartIdx];
        partSize = ctu.m_partSize[absPartIdx];
        uint32_t numPU = nbPartsTable[(int)partSize];
        int shift = 2 * (m_param->maxCUDepth + 1 - depth);
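        /* Each PU is tallied with weight 1ULL << shift, i.e. the number of 4x4
         * units it covers at this depth (e.g. with maxCUDepth 3, a depth-0 CU
         * gets weight 1 << 8 = 256, matching the 256 4x4 blocks in a 64x64 CTU),
         * so the per-depth totals are comparable across CU sizes; rectangular
         * partitions split that weight between their two PUs */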
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            PredictionUnit pu(ctu, cuGeom, puIdx);
            int puabsPartIdx = ctu.getPUOffset(puIdx, absPartIdx);
            int mode = 1;
            if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_Nx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxN)
                mode = 2;
            else if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnU || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnD ||
                     ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nLx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nRx2N)
                mode = 3;

            if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_SKIP)
            {
                ctu.m_encData->m_frameStats.cntSkipPu[depth] += 1ULL << shift;
                ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
            }
            else if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_INTRA)
            {
                if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_NxN)
                {
                    ctu.m_encData->m_frameStats.cnt4x4++;
                    ctu.m_encData->m_frameStats.totalPu[4]++;
                }
                else
                {
                    ctu.m_encData->m_frameStats.cntIntraPu[depth] += 1ULL << shift;
                    ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
                }
            }
            else if (mode == 3)
            {
                ctu.m_encData->m_frameStats.cntAmp[depth] += 1ULL << shift;
                ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
                break;
            }
            else
            {
                if (ctu.m_mergeFlag[puabsPartIdx + absPartIdx])
                    ctu.m_encData->m_frameStats.cntMergePu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode;
                else
                    ctu.m_encData->m_frameStats.cntInterPu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode;
                ctu.m_encData->m_frameStats.totalPu[depth] += (1 << shift) / mode;
            }
        }
    }
}

int32_t Analysis::loadTUDepth(CUGeom cuGeom, CUData parentCTU)
{
    float predDepth = 0;
    CUData* neighbourCU;
    uint8_t count = 0;
    int32_t maxTUDepth = -1;
    neighbourCU = &m_slice->m_refFrameList[0][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
    predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
    count++;
    if (m_slice->isInterB())
    {
        neighbourCU = &m_slice->m_refFrameList[1][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
        predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
        count++;
    }
    if (parentCTU.m_cuAbove)
    {
        predDepth += parentCTU.m_cuAbove->m_refTuDepth[cuGeom.geomRecurId];
        count++;
        if (parentCTU.m_cuAboveLeft)
        {
            predDepth += parentCTU.m_cuAboveLeft->m_refTuDepth[cuGeom.geomRecurId];
            count++;
        }
        if (parentCTU.m_cuAboveRight)
        {
            predDepth += parentCTU.m_cuAboveRight->m_refTuDepth[cuGeom.geomRecurId];
            count++;
        }
    }
    if (parentCTU.m_cuLeft)
    {
        predDepth += parentCTU.m_cuLeft->m_refTuDepth[cuGeom.geomRecurId];
        count++;
    }
    predDepth /= count;

    if (predDepth == 0)
        maxTUDepth = 0;
    else if (predDepth < 1)
        maxTUDepth = 1;
    else if (predDepth >= 1 && predDepth <= 1.5)
        maxTUDepth = 2;
    else if (predDepth > 1.5 && predDepth <= 2.5)
        maxTUDepth = 3;
    else
        maxTUDepth = -1;

    return maxTUDepth;
}

void Analysis::tryLossless(const CUGeom& cuGeom)
{
    ModeDepth& md = m_modeDepth[cuGeom.depth];

    if (!md.bestMode->distortion)
        /* already lossless */
        return;
    else if (md.bestMode->cu.isIntra(0))
    {
        md.pred[PRED_LOSSLESS].initCosts();
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
        PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size);
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
    }
    else
    {
        md.pred[PRED_LOSSLESS].initCosts();
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
        md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
        encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
    }
}
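/* Refine the CU-level QP: starting from 'qp', walk outward in each enabled
 * direction, re-encoding the CU at each candidate and keeping the QP with the
 * lowest RD cost.  A direction is abandoned once the cost fails to improve
 * twice in a row; with bOptCUDeltaQP only QPs between qp and the CTU mean QP
 * are probed. */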
void Analysis::qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;

    int bestCUQP = qp;
    int lambdaQP = lqp;
    bool doQPRefine = (bDecidedDepth && depth <= m_slice->m_pps->maxCuDQPDepth) || (!bDecidedDepth && depth == m_slice->m_pps->maxCuDQPDepth);
    if (m_param->analysisLoadReuseLevel >= 7)
        doQPRefine = false;
    if (doQPRefine)
    {
        uint64_t bestCUCost, origCUCost, cuCost, cuPrevCost;

        int cuIdx = (cuGeom.childOffset - 1) / 3;
        bestCUCost = origCUCost = cacheCost[cuIdx];

        int direction = m_param->bOptCUDeltaQP ? 1 : 2;

        for (int dir = direction; dir >= -direction; dir -= (direction * 2))
        {
            if (m_param->bOptCUDeltaQP && ((dir != 1) || ((qp + 3) >= (int32_t)parentCTU.m_meanQP)))
                break;

            int threshold = 1;
            int failure = 0;
            cuPrevCost = origCUCost;

            int modCUQP = qp + dir;
            while (modCUQP >= m_param->rc.qpMin && modCUQP <= QP_MAX_SPEC)
            {
                if (m_param->bOptCUDeltaQP && modCUQP > (int32_t)parentCTU.m_meanQP)
                    break;

                recodeCU(parentCTU, cuGeom, modCUQP, qp);
                cuCost = md.bestMode->rdCost;

                COPY2_IF_LT(bestCUCost, cuCost, bestCUQP, modCUQP);
                if (cuCost < cuPrevCost)
                    failure = 0;
                else
                    failure++;

                if (failure > threshold)
                    break;

                cuPrevCost = cuCost;
                modCUQP += dir;
            }
        }
        lambdaQP = bestCUQP;
    }

    recodeCU(parentCTU, cuGeom, bestCUQP, lambdaQP);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[0], parentCTU.m_cuAddr, cuGeom.absPartIdx);
}

#if ENABLE_SCC_EXT
uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc)
#else
uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
#endif
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    MV iMVCandList[4][10];
    memset(iMVCandList, 0, sizeof(MV) * 4 * 10);

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    bool bAlreadyDecided = m_param->intraRefine != 4 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX && !(m_param->bAnalysisType == HEVC_INFO);
    bool bDecidedDepth = m_param->intraRefine != 4 && parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
    int split = 0;
    if (m_param->intraRefine && m_param->intraRefine != 4)
    {
        split = m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
            ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
        if (cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize]) && !bDecidedDepth)
            bAlreadyDecided = false;
    }

    if (bAlreadyDecided)
    {
        if (bDecidedDepth && mightNotSplit)
        {
            Mode& mode = md.pred[0];
            md.bestMode = &mode;
            mode.cu.initSubCU(parentCTU, cuGeom, qp);
            bool reuseModes = !((m_param->intraRefine == 3) ||
                                (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
            if (reuseModes)
            {
                memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
                memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
            }
            checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);

            if (m_bTryLossless)
                tryLossless(cuGeom);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
        }
    }
    else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
    {
        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
        checkBestMode(md.pred[PRED_INTRA], depth);
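        /* NxN intra is only a legal partition for the smallest CUs: an 8x8 CU
         * (log2 size 3) whose SPS permits 4x4 transform units */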
        if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
        {
            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
        }

#if ENABLE_SCC_EXT
        bool intraBlockCopyFastSearch = (m_param->bEnableSCC == 1) ? true : false, bUse1DSearchFor8x8 = false;
        if (m_param->bEnableSCC)
        {
            md.pred[PRED_MERGE_IBC].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
            checkRDCostIntraBCMerge2Nx2N(md.pred[PRED_MERGE_IBC], cuGeom);

            md.pred[PRED_IBC_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
            checkIntraBC_rd5_6(md.pred[PRED_IBC_2Nx2N], cuGeom, SIZE_2Nx2N, false, bUse1DSearchFor8x8, *ibc);
            checkBestMode(md.pred[PRED_IBC_2Nx2N], depth);

            if (intraBlockCopyFastSearch)
            {
                if ((int)depth == m_slice->m_sps->log2DiffMaxMinCodingBlockSize)
                {
                    md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
                    checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
                    checkBestMode(md.pred[PRED_IBC_Nx2N], depth);

                    md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
                    checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
                    checkBestMode(md.pred[PRED_IBC_2NxN], depth);
                }
            }
            else
            {
                md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
                checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
                checkBestMode(md.pred[PRED_IBC_2NxN], depth);

                md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
                checkBestMode(md.pred[PRED_IBC_Nx2N], depth);
            }
        }
#endif

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

#if ENABLE_SCC_EXT
    // If Intra BC keep last coded Mv
    if (md.bestMode && md.bestMode->cu.isInter(0))
    {
        MVField mvField;
        const CUData* cu = &md.bestMode->cu;
        md.bestMode->cu.getMvField(cu, 0, 0, mvField);
        int iRefIdxFirst = mvField.refIdx;
        md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
        int iRefIdxLast = mvField.refIdx;
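        /* A PU is IntraBC coded when its L0 reference is the current picture
         * itself, i.e. the reference POC equals the current POC */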
        bool isIntraBCFirst = (iRefIdxFirst >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxFirst]->m_poc == cu->m_slice->m_poc : false;
        bool isIntraBCLast = (iRefIdxLast >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxLast]->m_poc == cu->m_slice->m_poc : false;

        if (isIntraBCFirst || isIntraBCLast)
        {
            if (cu->m_partSize[0] == SIZE_2Nx2N)
            {
                md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
                if (mvField.mv != cu->m_lastIntraBCMv[0])
                {
                    md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
                    md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
                }
            }
            else if (cu->m_partSize[0] == SIZE_2NxN || cu->m_partSize[0] == SIZE_Nx2N)
            {
                // mixed PU, only one partition is IntraBC coded
                if (isIntraBCFirst != isIntraBCLast)
                {
                    if (isIntraBCFirst)
                    {
                        // Part 0
                        md.bestMode->cu.getMvField(cu, 0, 0, mvField);
                        if (mvField.mv != cu->m_lastIntraBCMv[0])
                        {
                            md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
                            md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
                        }
                    }
                    else if (isIntraBCLast)
                    {
                        // Part 1
                        md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
                        if (mvField.mv != cu->m_lastIntraBCMv[0])
                        {
                            md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
                            md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
                        }
                    }
                }
                else // normal IntraBC CU
                {
                    // Part 0
                    md.bestMode->cu.getMvField(cu, 0, 0, mvField);
                    if (mvField.mv != cu->m_lastIntraBCMv[0])
                    {
                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
                    }
                    // Part 1
                    md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
                    if (mvField.mv != cu->m_lastIntraBCMv[0])
                    {
                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
                    }
                }
            }
            else
            {
                // NxN
                for (int part = 0; part < 4; part++)
                {
                    md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 4 + part, 0, mvField);
                    if (mvField.mv != cu->m_lastIntraBCMv[0])
                    {
                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
                    }
                }
            }
        }
    } // is inter
#endif

    // stop recursion if we reach the depth of previous analysis decision
    mightSplit &= !(bAlreadyDecided && bDecidedDepth) || split;

    if (mightSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom, qp);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;
        int32_t nextQP = qp;
        uint64_t curCost = 0;
        int skipSplitCheck = 0;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                m_rqt[nextDepth].cur.load(*nextContext);

                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                if (m_param->bEnableSplitRdSkip)
                {
#if ENABLE_SCC_EXT
                    curCost += compressIntraCU(parentCTU, childGeom, nextQP, ibc);
#else
                    curCost += compressIntraCU(parentCTU, childGeom, nextQP);
#endif
                    if (m_modeDepth[depth].bestMode && curCost > m_modeDepth[depth].bestMode->rdCost)
                    {
                        skipSplitCheck = 1;
                        break;
                    }
                }
                else
#if !ENABLE_SCC_EXT
                    compressIntraCU(parentCTU, childGeom, nextQP);
#else
                    compressIntraCU(parentCTU, childGeom, nextQP, ibc);
                if (nd.bestMode->cu.m_lastIntraBCMv[0].x != 0 || nd.bestMode->cu.m_lastIntraBCMv[0].y != 0)
                {
                    for (int i = 0; i < 2; i++)
                    {
                        ibc->m_lastIntraBCMv[i] = nd.bestMode->cu.m_lastIntraBCMv[i];
                    }
                }
#endif

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
            {
                /* record the depth of this non-present sub-CU */
                splitCU->setEmptyPart(childGeom, subPartIdx);

                /* Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP */
                if (bAlreadyDecided)
                    memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
            }
        }
        if (!skipSplitCheck)
        {
            nextContext->store(splitPred->contexts);
            if (mightNotSplit)
                addSplitFlagCost(*splitPred, cuGeom.depth);
            else
                updateModeCost(*splitPred);

            checkDQPForSplitPred(*splitPred, cuGeom);
            checkBestMode(*splitPred, depth);
        }
    }

    if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
    {
        int cuIdx = (cuGeom.childOffset - 1) / 3;
        cacheCost[cuIdx] = md.bestMode->rdCost;
    }

    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
    {
        CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
        int8_t maxTUDepth = -1;
        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
            maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
        ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
    }

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
    {
        for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
            md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[i], parentCTU.m_cuAddr, cuGeom.absPartIdx);
    }

    return md.bestMode->rdCost;
}

void Analysis::PMODE::processTasks(int workerThreadId)
{
#if DETAILED_CU_STATS
    int fe = master.m_modeDepth[cuGeom.depth].pred[PRED_2Nx2N].cu.m_encData->m_frameEncoderID;
    master.m_stats[fe].countPModeTasks++;
    ScopedElapsedTime pmodeTime(master.m_stats[fe].pmodeTime);
#endif
    ProfileScopeEvent(pmode);
    master.processPmode(*this, master.m_tld[workerThreadId].analysis);
}

/* process pmode jobs until none remain; may be called by the master thread or by
 * a bonded peer (slave) thread via pmodeTasks() */
void Analysis::processPmode(PMODE& pmode, Analysis& slave)
{
    /* acquire a mode task, else exit early */
    int task;
    pmode.m_lock.acquire();
    if (pmode.m_jobTotal > pmode.m_jobAcquired)
    {
        task = pmode.m_jobAcquired++;
        pmode.m_lock.release();
    }
    else
    {
        pmode.m_lock.release();
        return;
    }

    ModeDepth& md = m_modeDepth[pmode.cuGeom.depth];

    /* setup slave Analysis */
    if (&slave != this)
    {
        slave.m_slice = m_slice;
        slave.m_frame = m_frame;
        slave.m_param = m_param;
        slave.m_bChromaSa8d = m_param->rdLevel >= 3;
        slave.setLambdaFromQP(md.pred[PRED_2Nx2N].cu, m_rdCost.m_qp);
        slave.invalidateContexts(0);
        slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
    }

    /* perform Mode task, repeat until no more work is available */
    do
    {
        uint32_t refMasks[2] = { 0, 0 };

        if (m_param->rdLevel <= 4)
        {
            switch (pmode.modes[task])
            {
            case PRED_INTRA:
                slave.checkIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
                if (m_param->rdLevel > 2)
                    slave.encodeIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
                break;

            case PRED_2Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];
                slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
                if (m_slice->m_sliceType == B_SLICE)
                    slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
                break;

            case PRED_Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */
                slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
                break;

            case PRED_2NxN:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */
                slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
                break;

            case PRED_2NxnU:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */
                slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
                break;

            case PRED_2NxnD:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
                slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
                break;

            case PRED_nLx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */
                slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
                break;

            case PRED_nRx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
                slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
                break;

            default:
                X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
                break;
            }
        }
        else
        {
            switch (pmode.modes[task])
            {
            case PRED_INTRA:
                slave.checkIntra(md.pred[PRED_INTRA], pmode.cuGeom, SIZE_2Nx2N);
                if (pmode.cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                    slave.checkIntra(md.pred[PRED_INTRA_NxN], pmode.cuGeom, SIZE_NxN);
                break;

            case PRED_2Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];
                slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
                md.pred[PRED_BIDIR].rdCost = MAX_INT64;
                if (m_slice->m_sliceType == B_SLICE)
                {
                    slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
                    if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                        slave.encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], pmode.cuGeom);
                }
                break;

            case PRED_Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */
                slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
                break;

            case PRED_2NxN:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */
                slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
                break;

            case PRED_2NxnU:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */
                slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
                break;

            case PRED_2NxnD:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
                slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
                break;

            case PRED_nLx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */
                slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
                break;

            case PRED_nRx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
                slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
                break;

            default:
                X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
                break;
            }
        }

        task = -1;
        pmode.m_lock.acquire();
        if (pmode.m_jobTotal > pmode.m_jobAcquired)
            task = pmode.m_jobAcquired++;
        pmode.m_lock.release();
    }
    while (task >= 0);
}

uint32_t Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;
    uint32_t splitRefs[4] = { 0, 0, 0, 0 };

    X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");

    PMODE pmode(*this, cuGeom);

    if (mightNotSplit && depth >= minDepth)
    {
        /* Initialize all prediction CUs based on parentCTU */
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);

        if (m_param->rdLevel <= 4)
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
        else
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
    }

    bool bNoSplit = false;
    bool splitIntra = true;
    if (md.bestMode)
    {
        bNoSplit = md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom, qp);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;
        int nextQP = qp;
        splitIntra = false;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                m_rqt[nextDepth].cur.load(*nextContext);

                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                splitRefs[subPartIdx] = compressInterCU_dist(parentCTU, childGeom, nextQP);

                // Save best CU and pred data for this sub CU
                splitIntra |= nd.bestMode->cu.isIntra(0);
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkDQPForSplitPred(*splitPred, cuGeom);
    }

    if (mightNotSplit && depth >= minDepth)
    {
        int bTryAmp = m_slice->m_sps->maxAMPDepth > depth;
        int bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (!m_param->limitReferences || splitIntra) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE);

        if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
            setLambdaFromQP(parentCTU, qp);
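        /* Enumerate one pmode job per candidate prediction type below; bonded
         * peer threads then pull jobs from this list via processPmode() while
         * this master thread participates and later collects the results */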
        if (bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
            if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3 && m_param->rdLevel >= 5)
                md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
            pmode.modes[pmode.m_jobTotal++] = PRED_INTRA;
        }
        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
        pmode.modes[pmode.m_jobTotal++] = PRED_2Nx2N;
        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
        if (m_param->bEnableRectInter)
        {
            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
            pmode.modes[pmode.m_jobTotal++] = PRED_2NxN;
            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
            pmode.modes[pmode.m_jobTotal++] = PRED_Nx2N;
        }
        if (bTryAmp)
        {
            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
            pmode.modes[pmode.m_jobTotal++] = PRED_2NxnU;
            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
            pmode.modes[pmode.m_jobTotal++] = PRED_2NxnD;
            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
            pmode.modes[pmode.m_jobTotal++] = PRED_nLx2N;
            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
            pmode.modes[pmode.m_jobTotal++] = PRED_nRx2N;
        }

        m_splitRefIdx[0] = splitRefs[0];
        m_splitRefIdx[1] = splitRefs[1];
        m_splitRefIdx[2] = splitRefs[2];
        m_splitRefIdx[3] = splitRefs[3];

        pmode.tryBondPeers(*m_frame->m_encData->m_jobProvider, pmode.m_jobTotal);

        /* participate in processing jobs, until all are distributed */
        processPmode(pmode, *this);

        /* the master worker thread (this one) does merge analysis. By doing
         * merge after all the other jobs are at least started, we usually avoid
         * blocking on another thread */

        if (m_param->rdLevel <= 4)
        {
            {
                ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
                pmode.waitForExit();
            }

            /* select best inter mode based on sa8d cost */
            Mode *bestInter = &md.pred[PRED_2Nx2N];
            if (m_param->bEnableRectInter)
            {
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }
            if (bTryAmp)
            {
                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnU];
                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnD];
                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nLx2N];
                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nRx2N];
            }

            if (m_param->rdLevel > 2)
            {
                /* RD selection between merge, inter, bidir and intra */
                if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                {
                    uint32_t numPU = bestInter->cu.getNumPartInter(0);
                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                    {
                        PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
                        motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
                    }
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                checkBestMode(*bestInter, depth);

                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], depth);
                }

                if (bTryIntra)
                    checkBestMode(md.pred[PRED_INTRA], depth);
            }
            else /* m_param->rdLevel == 2 */
            {
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = &md.pred[PRED_BIDIR];

                if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                {
                    md.bestMode = &md.pred[PRED_INTRA];
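                    /* intra won on sa8d cost; it must still be fully encoded to
                     * produce recon pixels and updated entropy contexts */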
                    encodeIntraInInter(*md.bestMode, cuGeom);
                }
                else if (!md.bestMode->cu.m_mergeFlag[0])
                {
                    /* finally code the best mode selected from SA8D costs */
                    uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                    {
                        PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
                    }
                    encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                }
            }
        }
        else
        {
            {
                ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
                pmode.waitForExit();
            }

            checkBestMode(md.pred[PRED_2Nx2N], depth);
            if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                checkBestMode(md.pred[PRED_BIDIR], depth);

            if (m_param->bEnableRectInter)
            {
                checkBestMode(md.pred[PRED_Nx2N], depth);
                checkBestMode(md.pred[PRED_2NxN], depth);
            }

            if (bTryAmp)
            {
                checkBestMode(md.pred[PRED_2NxnU], depth);
                checkBestMode(md.pred[PRED_2NxnD], depth);
                checkBestMode(md.pred[PRED_nLx2N], depth);
                checkBestMode(md.pred[PRED_nRx2N], depth);
            }

            if (bTryIntra)
            {
                checkBestMode(md.pred[PRED_INTRA], depth);
                if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
            }
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    /* compare split RD cost against best cost */
    if (mightSplit && !bNoSplit)
        checkBestMode(md.pred[PRED_SPLIT], depth);

    /* determine which motion references the parent CU should search */
    uint32_t refMask;
    if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
        refMask = 0;
    else if (md.bestMode == &md.pred[PRED_SPLIT])
        refMask = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
    else
    {
        /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
        CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
        uint32_t numPU = cu.getNumPartInter(0);
        refMask = 0;
        for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
            refMask |= cu.getBestRefIdx(subPartIdx);
    }

    if (mightNotSplit)
    {
        /* early-out statistics */
        FrameData& curEncData = *m_frame->m_encData;
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[0], cuAddr, cuGeom.absPartIdx);

    return refMask;
}

SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
{
    if (parentCTU.m_vbvAffected && calculateQpforCuSize(parentCTU, cuGeom, 1))
        return compressInterCU_rd5_6(parentCTU, cuGeom, qp);

    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];

    if (m_param->searchMethod == X265_SEA)
    {
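        /* SEA (successive elimination) motion search prunes candidates using
         * precomputed integral sums of the reference planes; point the integral
         * pointers at this CTU's pixel offset in each reference picture */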
        int numPredDir = m_slice->isInterP() ? 1 : 2;
        int offset = (int)(m_frame->m_reconPic[0]->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic[0]->m_buOffsetY[cuGeom.absPartIdx]);
        for (int list = 0; list < numPredDir; list++)
            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
    }

    PicYuv& reconPic = *m_frame->m_reconPic[0];
    SplitData splitCUData;

    bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16);
    bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
    bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO);

    if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
    {
        md.bestMode = NULL;
        bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
        bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
        uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
        bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
        bool skipModes = false; /* Skip any remaining mode analyses at current depth */
        bool skipRecursion = false; /* Skip recursion */
        bool splitIntra = true;
        bool skipRectAmp = false;
        bool chooseMerge = false;
        bool bCtuInfoCheck = false;
        int sameContentRef = 0;

        if (m_evaluateInter)
        {
            if (m_refineLevel == 2)
            {
                if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
                    skipModes = true;
                if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
                    skipRectAmp = true;
            }
            mightSplit &= false;
            minDepth = depth;
        }

        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
            m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);

        SplitData splitData[4];
        splitData[0].initSplitCUData();
        splitData[1].initSplitCUData();
        splitData[2].initSplitCUData();
        splitData[3].initSplitCUData();

        // avoid uninitialized values in the references below
        if (m_param->limitModes)
        {
            md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
            md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
            md.pred[PRED_2Nx2N].sa8dCost = 0;
        }

        if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
        {
            if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
                sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
            if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
            {
                mightNotSplit &= bDecidedDepth;
                bCtuInfoCheck = skipRecursion = false;
                skipModes = true;
            }
            else if (mightNotSplit && bDecidedDepth)
            {
                if (m_additionalCtuInfo[cuGeom.absPartIdx])
                {
                    bCtuInfoCheck = skipRecursion = true;
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
                    if (!sameContentRef)
                    {
                        if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
                        {
                            qp -= int32_t(0.04 * qp);
                            setLambdaFromQP(parentCTU, qp);
                        }
                        if (m_param->bCTUInfo & 4)
                            skipModes = false;
                    }
                    if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
                    {
                        if (m_param->rdLevel)
                            skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
                        if ((m_param->bCTUInfo & 4) && sameContentRef)
                            skipModes = md.bestMode && true;
                    }
                }
                else
                {
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
                    if (m_param->rdLevel)
                        skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
                }
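                /* the CTU-info depth decision is trusted here, so recursion
                 * below this depth is suppressed */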
                mightSplit &= !bDecidedDepth;
            }
        }
        if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10))
        {
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
            {
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
                {
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);

                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
                    if (m_param->rdLevel)
                        skipModes = m_param->bEnableEarlySkip && md.bestMode;
                }
                if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
                {
                    if (m_reuseModes[cuGeom.absPartIdx] != MODE_INTRA && m_reuseModes[cuGeom.absPartIdx] != 4)
                    {
                        skipRectAmp = true && !!md.bestMode;
                        chooseMerge = !!m_reuseMergeFlag[cuGeom.absPartIdx] && !!md.bestMode;
                    }
                }
            }
        }
        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
        {
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
            {
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
                {
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);

                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
                    if (m_param->rdLevel)
                        skipModes = m_param->bEnableEarlySkip && md.bestMode;
                }
            }
        }
        /* Step 1. Evaluate Merge/Skip candidates for likely early-outs, if skip mode was not set above */
        if ((mightNotSplit && depth >= minDepth && !md.bestMode && !bCtuInfoCheck) ||
            (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
            /* TODO: Re-evaluate if analysis load/save still works */
        {
            /* Compute Merge Cost */
            md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
            md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
            if (m_param->rdLevel)
                skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2) &&
                            md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
        }
        if (md.bestMode && m_param->recursionSkipMode && !bCtuInfoCheck &&
            !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
        {
            skipRecursion = md.bestMode->cu.isSkipped(0);
            if (mightSplit && !skipRecursion)
            {
                if (depth >= minDepth && m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
                {
                    if (depth)
                        skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
                    if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
                        skipRecursion = complexityCheckCU(*md.bestMode);
                }
                else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
                {
                    skipRecursion = complexityCheckCU(*md.bestMode);
                }
            }
        }
        if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
            skipRecursion = true;
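        /* Note: RDCOST_BASED_RSKIP above bails out of the split recursion when
         * the merge/skip winner at this depth looks cheap relative to the
         * running average CU cost (recursionDepthCheck), while EDGE_BASED_RSKIP,
         * applied only at the two largest CU sizes, uses a texture complexity
         * check instead */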
        /* Step 2. Evaluate each of the 4 split sub-blocks in series */
        if (mightSplit && !skipRecursion)
        {
            if (bCtuInfoCheck && m_param->bCTUInfo & 2)
                qp = int((1 / 0.96) * qp + 0.5);
            Mode* splitPred = &md.pred[PRED_SPLIT];
            splitPred->initCosts();
            CUData* splitCU = &splitPred->cu;
            splitCU->initSubCU(parentCTU, cuGeom, qp);

            uint32_t nextDepth = depth + 1;
            ModeDepth& nd = m_modeDepth[nextDepth];
            invalidateContexts(nextDepth);
            Entropy* nextContext = &m_rqt[depth].cur;
            int nextQP = qp;
            splitIntra = false;

            for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
            {
                const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
                if (childGeom.flags & CUGeom::PRESENT)
                {
                    m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                    m_rqt[nextDepth].cur.load(*nextContext);

                    if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                        nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                    splitData[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);

                    // Save best CU and pred data for this sub CU
                    splitIntra |= nd.bestMode->cu.isIntra(0);
                    splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                    splitPred->addSubCosts(*nd.bestMode);
                    if (m_param->rdLevel)
                        nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                    else
                        nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
                    if (m_param->rdLevel > 1)
                        nextContext = &nd.bestMode->contexts;
                }
                else
                    splitCU->setEmptyPart(childGeom, subPartIdx);
            }
            nextContext->store(splitPred->contexts);

            if (mightNotSplit)
                addSplitFlagCost(*splitPred, cuGeom.depth);
            else if (m_param->rdLevel > 1)
                updateModeCost(*splitPred);
            else
                splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);
        }

        /* If analysis mode is simple, do not evaluate other modes */
        if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
        {
            if (m_slice->m_sliceType == P_SLICE)
            {
                if (m_checkMergeAndSkipOnly[0])
                    skipModes = true;
            }
            else
            {
                if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])
                    skipModes = true;
            }
        }

        /* Split CUs
         *   0  1
         *   2  3 */
        uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
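        /* Note: the rectangular/AMP trials in Step 3 are gated by a heuristic:
         * a shape is searched only when splitCost < cost(2Nx2N) + threshold,
         * where the threshold is built from the sub-CUs' MV costs along the
         * matching edge (e.g. the top two sub-CUs for 2NxN); of a horizontal or
         * vertical pair, the shape with the smaller threshold is tried first */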
        /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
        if (mightNotSplit && (depth >= minDepth || (m_param->bCTUInfo && !md.bestMode)))
        {
            if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
                setLambdaFromQP(parentCTU, qp);

            if (!skipModes)
            {
                uint32_t refMasks[2];
                refMasks[0] = allSplitRefs;
                md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);

                if (m_param->limitReferences & X265_REF_LIMIT_CU)
                {
                    CUData& cu = md.pred[PRED_2Nx2N].cu;
                    uint32_t refMask = cu.getBestRefIdx(0);
                    allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
                }

                if (m_slice->m_sliceType == B_SLICE)
                {
                    md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
                    checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
                }

                Mode *bestInter = &md.pred[PRED_2Nx2N];
                if (!skipRectAmp)
                {
                    if (m_param->bEnableRectInter)
                    {
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
                        uint32_t threshold_2NxN, threshold_Nx2N;
                        if (m_slice->m_sliceType == P_SLICE)
                        {
                            threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
                            threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
                        }
                        else
                        {
                            threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
                                            + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
                            threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
                                            + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
                        }

                        int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
                        if (try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
                        {
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
                            checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
                            if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                                bestInter = &md.pred[PRED_2NxN];
                        }

                        if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_Nx2N)
                        {
                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
                            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                            checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
                            if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                                bestInter = &md.pred[PRED_Nx2N];
                        }

                        if (!try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
                        {
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
                            checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
                            if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                                bestInter = &md.pred[PRED_2NxN];
                        }
                    }

                    if (m_slice->m_sps->maxAMPDepth > depth)
                    {
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
                        uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
                        if (m_slice->m_sliceType == P_SLICE)
                        {
                            threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
                            threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
                            threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
                            threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
                        }
                        else
                        {
                            threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
                                             + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
                            threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
                                             + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
                            threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
                                             + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
                            threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
                                             + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
                        }

                        bool bHor = false, bVer = false;
                        if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
                            bHor = true;
                        else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
                            bVer = true;
                        else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N && md.bestMode && md.bestMode->cu.getQtRootCbf(0))
                        {
                            bHor = true;
                            bVer = true;
                        }

                        if (bHor)
                        {
                            int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
                            if (try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
                            {
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
                                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                                    bestInter = &md.pred[PRED_2NxnD];
                            }

                            if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnU)
                            {
                                refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
                                refMasks[1] = allSplitRefs;                                    /* 75% bot */
                                md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
                                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                                    bestInter = &md.pred[PRED_2NxnU];
                            }

                            if (!try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
                            {
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
                                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                                    bestInter = &md.pred[PRED_2NxnD];
                            }
                        }

                        if (bVer)
                        {
                            int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
                            if (try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
                            {
                                refMasks[0] = allSplitRefs;                                    /* 75% left */
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
                                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                                    bestInter = &md.pred[PRED_nRx2N];
                            }

                            if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nLx2N)
                            {
                                refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left */
                                refMasks[1] = allSplitRefs;                                    /* 75% right */
                                md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
                                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                                    bestInter = &md.pred[PRED_nLx2N];
                            }

                            if (!try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
                            {
                                refMasks[0] = allSplitRefs;                                    /* 75% left */
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
                                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                                    bestInter = &md.pred[PRED_nRx2N];
                            }
                        }
                    }
                }
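                /* intra is never evaluated at the largest CU size
                 * (MAX_LOG2_CU_SIZE), and in B slices only when
                 * bIntraInBFrames is enabled */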
4) && bCtuInfoCheck); if (m_param->rdLevel >= 3) { /* Calculate RD cost of best inter option */ if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */ { uint32_t numPU = bestInter->cu.getNumPartInter(0); for (uint32_t puIdx = 0; puIdx < numPU; puIdx++) { PredictionUnit pu(bestInter->cu, cuGeom, puIdx); motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true); } } if (!chooseMerge) { encodeResAndCalcRdInterCU(*bestInter, cuGeom); checkBestMode(*bestInter, depth); /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */ if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 && md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17) { uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0); if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400) for (uint32_t puIdx = 0; puIdx < numPU; puIdx++) { PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx); motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true); } encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom); checkBestMode(md.pred[PRED_BIDIR], depth); } } if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) || md.bestMode->sa8dCost == MAX_INT64) { if (!m_param->limitReferences || splitIntra) { ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]); md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp); checkIntraInInter(md.pred[PRED_INTRA], cuGeom); encodeIntraInInter(md.pred[PRED_INTRA], cuGeom); checkBestMode(md.pred[PRED_INTRA], depth); } else { ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]); } } } else { /* SA8D choice between merge/skip, inter, bidir, and intra */ if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost) md.bestMode = bestInter; if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost) md.bestMode = &md.pred[PRED_BIDIR]; if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64) { if (!m_param->limitReferences || splitIntra) { ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]); md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp); checkIntraInInter(md.pred[PRED_INTRA], cuGeom); if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost) md.bestMode = &md.pred[PRED_INTRA]; } else { ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]); } } /* finally code the best mode selected by SA8D costs: * RD level 2 - fully encode the best mode * RD level 1 - generate recon pixels * RD level 0 - generate chroma prediction */ if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N) { /* prediction already generated for this CU, and if rd level * is not 0, it is already fully encoded */ } else if (md.bestMode->cu.isInter(0)) { uint32_t numPU = md.bestMode->cu.getNumPartInter(0); if (m_csp != X265_CSP_I400) { for (uint32_t puIdx = 0; puIdx < numPU; puIdx++) { PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx); motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true); } } if (m_param->rdLevel == 2) encodeResAndCalcRdInterCU(*md.bestMode, cuGeom); else if (m_param->rdLevel == 1) { /* generate recon pixels with no rate distortion considerations */ CUData& cu = md.bestMode->cu; uint32_t tuDepthRange[2]; cu.getInterTUQtDepthRange(tuDepthRange, 0); m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize, m_frame->m_fencPic->m_picCsp); 
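/* Illustrative sketch (not encoder code): at rdLevel 1 the recon is built
 * without RDO. subtract() above leaves fenc - pred in tmpResiYuv; the call
 * below transforms and quantizes that residual in place, and reconstruction
 * then adds it back with clipping. Assuming pixel samples of X265_DEPTH bits
 * and the x265_clip3 helper, the per-sample operation reduces to:
 *
 *     pixel recon = (pixel)x265_clip3(0, (1 << X265_DEPTH) - 1,
 *                                     pred[i] + resi[i]);
 *
 * When the root CBF is zero (no coded residual anywhere in the CU), the
 * prediction is copied unchanged and a 2Nx2N merge is demoted to SKIP. */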
residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange); if (cu.getQtRootCbf(0)) md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0], m_frame->m_fencPic->m_picCsp); else { md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv); if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N) cu.setPredModeSubParts(MODE_SKIP); } } } else { if (m_param->rdLevel == 2) encodeIntraInInter(*md.bestMode, cuGeom); else if (m_param->rdLevel == 1) { /* generate recon pixels with no rate distortion considerations */ CUData& cu = md.bestMode->cu; uint32_t tuDepthRange[2]; cu.getIntraTUQtDepthRange(tuDepthRange, 0); residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange); if (m_csp != X265_CSP_I400) { getBestIntraModeChroma(*md.bestMode, cuGeom); residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0); } md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO: } } } } // !earlyskip if (m_bTryLossless) tryLossless(cuGeom); if (mightSplit) addSplitFlagCost(*md.bestMode, cuGeom.depth); } if (mightSplit && !skipRecursion) { Mode* splitPred = &md.pred[PRED_SPLIT]; if (!md.bestMode) md.bestMode = splitPred; else if (m_param->rdLevel > 1) checkBestMode(*splitPred, cuGeom.depth); else if (splitPred->sa8dCost < md.bestMode->sa8dCost) md.bestMode = splitPred; checkDQPForSplitPred(*md.bestMode, cuGeom); } /* determine which motion references the parent CU should search */ splitCUData.initSplitCUData(); if (m_param->limitReferences & X265_REF_LIMIT_DEPTH) { if (md.bestMode == &md.pred[PRED_SPLIT]) splitCUData.splitRefs = allSplitRefs; else { /* use best merge/inter mode, in case of intra use 2Nx2N inter references */ CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu; uint32_t numPU = cu.getNumPartInter(0); for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0)) splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx); } } if (m_param->limitModes) { splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0 splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1 splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost; } if (mightNotSplit && md.bestMode->cu.isSkipped(0)) { FrameData& curEncData = *m_frame->m_encData; FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr]; uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth]; cuStat.count[depth] += 1; cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth]; } /* Copy best data to encData CTU and recon */ md.bestMode->cu.copyToPic(depth); if (m_param->rdLevel) md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx); if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4) { if (mightNotSplit) { CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr); int8_t maxTUDepth = -1; for (uint32_t i = 0; i < cuGeom.numPartitions; i++) maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]); ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth; } } } else { if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16) { qprdRefine(parentCTU, cuGeom, qp, qp); SplitData splitData[4]; splitData[0].initSplitCUData(); splitData[1].initSplitCUData(); splitData[2].initSplitCUData(); splitData[3].initSplitCUData(); uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs; splitCUData.initSplitCUData(); if (m_param->limitReferences & 
X265_REF_LIMIT_DEPTH) { if (md.bestMode == &md.pred[PRED_SPLIT]) splitCUData.splitRefs = allSplitRefs; else { /* use best merge/inter mode, in case of intra use 2Nx2N inter references */ CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu; uint32_t numPU = cu.getNumPartInter(0); for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0)) splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx); } } if (m_param->limitModes) { splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0 splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1 splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost; } } } return splitCUData; } #if ENABLE_SCC_EXT SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc) #else SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp) #endif { if (parentCTU.m_vbvAffected && !calculateQpforCuSize(parentCTU, cuGeom, 1)) return compressInterCU_rd0_4(parentCTU, cuGeom, qp); uint32_t depth = cuGeom.depth; ModeDepth& md = m_modeDepth[depth]; md.bestMode = NULL; Mode* interBest = NULL; // track the best inter mode found MV iMVCandList[4][10]; memset(iMVCandList, 0, sizeof(MV) * 4 * 10); if (m_param->searchMethod == X265_SEA) { int numPredDir = m_slice->isInterP() ? 1 : 2; int offset = (int)(m_frame->m_reconPic[0]->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic[0]->m_buOffsetY[cuGeom.absPartIdx]); for (int list = 0; list < numPredDir; list++) for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++) for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++) m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset; } SplitData splitCUData; bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16); bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])); bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO); if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading) { bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth; bool skipRecursion = false; bool skipModes = false; bool splitIntra = true; bool skipRectAmp = false; bool bCtuInfoCheck = false; int sameContentRef = 0; if (m_evaluateInter) { if (m_refineLevel == 2) { if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP) skipModes = true; if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N) skipRectAmp = true; } mightSplit &= false; } // avoid referencing uninitialized values below if (m_param->limitModes) { md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0 md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1 md.pred[PRED_2Nx2N].rdCost = 0; } if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4) m_maxTUDepth = loadTUDepth(cuGeom, parentCTU); SplitData splitData[4]; splitData[0].initSplitCUData(); splitData[1].initSplitCUData(); splitData[2].initSplitCUData(); splitData[3].initSplitCUData(); uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs; uint32_t refMasks[2]; if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx]) { if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
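/* CTU-info reuse: when the depth decision for this block is final and side
 * information is present, count how many reference frames are flagged as
 * carrying the same content, so mode evaluation and recursion can be
 * short-circuited below */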
sameContentRef = findSameContentRefCount(parentCTU, cuGeom); if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx]) { mightNotSplit &= bDecidedDepth; bCtuInfoCheck = skipRecursion = false; skipModes = true; } else if (mightNotSplit && bDecidedDepth) { if (m_additionalCtuInfo[cuGeom.absPartIdx]) { bCtuInfoCheck = skipRecursion = true; refMasks[0] = allSplitRefs; md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks); checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth); if (!sameContentRef) { if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth)) { qp -= int32_t(0.04 * qp); setLambdaFromQP(parentCTU, qp); } if (m_param->bCTUInfo & 4) skipModes = false; } if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4))) { if (m_param->rdLevel) skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); if ((m_param->bCTUInfo & 4) && sameContentRef) skipModes = md.bestMode && true; } } else { md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); skipModes = !!m_param->bEnableEarlySkip && md.bestMode; refMasks[0] = allSplitRefs; md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks); checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth); #if ENABLE_SCC_EXT interBest = md.bestMode; #endif } mightSplit &= !bDecidedDepth; } } if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10) { if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx]) { if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP) { md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); skipModes = !!m_param->bEnableEarlySkip && md.bestMode; refMasks[0] = allSplitRefs; md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks); checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth); if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode) skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0); #if ENABLE_SCC_EXT interBest = md.bestMode; #endif } if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N) skipRectAmp = true && !!md.bestMode; } } if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU) { if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx]) { if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP) { md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); skipModes = !!m_param->bEnableEarlySkip && md.bestMode; refMasks[0] = allSplitRefs; md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks); checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth); if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode) skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0); #if ENABLE_SCC_EXT interBest = md.bestMode; #endif } } } /* Step 1. 
Evaluate Merge/Skip candidates for likely early-outs */ if ((mightNotSplit && !md.bestMode && !bCtuInfoCheck) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]))) { md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2) && md.bestMode && !md.bestMode->cu.getQtRootCbf(0); refMasks[0] = allSplitRefs; md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks); checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth); #if ENABLE_SCC_EXT interBest = md.bestMode; if (m_param->bEnableSCC) { md.pred[PRED_MERGE_IBC].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv); checkRDCostIntraBCMerge2Nx2N(md.pred[PRED_MERGE_IBC], cuGeom); } #endif if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP && depth && m_modeDepth[depth - 1].bestMode) skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0); else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP) skipRecursion = md.bestMode && complexityCheckCU(*md.bestMode); } if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7) skipRecursion = true; // estimate split cost /* Step 2. Evaluate each of the 4 split sub-blocks in series */ if (mightSplit && !skipRecursion) { if (bCtuInfoCheck && m_param->bCTUInfo & 2) qp = int((1 / 0.96) * qp + 0.5); /* approximately undo the earlier ~4% QP reduction taken on the CTU-info path */ Mode* splitPred = &md.pred[PRED_SPLIT]; splitPred->initCosts(); CUData* splitCU = &splitPred->cu; splitCU->initSubCU(parentCTU, cuGeom, qp); uint32_t nextDepth = depth + 1; ModeDepth& nd = m_modeDepth[nextDepth]; invalidateContexts(nextDepth); Entropy* nextContext = &m_rqt[depth].cur; int nextQP = qp; splitIntra = false; for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) { const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx); if (childGeom.flags & CUGeom::PRESENT) { m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx); m_rqt[nextDepth].cur.load(*nextContext); if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth) nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom)); #if ENABLE_SCC_EXT splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, nextQP, ibc); if (nd.bestMode->cu.m_lastIntraBCMv[0].x != 0 || nd.bestMode->cu.m_lastIntraBCMv[0].y != 0) { for (int i = 0; i < 2; i++) ibc->m_lastIntraBCMv[i] = nd.bestMode->cu.m_lastIntraBCMv[i]; } #else splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, nextQP); #endif // Save best CU and pred data for this sub CU splitIntra |= nd.bestMode->cu.isIntra(0); splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx); splitPred->addSubCosts(*nd.bestMode); nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx); nextContext = &nd.bestMode->contexts; } else { splitCU->setEmptyPart(childGeom, subPartIdx); } } nextContext->store(splitPred->contexts); if (mightNotSplit) addSplitFlagCost(*splitPred, cuGeom.depth); else updateModeCost(*splitPred); checkDQPForSplitPred(*splitPred, cuGeom); } /* If the analysis mode is simple, do not evaluate other modes */ if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7) { if (m_slice->m_sliceType == P_SLICE) { if
(m_checkMergeAndSkipOnly[0]) skipModes = true; } else { if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1]) skipModes = true; } } /* Split CUs * 0 1 * 2 3 */ allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs; /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */ if (mightNotSplit) { if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0) setLambdaFromQP(parentCTU, qp); if (!skipModes) { refMasks[0] = allSplitRefs; if (m_param->limitReferences & X265_REF_LIMIT_CU) { CUData& cu = md.pred[PRED_2Nx2N].cu; uint32_t refMask = cu.getBestRefIdx(0); allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask; } if (m_slice->m_sliceType == B_SLICE) { md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp); checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom); if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64) { uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0); if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400) for (uint32_t puIdx = 0; puIdx < numPU; puIdx++) { PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx); motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true); } encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom); checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth); } } if (!skipRectAmp) { if (m_param->bEnableRectInter) { uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost; uint32_t threshold_2NxN, threshold_Nx2N; if (m_slice->m_sliceType == P_SLICE) { threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0]; threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0]; } else { threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0] + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1; threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0] + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1; } int try_2NxN_first = threshold_2NxN < threshold_Nx2N; if (try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN) { refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */ refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */ md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp); #if ENABLE_SCC_EXT checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks, iMVCandList[SIZE_2NxN]); interBest = (md.pred[PRED_2NxN].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxN] : interBest; #else checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks); #endif checkBestMode(md.pred[PRED_2NxN], cuGeom.depth); } if (splitCost < md.bestMode->rdCost + threshold_Nx2N) { refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */ refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */ md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); #if ENABLE_SCC_EXT checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks, iMVCandList[SIZE_Nx2N]); interBest = (md.pred[PRED_Nx2N].rdCost < interBest->rdCost) ? 
&md.pred[PRED_Nx2N] : interBest; #else checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks); #endif checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth); } if (!try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN) { refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */ refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */ md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp); #if ENABLE_SCC_EXT checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks, iMVCandList[SIZE_2NxN]); interBest = (md.pred[PRED_2NxN].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxN] : interBest; #else checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks); #endif checkBestMode(md.pred[PRED_2NxN], cuGeom.depth); } } // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N) if (m_slice->m_sps->maxAMPDepth > depth) { uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost; uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N; if (m_slice->m_sliceType == P_SLICE) { threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0]; threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0]; threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0]; threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0]; } else { threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0] + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1; threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0] + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1; threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0] + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1; threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0] + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1; } bool bHor = false, bVer = false; if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN) bHor = true; else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N) bVer = true; else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0]) { bHor = true; bVer = true; } if (bHor) { int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU; if (try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD) { refMasks[0] = allSplitRefs; /* 75% top */ refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */ md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp); checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks); checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth); #if ENABLE_SCC_EXT interBest = (md.pred[PRED_2NxnD].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxnD] : interBest; #endif } if (splitCost < md.bestMode->rdCost + threshold_2NxnU) { refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */ refMasks[1] = allSplitRefs; /* 75% bot */ md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp); checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks); checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth); #if ENABLE_SCC_EXT interBest = (md.pred[PRED_2NxnU].rdCost < interBest->rdCost) ? 
&md.pred[PRED_2NxnU] : interBest; #endif } if (!try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD) { refMasks[0] = allSplitRefs; /* 75% top */ refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */ md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp); checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks); checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth); #if ENABLE_SCC_EXT interBest = (md.pred[PRED_2NxnD].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxnD] : interBest; #endif } } if (bVer) { int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N; if (try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N) { refMasks[0] = allSplitRefs; /* 75% left */ refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */ md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp); checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks); checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth); #if ENABLE_SCC_EXT interBest = (md.pred[PRED_nRx2N].rdCost < interBest->rdCost) ? &md.pred[PRED_nRx2N] : interBest; #endif } if (splitCost < md.bestMode->rdCost + threshold_nLx2N) { refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left */ refMasks[1] = allSplitRefs; /* 75% right */ md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp); checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks); checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth); #if ENABLE_SCC_EXT interBest = (md.pred[PRED_nLx2N].rdCost < interBest->rdCost) ? &md.pred[PRED_nLx2N] : interBest; #endif } if (!try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N) { refMasks[0] = allSplitRefs; /* 75% left */ refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */ md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp); checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks); checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth); #if ENABLE_SCC_EXT interBest = (md.pred[PRED_nRx2N].rdCost < interBest->rdCost) ? &md.pred[PRED_nRx2N] : interBest; #endif } } } } #if ENABLE_SCC_EXT if (m_param->bEnableSCC) { bool intraBlockCopyFastSearch = (m_param->bEnableSCC == 1) ? 
true : false, bUse1DSearchFor8x8 = false, bValid; md.pred[PRED_IBC_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv); checkIntraBC_rd5_6(md.pred[PRED_IBC_2Nx2N], cuGeom, SIZE_2Nx2N, false, bUse1DSearchFor8x8, *ibc); checkBestMode(md.pred[PRED_IBC_2Nx2N], depth); if (intraBlockCopyFastSearch) { if ((int)depth == m_slice->m_sps->log2DiffMaxMinCodingBlockSize) { md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv); checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8)); checkBestMode(md.pred[PRED_IBC_Nx2N], depth); md.pred[PRED_MIXED_IBC_NX2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv); bValid = predMixedIntraBCInterSearch(md.pred[PRED_MIXED_IBC_NX2N], cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, SIZE_Nx2N, iMVCandList[SIZE_Nx2N]); if (bValid) encodeResAndCalcRdInterCU(md.pred[PRED_MIXED_IBC_NX2N], cuGeom); else md.pred[PRED_MIXED_IBC_NX2N].rdCost = UINT64_MAX; checkBestMode(md.pred[PRED_MIXED_IBC_NX2N], depth); md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv); checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8)); checkBestMode(md.pred[PRED_IBC_2NxN], depth); md.pred[PRED_MIXED_IBC_2NXN].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv); bValid = predMixedIntraBCInterSearch(md.pred[PRED_MIXED_IBC_2NXN], cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, SIZE_2NxN, iMVCandList[SIZE_2NxN]); if (bValid) encodeResAndCalcRdInterCU(md.pred[PRED_MIXED_IBC_2NXN], cuGeom); else md.pred[PRED_MIXED_IBC_2NXN].rdCost = UINT64_MAX; checkBestMode(md.pred[PRED_MIXED_IBC_2NXN], depth); } } else // full search { md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp); checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8)); checkBestMode(md.pred[PRED_IBC_2NxN], depth); md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8)); checkBestMode(md.pred[PRED_IBC_Nx2N], depth); } } #endif if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE) && !((m_param->bCTUInfo & 4) && bCtuInfoCheck)) { if (!m_param->limitReferences || splitIntra) { ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]); md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp); checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N); checkBestMode(md.pred[PRED_INTRA], depth); if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3) { md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp); checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN); checkBestMode(md.pred[PRED_INTRA_NxN], depth); } } else { ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]); } } } #if ENABLE_SCC_EXT // If Intra BC keep last coded Mv if (md.bestMode->cu.isInter(0)) { MVField mvField; const CUData* cu = &md.bestMode->cu; md.bestMode->cu.getMvField(cu, 0, 0, mvField); int iRefIdxFirst = mvField.refIdx; md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField); int iRefIdxLast = mvField.refIdx; bool isIntraBCFirst = (iRefIdxFirst >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxFirst]->m_poc == cu->m_slice->m_poc : false; bool isIntraBCLast = (iRefIdxLast >= 0) ? 
cu->m_slice->m_refFrameList[0][iRefIdxLast]->m_poc == cu->m_slice->m_poc : false; if (isIntraBCFirst || isIntraBCLast) { if (cu->m_partSize[0] == SIZE_2Nx2N) { md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField); if (mvField.mv != cu->m_lastIntraBCMv[0]) { md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0]; md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv; } } else if (cu->m_partSize[0] == SIZE_2NxN || cu->m_partSize[0] == SIZE_Nx2N) { // mixed PU, only one partition is IntraBC coded if (isIntraBCFirst != isIntraBCLast) { if (isIntraBCFirst) { // Part 0 md.bestMode->cu.getMvField(cu, 0, 0, mvField); if (mvField.mv != cu->m_lastIntraBCMv[0]) { md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0]; md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv; } } else if (isIntraBCLast) { // Part 1 md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField); if (mvField.mv != cu->m_lastIntraBCMv[0]) { md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0]; md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv; } } } else // normal IntraBC CU { // Part 0 md.bestMode->cu.getMvField(cu, 0, 0, mvField); if (mvField.mv != cu->m_lastIntraBCMv[0]) { md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0]; md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv; } // Part 1 md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField); if (mvField.mv != cu->m_lastIntraBCMv[0]) { md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0]; md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv; } } } else { // NxN for (int part = 0; part < 4; part++) { md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 4 + part, 0, mvField); if (mvField.mv != cu->m_lastIntraBCMv[0]) { md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0]; md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv; } } } } } // is inter #endif if ((md.bestMode->cu.isInter(0) && !(md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)) && (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) { uint32_t numPU = md.bestMode->cu.getNumPartInter(0); for (uint32_t puIdx = 0; puIdx < numPU; puIdx++) { PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx); motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, m_csp != X265_CSP_I400); } encodeResAndCalcRdInterCU(*md.bestMode, cuGeom); } if (m_bTryLossless) tryLossless(cuGeom); if (mightSplit) addSplitFlagCost(*md.bestMode, cuGeom.depth); } if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4) { if (mightNotSplit) { CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr); int8_t maxTUDepth = -1; for (uint32_t i = 0; i < cuGeom.numPartitions; i++) maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]); ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth; } } /* compare split RD cost against best cost */ if (mightSplit && !skipRecursion) checkBestMode(md.pred[PRED_SPLIT], depth); if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth) { int cuIdx = (cuGeom.childOffset - 1) / 3; cacheCost[cuIdx] = md.bestMode->rdCost; } /* determine which motion references the parent CU should search */ splitCUData.initSplitCUData(); if (m_param->limitReferences & X265_REF_LIMIT_DEPTH) { if (md.bestMode == &md.pred[PRED_SPLIT]) splitCUData.splitRefs = allSplitRefs; else { /* use best merge/inter mode, in case of intra use 2Nx2N inter references */ CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : (m_param->bEnableSCC ? 
interBest->cu : md.bestMode->cu); uint32_t numPU = cu.getNumPartInter(0); for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0)) splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx); } } if (m_param->limitModes) { splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0 splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1 splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost; } /* Copy best data to encData CTU and recon */ md.bestMode->cu.copyToPic(depth); for (int i = 0; i < !!m_param->bEnableSCC + 1; i++) md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[i], parentCTU.m_cuAddr, cuGeom.absPartIdx); } else { if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16) { qprdRefine(parentCTU, cuGeom, qp, qp); SplitData splitData[4]; splitData[0].initSplitCUData(); splitData[1].initSplitCUData(); splitData[2].initSplitCUData(); splitData[3].initSplitCUData(); uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs; splitCUData.initSplitCUData(); if (m_param->limitReferences & X265_REF_LIMIT_DEPTH) { if (md.bestMode == &md.pred[PRED_SPLIT]) splitCUData.splitRefs = allSplitRefs; else { /* use best merge/inter mode, in case of intra use 2Nx2N inter references */ CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu; uint32_t numPU = cu.getNumPartInter(0); for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0)) splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx); } } if (m_param->limitModes) { splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0 splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1 splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost; } } } return splitCUData; } void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp) { uint32_t depth = cuGeom.depth; ModeDepth& md = m_modeDepth[depth]; md.bestMode = NULL; m_evaluateInter = 0; bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth; int split = 0; TrainingData td; td.init(parentCTU, cuGeom); if (!m_param->bDynamicRefine) m_refineLevel = m_param->interRefine; else m_refineLevel = m_frame->m_classifyFrame ? 
1 : 3; if (m_param->interRefine == 1) split = (m_param->scaleFactor && bDecidedDepth && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && (!mightNotSplit || (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1)))); else split = (m_param->scaleFactor && bDecidedDepth && (!mightNotSplit || (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1)))); td.split = split; if ((bDecidedDepth && mightNotSplit) || (m_param->bAnalysisType == HEVC_INFO && parentCTU.m_cuDepth[cuGeom.absPartIdx] == 4)) { setLambdaFromQP(parentCTU, qp, lqp); Mode& mode = md.pred[0]; md.bestMode = &mode; mode.cu.initSubCU(parentCTU, cuGeom, qp); PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]; if (parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2) { if (m_param->intraRefine == 4) compressIntraCU(parentCTU, cuGeom, qp); else { bool reuseModes = !((m_param->intraRefine == 3) || (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX)); if (reuseModes) { memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions); memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions); } checkIntra(mode, cuGeom, size); } } else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2) { mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false); uint32_t numPU = parentCTU.getNumPartInter(cuGeom.absPartIdx); for (uint32_t part = 0; part < numPU; part++) { PredictionUnit pu(mode.cu, cuGeom, part); if (m_param->analysisLoadReuseLevel == 10 || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7)) { x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData; int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx; mode.cu.m_mergeFlag[pu.puAbsPartIdx] = interDataCTU->mergeFlag[cuIdx + part]; mode.cu.setPUInterDir(interDataCTU->interDir[cuIdx + part], pu.puAbsPartIdx, part); for (int list = 0; list < m_slice->isInterB() + 1; list++) { mode.cu.setPUMv(list, interDataCTU->mv[list][cuIdx + part].word, pu.puAbsPartIdx, part); mode.cu.setPURefIdx(list, interDataCTU->refIdx[list][cuIdx + part], pu.puAbsPartIdx, part); mode.cu.m_mvpIdx[list][pu.puAbsPartIdx] = interDataCTU->mvpIdx[list][cuIdx + part]; } if (!mode.cu.m_mergeFlag[pu.puAbsPartIdx]) { if (m_param->interRefine == 1) m_me.setSourcePU(*mode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, false); //AMVP MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2]; mode.cu.getNeighbourMV(part, pu.puAbsPartIdx, mode.interNeighbours); for (int list = 0; list < m_slice->isInterB() + 1; list++) { int ref = mode.cu.m_refIdx[list][pu.puAbsPartIdx]; if (ref == -1) continue; MV mvp; int numMvc = mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc, part, pu.puAbsPartIdx); mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]]; if (m_param->interRefine == 1) { MV outmv, mvpSelect[3]; mvpSelect[0] = interDataCTU->mv[list][cuIdx + part].word; if (m_param->mvRefine > 1) { mvpSelect[1] = mvp; if(m_param->mvRefine > 2) mvpSelect[2] = mode.amvpCand[list][ref][!(mode.cu.m_mvpIdx[list][pu.puAbsPartIdx])]; } searchMV(mode, list, ref, outmv, mvpSelect, numMvc, mvc); mode.cu.setPUMv(list, outmv, pu.puAbsPartIdx, part); } mode.cu.m_mvd[list][pu.puAbsPartIdx] = mode.cu.m_mv[list][pu.puAbsPartIdx] - 
mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]]/*mvp*/; } } else { MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists uint8_t candDir[MRG_MAX_NUM_CANDS]; mode.cu.getInterMergeCandidates(pu.puAbsPartIdx, part, candMvField, candDir); uint8_t mvpIdx = mode.cu.m_mvpIdx[0][pu.puAbsPartIdx]; if (mode.cu.isBipredRestriction()) { /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */ if (candDir[mvpIdx] == 3) { candDir[mvpIdx] = 1; candMvField[mvpIdx][1].refIdx = REF_NOT_VALID; } } mode.cu.setPUInterDir(candDir[mvpIdx], pu.puAbsPartIdx, part); mode.cu.setPUMv(0, candMvField[mvpIdx][0].mv, pu.puAbsPartIdx, part); mode.cu.setPUMv(1, candMvField[mvpIdx][1].mv, pu.puAbsPartIdx, part); mode.cu.setPURefIdx(0, (int8_t)candMvField[mvpIdx][0].refIdx, pu.puAbsPartIdx, part); mode.cu.setPURefIdx(1, (int8_t)candMvField[mvpIdx][1].refIdx, pu.puAbsPartIdx, part); } } motionCompensation(mode.cu, pu, mode.predYuv, true, (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)); } if (!m_param->interRefine && !m_param->bDynamicRefine && parentCTU.isSkipped(cuGeom.absPartIdx)) encodeResAndCalcRdSkipCU(mode); else encodeResAndCalcRdInterCU(mode, cuGeom); /* checkMerge2Nx2N function performs checkDQP after encoding residual, do the same */ bool mergeInter2Nx2N = size == SIZE_2Nx2N && mode.cu.m_mergeFlag[0]; if (parentCTU.isSkipped(cuGeom.absPartIdx) || mergeInter2Nx2N) checkDQP(mode, cuGeom); } if (m_refineLevel < 2) { if (m_bTryLossless) tryLossless(cuGeom); if (mightSplit) addSplitFlagCost(*md.bestMode, cuGeom.depth); if (mightSplit && m_param->rdLevel < 5) checkDQPForSplitPred(*md.bestMode, cuGeom); } if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7) { for (int list = 0; list < m_slice->isInterB() + 1; list++) { m_modeFlag[list] = true; if (parentCTU.m_skipFlag[list][cuGeom.absPartIdx] == 1 && cuGeom.numPartitions <= 16) m_checkMergeAndSkipOnly[list] = true; } m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp); for (int list = 0; list < m_slice->isInterB() + 1; list++) { m_modeFlag[list] = false; m_checkMergeAndSkipOnly[list] = false; } } if (m_param->bDynamicRefine) classifyCU(parentCTU,cuGeom, *md.bestMode, td); if (m_refineLevel > 1 || (m_refineLevel && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && !mode.cu.isSkipped(0))) { if (parentCTU.m_cuDepth[cuGeom.absPartIdx] < 4 && mightNotSplit) m_evaluateInter = 1; else bDecidedDepth = true; m_param->rdLevel > 4 ? 
compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp); m_evaluateInter = 0; } } if (!bDecidedDepth || split) { Mode* splitPred = &md.pred[PRED_SPLIT]; if (!split) md.bestMode = splitPred; splitPred->initCosts(); CUData* splitCU = &splitPred->cu; splitCU->initSubCU(parentCTU, cuGeom, qp); uint32_t nextDepth = depth + 1; ModeDepth& nd = m_modeDepth[nextDepth]; invalidateContexts(nextDepth); Entropy* nextContext = &m_rqt[depth].cur; int nextQP = qp; for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) { const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx); if (childGeom.flags & CUGeom::PRESENT) { m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx); m_rqt[nextDepth].cur.load(*nextContext); if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth) nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom)); int lamdaQP = (m_param->analysisLoadReuseLevel >= 7) ? nextQP : lqp; if (split) m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP); else qprdRefine(parentCTU, childGeom, nextQP, lamdaQP); // Save best CU and pred data for this sub CU splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx); splitPred->addSubCosts(*nd.bestMode); nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx); nextContext = &nd.bestMode->contexts; } else { splitCU->setEmptyPart(childGeom, subPartIdx); // Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions); } } nextContext->store(splitPred->contexts); if (mightNotSplit) addSplitFlagCost(*splitPred, cuGeom.depth); else updateModeCost(*splitPred); if (m_refineLevel) { if (m_param->rdLevel > 1) checkBestMode(*splitPred, cuGeom.depth); else if (splitPred->sa8dCost < md.bestMode->sa8dCost) md.bestMode = splitPred; } checkDQPForSplitPred(*splitPred, cuGeom); /* Copy best data to encData CTU and recon */ md.bestMode->cu.copyToPic(depth); md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[0], parentCTU.m_cuAddr, cuGeom.absPartIdx); } if (m_param->bDynamicRefine && bDecidedDepth) trainCU(parentCTU, cuGeom, *md.bestMode, td); } void Analysis::classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData) { uint32_t depth = cuGeom.depth; trainData.cuVariance = calculateCUVariance(ctu, cuGeom); if (m_frame->m_classifyFrame) { uint64_t diffRefine[X265_REFINE_INTER_LEVELS]; uint64_t diffRefineRd[X265_REFINE_INTER_LEVELS]; float probRefine[X265_REFINE_INTER_LEVELS] = { 0 }; uint8_t varRefineLevel = 1; uint8_t rdRefineLevel = 1; uint64_t cuCost = bestMode.rdCost; int offset = (depth * X265_REFINE_INTER_LEVELS); if (cuCost < m_frame->m_classifyRd[offset]) m_refineLevel = 1; else { uint64_t trainingCount = 0; for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++) { offset = (depth * X265_REFINE_INTER_LEVELS) + i; trainingCount += m_frame->m_classifyCount[offset]; } for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++) { offset = (depth * X265_REFINE_INTER_LEVELS) + i; /* Calculate distance values */ diffRefine[i] = abs((int64_t)(trainData.cuVariance - m_frame->m_classifyVariance[offset])); diffRefineRd[i] = abs((int64_t)(cuCost - m_frame->m_classifyRd[offset])); /* Calculate prior probability - ranges between 0 and 1 */ if (trainingCount) probRefine[i] = 
((float)m_frame->m_classifyCount[offset] / (float)trainingCount); /* Bayesian classification - P(c|x)P(x) = P(x|c)P(c). P(c|x) is the posterior probability of the class given the predictor. P(c) is the prior probability of the class. P(x|c) is the likelihood, i.e. the probability of the predictor given the class. P(x) is the prior probability of the predictor. The comparisons below cross-multiply the two candidate posteriors so that no division is needed. */ int curRefineLevel = m_refineLevel - 1; if ((diffRefine[i] * probRefine[curRefineLevel]) < (diffRefine[curRefineLevel] * probRefine[i])) varRefineLevel = i + 1; if ((diffRefineRd[i] * probRefine[curRefineLevel]) < (diffRefineRd[curRefineLevel] * probRefine[i])) rdRefineLevel = i + 1; } m_refineLevel = X265_MAX(varRefineLevel, rdRefineLevel); } } } void Analysis::trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData) { uint32_t depth = cuGeom.depth; int classify = 1; if (!m_frame->m_classifyFrame) { /* classify = 1 : CUs for which the saved data matches the data after encoding with refine-inter 3, and CUs that have split. classify = 2 : CUs which are encoded as simple modes (Skip/Merge/2Nx2N). classify = 3 : CUs encoded as any other mode. */ bool refineInter0 = (trainData.predMode == ctu.m_predMode[cuGeom.absPartIdx] && trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx] && trainData.mergeFlag == ctu.m_mergeFlag[cuGeom.absPartIdx]); bool refineInter1 = (depth == m_param->maxCUDepth - 1) && trainData.split; if (refineInter0 || refineInter1) classify = 1; else if (trainData.partSize == SIZE_2Nx2N && trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx]) classify = 2; else classify = 3; } else classify = m_refineLevel; uint64_t cuCost = bestMode.rdCost; int offset = (depth * X265_REFINE_INTER_LEVELS) + classify - 1; ctu.m_collectCURd[offset] += cuCost; ctu.m_collectCUVariance[offset] += trainData.cuVariance; ctu.m_collectCUCount[offset]++; } /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom) { uint32_t depth = cuGeom.depth; ModeDepth& md = m_modeDepth[depth]; Yuv *fencYuv = &md.fencYuv; /* Note that these two Mode instances are named MERGE and SKIP but they may hold the reverse when the function returns; we toggle between the two modes. */
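/* Illustrative sketch (not encoder code) of the ping-pong pattern used
 * below: each merge candidate is evaluated into tempPred, and the two
 * pointers are swapped whenever the candidate wins, so bestPred always
 * holds the winner without copying any pixel buffers. evaluateCandidate()
 * is a hypothetical stand-in for the MC + sa8d costing done inline below:
 *
 *     Mode* tempPred = &merge, *bestPred = &skip;
 *     for (uint32_t i = 0; i < numMergeCand; i++)
 *     {
 *         evaluateCandidate(i, *tempPred);     // hypothetical helper
 *         if (tempPred->sa8dCost < bestPred->sa8dCost)
 *             std::swap(tempPred, bestPred);   // O(1) pointer swap
 *     }
 */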
Mode* tempPred = &merge; Mode* bestPred = &skip; X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n"); tempPred->initCosts(); tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N); tempPred->cu.setPredModeSubParts(MODE_INTER); tempPred->cu.m_mergeFlag[0] = true; bestPred->initCosts(); bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N); bestPred->cu.setPredModeSubParts(MODE_INTER); bestPred->cu.m_mergeFlag[0] = true; MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists uint8_t candDir[MRG_MAX_NUM_CANDS]; uint32_t numMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, candMvField, candDir); PredictionUnit pu(merge.cu, cuGeom, 0); bestPred->sa8dCost = MAX_INT64; int bestSadCand = -1; int sizeIdx = cuGeom.log2CUSize - 2; int safeX, maxSafeMv; if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE) { safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3; maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4; } for (uint32_t i = 0; i < numMergeCand; ++i) { if (m_bFrameParallel) { // Parallel slices bound check if (m_param->maxSlices > 1) { // NOTE: MVs in the first row of a slice can't be negative if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY) continue; // The last row in a slice can't reference beyond its bound, since that area belongs to another slice // TODO: we may allow references beyond the bound in the future, since those areas have a chance to be finished when parallel slices are used; this needs research on load balancing if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY) continue; } if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 || candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4) continue; } if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE && tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol && candMvField[i][0].mv.x > maxSafeMv) // skip merge candidates which reference beyond the safe reference area continue; tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx X265_CHECK(m_slice->m_sliceType == B_SLICE || !(candDir[i] & 0x10), " invalid merge for P slice\n"); tempPred->cu.m_interDir[0] = candDir[i]; tempPred->cu.m_mv[0][0] = candMvField[i][0].mv; tempPred->cu.m_mv[1][0] = candMvField[i][1].mv; tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx; tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx; motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)); tempPred->sa8dBits = getTUBits(i, numMergeCand); tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size); if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)) { tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize); tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize); } tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits); if (tempPred->sa8dCost < bestPred->sa8dCost) { bestSadCand = i; std::swap(tempPred, bestPred); } } /* force the mode decision to take inter or intra when no merge candidate was valid */ if (bestSadCand < 0) return;
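/* Note on the candidate costing above: sa8dCost folds the merge index
 * signalling cost into the distortion via calcRdSADCost(), roughly
 * distortion + lambda * bits. Assuming the usual truncated-unary coding of
 * the merge index (which is what getTUBits() is understood to model),
 * index i costs i + 1 bins except for the last candidate, which omits the
 * terminating bin:
 *
 *     bits(i) = i + (i < numMergeCand - 1 ? 1 : 0)
 */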
/* calculate the motion compensation for chroma for the best mode selected */ if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* otherwise chroma MC was already done above */ motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true); if (m_param->rdLevel) { if (m_param->bLossless) bestPred->rdCost = MAX_INT64; else encodeResAndCalcRdSkipCU(*bestPred); /* Encode with residual */ tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand; tempPred->cu.setPUInterDir(candDir[bestSadCand], 0, 0); tempPred->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0); tempPred->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0); tempPred->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0); tempPred->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0); tempPred->sa8dCost = bestPred->sa8dCost; tempPred->sa8dBits = bestPred->sa8dBits; tempPred->predYuv.copyFromYuv(bestPred->predYuv); encodeResAndCalcRdInterCU(*tempPred, cuGeom); md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred; } else md.bestMode = bestPred; /* broadcast sets of MV field data */ md.bestMode->cu.setPUInterDir(candDir[bestSadCand], 0, 0); md.bestMode->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0); md.bestMode->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0); md.bestMode->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0); md.bestMode->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0); checkDQP(*md.bestMode, cuGeom); } /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom) { uint32_t depth = cuGeom.depth; /* Note that these two Mode instances are named MERGE and SKIP but they may hold the reverse when the function returns; we toggle between the two modes. */ Mode* tempPred = &merge; Mode* bestPred = &skip; merge.initCosts(); merge.cu.setPredModeSubParts(MODE_INTER); merge.cu.setPartSizeSubParts(SIZE_2Nx2N); merge.cu.m_mergeFlag[0] = true; skip.initCosts(); skip.cu.setPredModeSubParts(MODE_INTER); skip.cu.setPartSizeSubParts(SIZE_2Nx2N); skip.cu.m_mergeFlag[0] = true; MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists uint8_t candDir[MRG_MAX_NUM_CANDS]; uint32_t numMergeCand = merge.cu.getInterMergeCandidates(0, 0, candMvField, candDir); #if ENABLE_SCC_EXT restrictBipredMergeCand(&merge.cu, 0, candMvField, candDir, numMergeCand); #endif PredictionUnit pu(merge.cu, cuGeom, 0); bool foundCbf0Merge = false; bool triedPZero = false, triedBZero = false; bestPred->rdCost = MAX_INT64; int safeX, maxSafeMv; if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE) { safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3; maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4; } for (uint32_t i = 0; i < numMergeCand; i++) { if (m_bFrameParallel) { // Parallel slices bound check if (m_param->maxSlices > 1) { // NOTE: MVs in the first row of a slice can't be negative if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY) continue; // The last row in a slice can't reference beyond its bound, since that area belongs to another slice // TODO: we may allow references beyond the bound in the future, since those areas have a chance to be finished when parallel slices are used; this needs research on load balancing if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY) continue; } if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 || candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4) continue; } /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */ if (candDir[i] == 1 && !candMvField[i][0].mv.word && !candMvField[i][0].refIdx) { if (triedPZero) continue; triedPZero = true; } else if (candDir[i] == 3 && !candMvField[i][0].mv.word && !candMvField[i][0].refIdx && !candMvField[i][1].mv.word && !candMvField[i][1].refIdx) { if (triedBZero) continue; triedBZero = true; } if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE && tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol && candMvField[i][0].mv.x > maxSafeMv) // skip merge candidates which reference beyond the safe reference area continue; #if ENABLE_SCC_EXT if ((candDir[i] == 1 || candDir[i] == 3) && (m_slice->m_refPOCList[0][candMvField[i][0].refIdx] == m_slice->m_poc)) { continue; } #endif tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */ tempPred->cu.m_interDir[0] = candDir[i]; tempPred->cu.m_mv[0][0] = candMvField[i][0].mv; tempPred->cu.m_mv[1][0] = candMvField[i][1].mv; tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx; tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx; tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */ motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_csp != X265_CSP_I400); uint8_t hasCbf = true; bool swapped = false; if (!foundCbf0Merge) { /* if the best prediction has CBF (not a skip) then try merge with residual */ encodeResAndCalcRdInterCU(*tempPred, cuGeom); hasCbf = tempPred->cu.getQtRootCbf(0); foundCbf0Merge = !hasCbf; if (tempPred->rdCost < bestPred->rdCost) { std::swap(tempPred, bestPred); swapped = true; } } if (!m_param->bLossless && hasCbf) { /* try merge without residual (skip), if not lossless coding */ if (swapped) { tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; tempPred->cu.m_interDir[0] = candDir[i]; tempPred->cu.m_mv[0][0] = candMvField[i][0].mv; tempPred->cu.m_mv[1][0] = candMvField[i][1].mv; tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx; tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx; tempPred->cu.setPredModeSubParts(MODE_INTER); tempPred->predYuv.copyFromYuv(bestPred->predYuv); } encodeResAndCalcRdSkipCU(*tempPred); if (tempPred->rdCost < bestPred->rdCost) std::swap(tempPred, bestPred); } } if (bestPred->rdCost < MAX_INT64) { m_modeDepth[depth].bestMode = bestPred; /* broadcast sets of MV field data */ uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0]; bestPred->cu.setPUInterDir(candDir[bestCand], 0, 0); bestPred->cu.setPUMv(0, candMvField[bestCand][0].mv, 0, 0); bestPred->cu.setPUMv(1, candMvField[bestCand][1].mv, 0, 0); bestPred->cu.setPURefIdx(0, (int8_t)candMvField[bestCand][0].refIdx, 0, 0); bestPred->cu.setPURefIdx(1, (int8_t)candMvField[bestCand][1].refIdx, 0, 0); checkDQP(*bestPred, cuGeom); } } #if ENABLE_SCC_EXT void Analysis::checkRDCostIntraBCMerge2Nx2N(Mode& mergeIBC, const CUGeom& cuGeom) { mergeIBC.initCosts(); MVField cMvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS]; uint32_t numValidMergeCand = 0; CUData cu = mergeIBC.cu; PredictionUnit pu(mergeIBC.cu, cuGeom, 0); mergeIBC.rdCost = MAX_INT64; for (uint32_t ui = 0; ui <
void Analysis::checkRDCostIntraBCMerge2Nx2N(Mode& mergeIBC, const CUGeom& cuGeom)
{
    mergeIBC.initCosts();
    MVField cMvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
    uint32_t numValidMergeCand = 0;
    CUData cu = mergeIBC.cu;
    PredictionUnit pu(mergeIBC.cu, cuGeom, 0);
    mergeIBC.rdCost = MAX_INT64;

    for (uint32_t ui = 0; ui < m_slice->m_maxNumMergeCand; ++ui)
        interDirNeighbours[ui] = 0;

    int org_qp;
    int xPos = cu.m_cuPelX;
    int yPos = cu.m_cuPelY;
    int width = 1 << cu.m_log2CUSize[0];
    int height = 1 << cu.m_log2CUSize[0];
    uint8_t depth = cu.m_cuDepth[0];
    mergeIBC.cu.setPartSizeSubParts(SIZE_2Nx2N);
    Mode tempPred = m_modeDepth[depth].pred[PRED_MERGE_IBC];

    numValidMergeCand = cu.getInterMergeCandidates(0, 0, cMvFieldNeighbours, interDirNeighbours);
    cu.roundMergeCandidates(cMvFieldNeighbours, numValidMergeCand);
    restrictBipredMergeCand(&cu, 0, cMvFieldNeighbours, interDirNeighbours, numValidMergeCand);

    for (uint32_t mergeCand = 0; mergeCand < numValidMergeCand; ++mergeCand)
    {
        if (interDirNeighbours[mergeCand] != 1)
            continue;
        if (m_slice->m_refPOCList[0][cMvFieldNeighbours[mergeCand][0].refIdx] != m_slice->m_poc)
            continue;
        if (!isBlockVectorValid(xPos, yPos, width, height, &cu, 0, 0,
            (cMvFieldNeighbours[mergeCand][0].mv.x >> 2), (cMvFieldNeighbours[mergeCand][0].mv.y >> 2), m_param->maxCUSize))
            continue;

        // set MC parameters
        cu.setPredModeSubParts(MODE_INTER);
        cu.setPartSizeSubParts(SIZE_2Nx2N);
        cu.m_mergeFlag[0] = true;
        cu.m_mvpIdx[0][0] = mergeCand;
        cu.setPUInterDir(interDirNeighbours[mergeCand], 0, 0);
        cu.setPUMv(0, cMvFieldNeighbours[mergeCand][0].mv, 0, 0);
        cu.setPUMv(1, cMvFieldNeighbours[mergeCand][1].mv, 0, 0);
        cu.setPURefIdx(0, (int8_t)cMvFieldNeighbours[mergeCand][0].refIdx, 0, 0);
        cu.setPURefIdx(1, (int8_t)cMvFieldNeighbours[mergeCand][1].refIdx, 0, 0);

        motionCompensation(cu, pu, mergeIBC.predYuv, true, m_csp != X265_CSP_I400);
        org_qp = cu.m_qp[0];
        encodeResAndCalcRdInterCU(mergeIBC, cuGeom);
        if (mergeIBC.rdCost < tempPred.rdCost)
            std::swap(mergeIBC, tempPred);
        cu.setQPSubParts(org_qp, 0, depth);
    }
    std::swap(tempPred, mergeIBC);
    checkBestMode(mergeIBC, depth);
    checkDQP(mergeIBC, cuGeom);
}
#endif
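
/* Cost an inter partition for rd levels 0-4: motion search is followed by a
 * sa8d estimate on the predicted pixels (chroma included only when
 * m_bChromaSa8d) instead of a full RDO encode; reference indices may be
 * seeded from, or saved to, reusable analysis data. */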
void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);
    int numPredDir = m_slice->isInterP() ? 1 : 2;

    if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t part = 0; part < numPU; part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
                bestME[i].ref = m_reuseRef[refOffset + index++];
        }
    }

    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
    {
        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t part = 0; part < numPU; part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
            {
                int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
                bestME[i].ref = ref[cuGeom.absPartIdx];
                bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
                bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
            }
        }
    }

    predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400), refMask);

    /* predInterSearch sets interMode.sa8dBits */
    const Yuv& fencYuv = *interMode.fencYuv;
    Yuv& predYuv = interMode.predYuv;
    int part = partitionFromLog2Size(cuGeom.log2CUSize);
    interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
    if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
    {
        interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
        interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
    }
    interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);

    if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
                m_reuseRef[refOffset + index++] = bestME[i].ref;
        }
    }
}
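
/* rd 5/6 counterpart of checkInter_rd0_4: the same analysis-reuse seeding
 * applies, but the candidate is sent through a full RDO encode, so selection
 * is driven by rdCost and the sa8d estimate from predInterSearch is unused. */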
void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2], MV* iMVCandList)
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);
    int numPredDir = m_slice->isInterP() ? 1 : 2;

    if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
                bestME[i].ref = m_reuseRef[refOffset + index++];
        }
    }

    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
    {
        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t part = 0; part < numPU; part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
            {
                int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
                bestME[i].ref = ref[cuGeom.absPartIdx];
                bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
                bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
            }
        }
    }

    predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, refMask, iMVCandList);

    /* predInterSearch sets interMode.sa8dBits, but this is ignored */
    encodeResAndCalcRdInterCU(interMode, cuGeom);

    if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
                m_reuseRef[refOffset + index++] = bestME[i].ref;
        }
    }
}

#if ENABLE_SCC_EXT
void Analysis::checkIntraBC_rd5_6(Mode& intraBCMode, const CUGeom& cuGeom, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc, MV* iMVCandList)
{
    intraBCMode.initCosts();
    intraBCMode.cu.setPartSizeSubParts(ePartSize);
    intraBCMode.cu.setPredModeSubParts(MODE_INTER);
    intraBCMode.cu.setLumaIntraDirSubParts(DC_IDX, 0, cuGeom.depth);
    intraBCMode.cu.setChromIntraDirSubParts(DC_IDX, 0, cuGeom.depth);

    for (int i = 0; i < 2; i++)
        intraBCMode.cu.m_lastIntraBCMv[i] = ibc.m_lastIntraBCMv[i];

    bool bValid = predIntraBCSearch(intraBCMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, ePartSize, testOnlyPred, bUse1DSearchFor8x8, ibc);
    if (bValid)
        encodeResAndCalcRdInterCU(intraBCMode, cuGeom);
    else
        intraBCMode.rdCost = UINT64_MAX;

    if (bValid && (intraBCMode.cu.m_log2CUSize[0] <= 4) && (intraBCMode.cu.m_partSize[0] == SIZE_2NxN || intraBCMode.cu.m_partSize[0] == SIZE_Nx2N))
    {
        int dummyWidth, dummyHeight;
        uint32_t partAddr = 0;
        intraBCMode.cu.getPartIndexAndSize(1, partAddr, dummyWidth, dummyHeight);
        iMVCandList[0] = intraBCMode.cu.m_mv[0][0];
        iMVCandList[1] = intraBCMode.cu.m_mv[0][partAddr];
    }
}
#endif
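
/* Estimate the cost of bidirectional prediction from the best unidirectional
 * 2Nx2N L0 and L1 results without a fresh motion search: average the two
 * predictions, cost them with sa8d, and additionally probe the zero-MV pair,
 * which is often cheaper for near-static content. */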
void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
{
    CUData& cu = bidir2Nx2N.cu;

#if ENABLE_SCC_EXT
    if ((cu.is8x8BipredRestriction(inter2Nx2N.bestME[0][0].mv, inter2Nx2N.bestME[0][1].mv, inter2Nx2N.bestME[0][0].ref, inter2Nx2N.bestME[0][1].ref) ? (cu.m_cuDepth[0] == 3) : cu.isBipredRestriction()) ||
        inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
#else
    if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
#endif
    {
        bidir2Nx2N.sa8dCost = MAX_INT64;
        bidir2Nx2N.rdCost = MAX_INT64;
        return;
    }

    const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
    MV mvzero(0, 0);
    int partEnum = cuGeom.log2CUSize - 2;

    bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
    bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
    MotionData* bestME = bidir2Nx2N.bestME[0];
    int ref0    = bestME[0].ref;
    MV  mvp0    = bestME[0].mvp;
    int mvpIdx0 = bestME[0].mvpIdx;
    int ref1    = bestME[1].ref;
    MV  mvp1    = bestME[1].mvp;
    int mvpIdx1 = bestME[1].mvpIdx;

    bidir2Nx2N.initCosts();
    cu.setPartSizeSubParts(SIZE_2Nx2N);
    cu.setPredModeSubParts(MODE_INTER);
    cu.setPUInterDir(3, 0, 0);
    cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
    cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
    cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
    cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
    cu.m_mergeFlag[0] = 0;

    /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
    cu.setPUMv(0, bestME[0].mv, 0, 0);
    cu.m_mvd[0][0] = bestME[0].mv - mvp0;

    cu.setPUMv(1, bestME[1].mv, 0, 0);
    cu.m_mvd[1][0] = bestME[1].mv - mvp1;

    PredictionUnit pu(cu, cuGeom, 0);
    motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));

    int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
    if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
    {
        /* Add in chroma distortion */
        sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
        sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
    }
    bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
    bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);

    bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
    if (bTryZero)
    {
        /* Do not try zero MV if unidir motion predictors are beyond
         * valid search area */
        MV mvmin, mvmax;
        int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
        setSearchRange(cu, mvzero, merange, mvmin, mvmax);
        mvmax.y += 2; // there is some pad for subpel refine
        mvmin <<= 2;
        mvmax <<= 2;

        bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
        bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
    }
    if (bTryZero)
    {
        /* Estimate cost of BIDIR using coincident blocks */
        Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;

        int zsa8d;

        if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
        {
            cu.m_mv[0][0] = mvzero;
            cu.m_mv[1][0] = mvzero;

            motionCompensation(cu, pu, tmpPredYuv, true, true);
            zsa8d  = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
            zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
            zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
        }
        else
        {
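            /* Chroma is not part of the sa8d decision here, so the zero-MV
             * luma cost can be estimated by directly averaging the two
             * co-located reference blocks with pixelavg_pp, avoiding a full
             * motion compensation call. */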
            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
            intptr_t refStride = m_slice->m_mref[0][0].lumaStride;

            primitives.pu[partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
            zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
        }

        uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
        uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
        uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);

        /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
        mvp0 = checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvpIdx0, bits0, zcost);
        mvp1 = checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvpIdx1, bits1, zcost);

        uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
        zcost = zsa8d + m_rdCost.getCost(zbits);

        if (zcost < bidir2Nx2N.sa8dCost)
        {
            bidir2Nx2N.sa8dBits = zbits;
            bidir2Nx2N.sa8dCost = zcost;

            cu.setPUMv(0, mvzero, 0, 0);
            cu.m_mvd[0][0] = mvzero - mvp0;
            cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;

            cu.setPUMv(1, mvzero, 0, 0);
            cu.m_mvd[1][0] = mvzero - mvp1;
            cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;

            if (m_bChromaSa8d) /* real MC was already performed */
                bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
            else
                motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400);
        }
        else if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
        {
            /* recover overwritten motion vectors */
            cu.m_mv[0][0] = bestME[0].mv;
            cu.m_mv[1][0] = bestME[1].mv;
        }
    }
}
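
/* Finalize a CTU whose mode decisions were made with sa8d costs and whose
 * recon was deferred (the low rd-levels described at the top of this file):
 * re-walk the chosen CU tree, compute the true residual, transform/quantize
 * it, and write reconstructed pixels into the frame's recon picture. */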
void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
{
    if (cuGeom.depth < ctu.m_cuDepth[cuGeom.absPartIdx] && cuGeom.depth < ctu.m_encData->m_param->maxCUDepth)
    {
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
                encodeResidue(ctu, childGeom);
        }
        return;
    }

    uint32_t absPartIdx = cuGeom.absPartIdx;
    int sizeIdx = cuGeom.log2CUSize - 2;

    /* reuse the bestMode data structures at the current depth */
    Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
    CUData& cu = bestMode->cu;

    cu.copyFromPic(ctu, cuGeom, m_csp);

    PicYuv& reconPic = *m_frame->m_reconPic[0];

    Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
    if (cuGeom.depth)
        m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
    X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");

    if (cu.isIntra(0))
    {
        ProfileCUScope(ctu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); // not really RDO, but close enough

        uint32_t tuDepthRange[2];
        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

        residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            getBestIntraModeChroma(*bestMode, cuGeom);
            residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
        }
    }
    else // if (cu.isInter(0))
    {
        ProfileCUScope(ctu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]); // not really RDO, but close enough

        X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");

        /* Calculate residual for current CU part into depth sized resiYuv */
        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;

        /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
        Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
        pixel* predY = predYuv.getLumaAddr(absPartIdx);

        primitives.cu[sizeIdx].sub_ps(resiYuv.m_buf[0], resiYuv.m_size, fencYuv.m_buf[0], predY, fencYuv.m_size, predYuv.m_size);

        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            pixel* predU = predYuv.getCbAddr(absPartIdx);
            pixel* predV = predYuv.getCrAddr(absPartIdx);
            primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[1], resiYuv.m_csize, fencYuv.m_buf[1], predU, fencYuv.m_csize, predYuv.m_csize);
            primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[2], resiYuv.m_csize, fencYuv.m_buf[2], predV, fencYuv.m_csize, predYuv.m_csize);
        }

        uint32_t tuDepthRange[2];
        cu.getInterTUQtDepthRange(tuDepthRange, 0);

        residualTransformQuantInter(*bestMode, cuGeom, 0, 0, tuDepthRange);

        if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
            cu.setPredModeSubParts(MODE_SKIP);

        /* residualTransformQuantInter() wrote transformed residual back into
         * resiYuv. Generate the recon pixels by adding it to the prediction */
        if (cu.m_cbf[0][0])
        {
            bool reconPicAlign = (reconPic.m_cuOffsetY[cu.m_cuAddr] + reconPic.m_buOffsetY[absPartIdx]) % 64 == 0;
            bool predYalign = predYuv.getAddrOffset(absPartIdx, predYuv.m_size) % 64 == 0;
            primitives.cu[sizeIdx].add_ps[reconPicAlign && predYalign && (reconPic.m_stride % 64 == 0) && (predYuv.m_size % 64 == 0) && (resiYuv.m_size % 64 == 0)](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride, predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
        }
        else
            primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride, predY, predYuv.m_size);

        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            pixel* predU = predYuv.getCbAddr(absPartIdx);
            pixel* predV = predYuv.getCrAddr(absPartIdx);
            if (cu.m_cbf[1][0])
            {
                bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
                bool predUalign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
                primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predUalign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) && (resiYuv.m_csize % 64 == 0)](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
            }
            else
                primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predU, predYuv.m_csize);

            if (cu.m_cbf[2][0])
            {
                bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
                bool predValign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
                primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predValign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) && (resiYuv.m_csize % 64 == 0)](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
            }
            else
                primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predV, predYuv.m_csize);
        }
    }

    cu.updatePic(cuGeom.depth, m_frame->m_fencPic->m_picCsp);
}

void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
{
    if (m_param->rdLevel >= 3)
    {
        /* code the split flag (0 or 1) and update bit costs */
        mode.contexts.resetBits();
        mode.contexts.codeSplitFlag(mode.cu, 0, depth);
        uint32_t bits = mode.contexts.getNumberOfWrittenBits();
        mode.totalBits += bits;
        updateModeCost(mode);
    }
    else if (m_param->rdLevel <= 1)
    {
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }
    else
    {
        mode.totalBits++;
        updateModeCost(mode);
    }
}
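
/* Illustrative example for the heuristic below (numbers not from the source):
 * for a 64x64 CTU (256 partitions) whose co-located CTUs in both lists are
 * coded entirely at depth 1, minDepth = 1 and thresh = 1 * 2 * 64 = 128; the
 * sampled depth sum is 64 + 64 = 128 <= thresh + thresh/2 = 192, so when QP
 * is not dropping the returned minimum depth is relaxed from 1 to 0. */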
uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
{
    /* Do not attempt to code a block larger than the largest block in the
     * co-located CTUs in L0 and L1 */
    int currentQP = parentCTU.m_qp[0];
    int previousQP = currentQP;
    uint32_t minDepth0 = 4, minDepth1 = 4;
    uint32_t sum = 0;
    int numRefs = 0;
    /* with SCC enabled, list 0 also holds the current picture itself, so a
     * real temporal reference requires at least two entries */
    int refPresent = (!m_slice->m_param->bEnableSCC && m_slice->m_numRefIdx[0]) || (m_slice->m_param->bEnableSCC && (m_slice->m_numRefIdx[0] - 1));
    if (refPresent)
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refFrameList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        previousQP = cu.m_qp[0];
        if (!cu.m_cuDepth[cuGeom.absPartIdx])
            return 0;
        for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
            minDepth0 = X265_MIN(d, minDepth0);
            sum += d;
        }
    }
    if (m_slice->m_numRefIdx[1])
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refFrameList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        if (!cu.m_cuDepth[cuGeom.absPartIdx])
            return 0;
        for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
            minDepth1 = X265_MIN(d, minDepth1);
            sum += d;
        }
    }
    if (!numRefs)
        return 0;

    uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
    uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);

    /* allow block size growth if QP is rising or the average co-located depth
     * is less than 1.5x the minimum depth */
    if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
        minDepth -= 1;

    return minDepth;
}

/* returns true if recursion should be stopped */
bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode)
{
    /* early exit when the RD cost of the best mode at depth "n" is below a
     * weighted average of the RD costs recorded at depth "n" for this CTU and
     * for its neighbor CTUs (above, above-left, above-right, left) */
    uint32_t depth = cuGeom.depth;
    FrameData& curEncData = *m_frame->m_encData;
    FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
    uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
    uint64_t cuCount = cuStat.count[depth];

    uint64_t neighCost = 0, neighCount = 0;
    const CUData* above = parentCTU.m_cuAbove;
    if (above)
    {
        FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr];
        neighCost += astat.avgCost[depth] * astat.count[depth];
        neighCount += astat.count[depth];

        const CUData* aboveLeft = parentCTU.m_cuAboveLeft;
        if (aboveLeft)
        {
            FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr];
            neighCost += lstat.avgCost[depth] * lstat.count[depth];
            neighCount += lstat.count[depth];
        }

        const CUData* aboveRight = parentCTU.m_cuAboveRight;
        if (aboveRight)
        {
            FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr];
            neighCost += rstat.avgCost[depth] * rstat.count[depth];
            neighCount += rstat.count[depth];
        }
    }
    const CUData* left = parentCTU.m_cuLeft;
    if (left)
    {
        FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr];
        neighCost += nstat.avgCost[depth] * nstat.count[depth];
        neighCount += nstat.count[depth];
    }

    // give 60% weight to this CTU's CUs and 40% weight to neighbour CUs
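    /* e.g. cuCost = 1000 over cuCount = 2 CUs and neighCost = 2400 over
     * neighCount = 4 CUs give avgCost = (3*1000 + 2*2400) / (3*2 + 2*4)
     * = 7800 / 14 = 557; recursion stops when the current best cost is
     * below this (nonzero) average. */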
    if (neighCount + cuCount)
    {
        uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
        uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
        if (curCost < avgCost && avgCost)
            return true;
    }

    return false;
}

bool Analysis::complexityCheckCU(const Mode& bestMode)
{
    if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
    {
        uint32_t mean = 0;
        uint32_t homo = 0;
        uint32_t cuSize = bestMode.fencYuv->m_size;
        for (uint32_t y = 0; y < cuSize; y++)
            for (uint32_t x = 0; x < cuSize; x++)
                mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);
        mean = mean / (cuSize * cuSize);
        for (uint32_t y = 0; y < cuSize; y++)
            for (uint32_t x = 0; x < cuSize; x++)
                homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));
        homo = homo / (cuSize * cuSize);

        if (homo < (.1 * mean))
            return true;

        return false;
    }
    else
    {
        int blockType = bestMode.cu.m_log2CUSize[0] - LOG2_UNIT_SIZE;
        int shift = bestMode.cu.m_log2CUSize[0] * LOG2_UNIT_SIZE;
        intptr_t stride = m_frame->m_fencPic->m_stride;
        intptr_t blockOffsetLuma = bestMode.cu.m_cuPelX + bestMode.cu.m_cuPelY * stride;
        uint64_t sum_ss = primitives.cu[blockType].var(m_frame->m_edgeBitPic + blockOffsetLuma, stride);
        uint32_t sum = (uint32_t)sum_ss;
        uint32_t ss = (uint32_t)(sum_ss >> 32);
        uint32_t pixelCount = 1 << shift;
        double cuEdgeVariance = (ss - ((double)sum * sum / pixelCount)) / pixelCount;

        if (cuEdgeVariance > (double)m_param->edgeVarThreshold)
            return false;
        else
            return true;
    }
}

uint32_t Analysis::calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom)
{
    uint32_t cuVariance = 0;
    uint32_t *blockVariance = m_frame->m_lowres.blockVariance;
    int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;

    uint32_t width = m_frame->m_fencPic->m_picWidth;
    uint32_t height = m_frame->m_fencPic->m_picHeight;
    uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
    uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
    uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
    uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
    uint32_t cnt = 0;

    for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
    {
        for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
        {
            uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
            cuVariance += blockVariance[idx];
            cnt++;
        }
    }
    return cuVariance / cnt;
}

double Analysis::aqQPOffset(const CUData& ctu, const CUGeom& cuGeom)
{
    uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
    PicQPAdaptationLayer* pQPLayer = &m_frame->m_lowres.pAQLayer[aqDepth];

    uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pQPLayer->aqPartWidth;
    uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pQPLayer->aqPartHeight;

    uint32_t aqStride = pQPLayer->numAQPartInWidth;

    double dQpOffset = pQPLayer->dQpOffset[aqPosY * aqStride + aqPosX];
    return dQpOffset;
}

double Analysis::cuTreeQPOffset(const CUData& ctu, const CUGeom& cuGeom)
{
    uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
    PicQPAdaptationLayer* pcAQLayer = &m_frame->m_lowres.pAQLayer[aqDepth];

    uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pcAQLayer->aqPartWidth;
    uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pcAQLayer->aqPartHeight;

    uint32_t aqStride = pcAQLayer->numAQPartInWidth;

    double dQpOffset = pcAQLayer->dCuTreeOffset[aqPosY * aqStride + aqPosX];
    return dQpOffset;
}
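
/* Derive the QP for this CU: start from the frame base QP, then apply, in
 * order, a multi-pass distortion offset, cuQP offsets stored with reusable
 * analysis data (reuse level >= 2, early return), and either hevc-aq or
 * classic AQ/cuTree lowres offsets; the result is clipped to
 * [rc.qpMin, rc.qpMax]. With a nonzero complexCheck the function instead
 * returns a 0/1 decision used by dynamic rd. */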
int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp)
{
    FrameData& curEncData = *m_frame->m_encData;
    double qp = baseQp >= 0 ? baseQp : curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
    bool bCuTreeOffset = IS_REFERENCED(m_frame) && m_param->rc.cuTree && !complexCheck;

    if ((m_param->analysisMultiPassDistortion && m_param->rc.bStatRead) ||
        (m_param->ctuDistortionRefine && m_param->analysisLoad))
    {
        x265_analysis_distortion_data* distortionData = m_frame->m_analysisData.distortionData;
        if ((distortionData->threshold[ctu.m_cuAddr] < 0.9 || distortionData->threshold[ctu.m_cuAddr] > 1.1) &&
            distortionData->highDistortionCtuCount && distortionData->lowDistortionCtuCount)
            qp += distortionData->offset[ctu.m_cuAddr];
    }

    if (m_param->analysisLoadReuseLevel >= 2 && m_param->rc.cuTree)
    {
        int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
        if (ctu.m_slice->m_sliceType == I_SLICE)
            return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_intra_data*)m_frame->m_analysisData.intraData)->cuQPOff[cuIdx]));
        else
            return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_inter_data*)m_frame->m_analysisData.interData)->cuQPOff[cuIdx]));
    }
    if (m_param->rc.hevcAq)
    {
        /* Use cuTree offsets if cuTree is enabled and the frame is referenced, else use AQ offsets */
        double dQpOffset = 0;
        if (bCuTreeOffset)
            dQpOffset = cuTreeQPOffset(ctu, cuGeom);
        else
        {
            dQpOffset = aqQPOffset(ctu, cuGeom);
            if (complexCheck)
            {
                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
                return (offset < max_threshold);
            }
        }
        qp += dQpOffset;
    }
    else
    {
        int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;

        /* Use cuTree offsets if cuTree is enabled and the frame is referenced, else use AQ offsets */
        double *qpoffs = bCuTreeOffset ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
        if (qpoffs)
        {
            uint32_t width = m_frame->m_fencPic->m_picWidth;
            uint32_t height = m_frame->m_fencPic->m_picHeight;
            uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
            uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
            uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
            uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
            double dQpOffset = 0;
            uint32_t cnt = 0;

            for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
            {
                for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
                {
                    uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
                    dQpOffset += qpoffs[idx];
                    cnt++;
                }
            }
            dQpOffset /= cnt;
            qp += dQpOffset;

            if (complexCheck)
            {
                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
                return (offset < max_threshold);
            }
        }
    }

    return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)(qp + 0.5));
}
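
/* SSIM-RD normalization denominators. C1 and C2 are the usual SSIM constants
 * (K1 = 0.01, K2 = 0.03) scaled by the accumulation factors used below; for
 * 8-bit video (PIXEL_MAX = 255):
 *   ssim_c1 = 0.01^2 * 255^2 * 64 + 0.5      ~= 416
 *   ssim_c2 = 0.03^2 * 255^2 * 64 * 63 + 0.5 ~= 235963 */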
void Analysis::normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype)
{
    static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5);      // 416
    static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5); // 235963

    int shift = (X265_DEPTH - 8);

    double s = 1 + 0.005 * qp;

    // Calculate the denominator of the normalization factor
    uint64_t fDc_den = 0, fAc_den = 0;

    // 1. Calculate the DC component
    uint64_t z_o = 0;
    for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 4)
    {
        for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 4)
        {
            uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
            z_o += temp * temp; // accumulate Z(0)^2
        }
    }
    fDc_den = (2 * z_o) + (blockSize * blockSize * ssim_c1); // 2 * Z(0)^2 + N * C1
    fDc_den /= ((blockSize >> 2) * (blockSize >> 2));

    // 2. Calculate the AC component
    uint64_t z_k = 0;
    int block = (int)(((log((double)blockSize) / log((double)2)) - 2) + 0.5);
    primitives.cu[block].normFact(src, blockSize, shift, &z_k);

    // Remove the DC part
    z_k -= z_o;

    fAc_den = z_k + int(s * z_k) + ssim_c2;
    fAc_den /= ((blockSize >> 2) * (blockSize >> 2));

    ctu.m_fAc_den[ttype] = fAc_den;
    ctu.m_fDc_den[ttype] = fDc_den;
}

void Analysis::calculateNormFactor(CUData& ctu, int qp)
{
    const pixel* srcY = m_modeDepth[0].fencYuv.m_buf[0];
    uint32_t blockSize = m_modeDepth[0].fencYuv.m_size;

    normFactor(srcY, blockSize, ctu, qp, TEXT_LUMA);

    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
    {
        const pixel* srcU = m_modeDepth[0].fencYuv.m_buf[1];
        const pixel* srcV = m_modeDepth[0].fencYuv.m_buf[2];
        uint32_t blockSizeC = m_modeDepth[0].fencYuv.m_csize;

        normFactor(srcU, blockSizeC, ctu, qp, TEXT_CHROMA_U);
        normFactor(srcV, blockSizeC, ctu, qp, TEXT_CHROMA_V);
    }
}

int Analysis::findSameContentRefCount(const CUData& parentCTU, const CUGeom& cuGeom)
{
    int sameContentRef = 0;
    int m_curPoc = parentCTU.m_slice->m_poc;
    int prevChange = m_prevCtuInfoChange[cuGeom.absPartIdx];
    int numPredDir = m_slice->isInterP() ? 1 : 2;
    for (int list = 0; list < numPredDir; list++)
    {
        for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
        {
            int refPoc = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_poc;
#if ENABLE_SCC_EXT
            if (refPoc == m_curPoc)
                continue;
#endif
            int refPrevChange = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_addOnPrevChange[parentCTU.m_cuAddr][cuGeom.absPartIdx];
            if ((refPoc < prevChange && refPoc < m_curPoc) ||
                (refPoc > m_curPoc && prevChange < m_curPoc && refPrevChange > m_curPoc) ||
                ((refPoc == prevChange) && (m_additionalCtuInfo[cuGeom.absPartIdx] == CTU_INFO_CHANGE)))
                sameContentRef++; /* Content changed */
        }
    }
    return sameContentRef;
}