From 8b9fbd46c68c3d19566b1dd53267a2ac33281e35 Mon Sep 17 00:00:00 2001 From: Montes Claros Date: Thu, 12 Oct 2017 21:12:39 -0700 Subject: [PATCH 1/2] Implementation of low-pass subband dct approximation. --- readme.rst | 9 +- source/CMakeLists.txt | 2 +- source/common/CMakeLists.txt | 2 +- source/common/lowpassdct.cpp | 233 +++++++++++++++++++++++++++++++++++ source/common/param.cpp | 6 +- source/common/primitives.cpp | 4 + source/x265.h | 5 + source/x265cli.h | 2 + 8 files changed, 259 insertions(+), 4 deletions(-) create mode 100644 source/common/lowpassdct.cpp diff --git a/readme.rst b/readme.rst index e698899403..85f2e55aed 100644 --- a/readme.rst +++ b/readme.rst @@ -11,4 +11,11 @@ source HEVC encoder. See the developer wiki for instructions for downloading and building the source. x265 is free to use under the `GNU GPL `_ -and is also available under a commercial `license `_ +and is also available under a commercial `license `_ + +## Modifications ## +Here the x265 code was modified to support a lowpass subband approximation for the DCT. +During performance tests this approximation had a gain of around 10% in performance. +Thus allowing encoding time to be reduced at the same rate. + +It also produced very small loss for 23 <= Qp =< 25 and minimal loss for Qp > 25, compared to the standard DCT. 
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 1a8f23a016..ecc303a709 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -29,7 +29,7 @@ option(NATIVE_BUILD "Target the build CPU" OFF) option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 134) +set(X265_BUILD 135) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 541abe6d51..7f2e01b846 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -131,7 +131,7 @@ endif(WIN32) add_library(common OBJECT ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${ALTIVEC_PRIMITIVES} ${WINXP} primitives.cpp primitives.h - pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp + pixel.cpp dct.cpp lowpassdct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp constants.cpp constants.h cpu.cpp cpu.h version.cpp threading.cpp threading.h diff --git a/source/common/lowpassdct.cpp b/source/common/lowpassdct.cpp new file mode 100644 index 0000000000..7130bf936c --- /dev/null +++ b/source/common/lowpassdct.cpp @@ -0,0 +1,233 @@ +/***************************************************************************** + * Copyright (C) 2017 Montez Claros + * + * Authors: Montez Claros + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "common.h" +#include "primitives.h" +//#include "contexts.h" // costCoeffNxN_c +//#include "threading.h" // CLZ + +using namespace X265_NS; + +/* original set of encoder primitives */ +static EncoderPrimitives s_rootPrimitives; + +static void lowPassDct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride) +{ + ALIGN_VAR_32(int16_t, coef[4 * 4]); + ALIGN_VAR_32(int16_t, avgBlock[4 * 4]); + int16_t totalSum = 0; + int16_t sum = 0; + + // Calculate average of 2x2 cells + for (int i = 0; i < 4; i++) + for (int j =0; j < 4; j++) + { + sum = src[2*i*srcStride + 2*j] + src[2*i*srcStride + 2*j + 1] + + src[(2*i+1)*srcStride + 2*j] + src[(2*i+1)*srcStride + 2*j + 1]; + totalSum += sum; + avgBlock[i*4 + j] = sum >> 2; + } + + //dct4 + s_rootPrimitives.cu[BLOCK_4x4].dct(avgBlock, coef, 4); + memset(dst, 0, 64 * sizeof(int16_t)); + for (int i = 0; i < 4; i++) + { + memcpy(&dst[i * 8], &coef[i * 4], 4 * sizeof(int16_t)); + } + + // fix first coef with total block average + dst[0] = totalSum << 1; +} + +static void lowPassDct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride) +{ + ALIGN_VAR_32(int16_t, coef[8 * 8]); + ALIGN_VAR_32(int16_t, avgBlock[8 * 8]); + int32_t totalSum = 0; + int16_t sum = 0; + for (int i = 0; i < 8; i++) + for (int j =0; j < 8; j++) + { + sum = src[2*i*srcStride + 2*j] + src[2*i*srcStride + 2*j + 1] + + src[(2*i+1)*srcStride + 2*j] + src[(2*i+1)*srcStride + 2*j + 1]; + totalSum += sum; + avgBlock[i*8 + j] = sum >> 2; + } + + // dct8 + 
s_rootPrimitives.cu[BLOCK_8x8].dct(avgBlock, coef, 8); + memset(dst, 0, 256 * sizeof(int16_t)); + for (int i = 0; i < 8; i++) + { + memcpy(&dst[i * 16], &coef[i * 8], 8 * sizeof(int16_t)); + } + dst[0] = static_cast(totalSum >> 1); +} + +static void lowPassDct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride) +{ + ALIGN_VAR_32(int16_t, coef[16 * 16]); + ALIGN_VAR_32(int16_t, avgBlock[16 * 16]); + int32_t totalSum = 0; + int16_t sum = 0; + for (int i = 0; i < 16; i++) + for (int j =0; j < 16; j++) + { + sum = src[2*i*srcStride + 2*j] + src[2*i*srcStride + 2*j + 1] + + src[(2*i+1)*srcStride + 2*j] + src[(2*i+1)*srcStride + 2*j + 1]; + totalSum += sum; + avgBlock[i*16 + j] = sum >> 2; + } + + // dct16 + s_rootPrimitives.cu[BLOCK_16x16].dct(avgBlock, coef, 16); + memset(dst, 0, 1024 * sizeof(int16_t)); + for (int i = 0; i < 16; i++) + { + memcpy(&dst[i * 32], &coef[i * 16], 16 * sizeof(int16_t)); + } + dst[0] = static_cast(totalSum >> 3); +} + +/* +static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift) +{ +#if HIGH_BIT_DEPTH + X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > (X265_DEPTH - 8)), "dequant invalid scale %d\n", scale); +#else + // NOTE: maximum of scale is (72 * 256) + X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale); +#endif + X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num); + X265_CHECK((num % 8) == 0, "dequant num %d not multiple of 8\n", num); + X265_CHECK(shift <= 10, "shift too large %d\n", shift); + X265_CHECK(((intptr_t)coef & 31) == 0, "dequant coef buffer not aligned\n"); + + int add, coeffQ; + + add = 1 << (shift - 1); + + for (int n = 0; n < num; n++) + { + coeffQ = (quantCoef[n] * scale + add) >> shift; + coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ); + } +} + +static void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift) +{ + X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", 
num); + + int add, coeffQ; + + shift += 4; + + if (shift > per) + { + add = 1 << (shift - per - 1); + + for (int n = 0; n < num; n++) + { + coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per); + coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ); + } + } + else + { + for (int n = 0; n < num; n++) + { + coeffQ = x265_clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]); + coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ << (per - shift)); + } + } +} + +static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff) +{ + X265_CHECK(qBits >= 8, "qBits less than 8\n"); + X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n"); + int qBits8 = qBits - 8; + uint32_t numSig = 0; + + for (int blockpos = 0; blockpos < numCoeff; blockpos++) + { + int level = coef[blockpos]; + int sign = (level < 0 ? -1 : 1); + + int tmplevel = abs(level) * quantCoeff[blockpos]; + level = ((tmplevel + add) >> qBits); + deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8); + if (level) + ++numSig; + level *= sign; + qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level); + } + + return numSig; +} + +static uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff) +{ + X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n"); + X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n"); + X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n"); + + uint32_t numSig = 0; + + for (int blockpos = 0; blockpos < numCoeff; blockpos++) + { + int level = coef[blockpos]; + int sign = (level < 0 ? 
-1 : 1); + + int tmplevel = abs(level) * quantCoeff[blockpos]; + level = ((tmplevel + add) >> qBits); + if (level) + ++numSig; + level *= sign; + + // TODO: when we limit range to [-32767, 32767], we can get more performance with output change + // But nquant is a little percent in rdoQuant, so I keep old dynamic range for compatible + qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level)); + } + + return numSig; +} +*/ + +namespace X265_NS { +// x265 private namespace + +/* Redirect the 8x8, 16x16 and 32x32 dct entries of p to the low-pass approximations in this file. + * p is copied into s_rootPrimitives first: the approximations call the dct of the next smaller + * block size through that copy, so they must not see the entries that are overwritten below. */ +void setupLowPassPrimitives(EncoderPrimitives& p) +{ + s_rootPrimitives = p; + + //p.dequant_scaling = dequant_scaling_c; + //p.dequant_normal = dequant_normal_c; + //p.quant = quant_c; + //p.nquant = nquant_c; + p.cu[BLOCK_8x8].dct = lowPassDct8_c; + p.cu[BLOCK_16x16].dct = lowPassDct16_c; + p.cu[BLOCK_32x32].dct = lowPassDct32_c; +} +} diff --git a/source/common/param.cpp b/source/common/param.cpp index b12004e3bc..b4afdf3de2 100644 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -287,6 +287,9 @@ void x265_param_default(x265_param* param) param->bUseAnalysisFile = 1; param->csvfpt = NULL; param->forceFlush = 0; + + /* DCT Approximations */ + param->bLowPassDct = 0; } int x265_param_default_preset(x265_param* param, const char* preset, const char* tune) @@ -926,7 +929,8 @@ int x265_param_parse(x265_param* p, const char* name, const char* value) OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2; OPT("min-luma") p->minLuma = (uint16_t)atoi(value); OPT("max-luma") p->maxLuma = (uint16_t)atoi(value); - OPT("uhd-bd") p->uhdBluray = atobool(value); + OPT("lowpass-dct") p->bLowPassDct = atobool(value); + OPT("uhd-bd") p->uhdBluray = atobool(value); else bExtraParams = true; diff --git a/source/common/primitives.cpp b/source/common/primitives.cpp index 211dc2f487..d71ce7b74f 100644 --- a/source/common/primitives.cpp +++ b/source/common/primitives.cpp @@ -58,6 +58,7 @@ void setupIntraPrimitives_c(EncoderPrimitives &p); void 
setupLoopFilterPrimitives_c(EncoderPrimitives &p); void setupSaoPrimitives_c(EncoderPrimitives &p); void setupSeaIntegralPrimitives_c(EncoderPrimitives &p); +void setupLowPassPrimitives(EncoderPrimitives& p); void setupCPrimitives(EncoderPrimitives &p) { @@ -255,6 +256,9 @@ void x265_setup_primitives(x265_param *param) } #endif + if (param->bLowPassDct) + setupLowPassPrimitives(primitives); + setupAliasPrimitives(primitives); } diff --git a/source/x265.h b/source/x265.h index 4c94b07a8c..637daa9b1e 100644 --- a/source/x265.h +++ b/source/x265.h @@ -1485,6 +1485,11 @@ typedef struct x265_param /* Enable skipping split RD analysis when sum of split CU rdCost larger than none split CU rdCost for Intra CU */ int bEnableSplitRdSkip; + + /* Use low-pass truncated dct approximation + * This DCT approximation is less computational intensive and gives results close to + * standard DCT for QP >= 23 */ + int bLowPassDct; } x265_param; /* x265_param_alloc: diff --git a/source/x265cli.h b/source/x265cli.h index 13b28b4da4..91a363dfcc 100644 --- a/source/x265cli.h +++ b/source/x265cli.h @@ -282,6 +282,7 @@ static const struct option long_options[] = { "force-flush", required_argument, NULL, 0 }, { "splitrd-skip", no_argument, NULL, 0 }, { "no-splitrd-skip", no_argument, NULL, 0 }, + { "lowpass-dct", no_argument, NULL, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, @@ -541,6 +542,7 @@ static void showHelp(x265_param *param) H1("-r/--recon Reconstructed raw image YUV or Y4M output file name\n"); H1(" --recon-depth Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n"); H1(" --recon-y4m-exec pipe reconstructed frames to Y4M viewer, ex:\"ffplay -i pipe:0 -autoexit\"\n"); + H0(" --lowpass-dct Use low-pass subband dct approximation. 
Default %s\n", OPT(param->bLowPassDct)); H1("\nExecutable return codes:\n"); H1(" 0 - encode successful\n"); H1(" 1 - unable to parse command line\n"); From 9358c368ab46926df569747ba2f2c2f03fab19cf Mon Sep 17 00:00:00 2001 From: Montes Claros Date: Thu, 12 Oct 2017 21:22:15 -0700 Subject: [PATCH 2/2] Fix readme for lowpass dct --- readme.rst | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/readme.rst b/readme.rst index 85f2e55aed..cc54adc084 100644 --- a/readme.rst +++ b/readme.rst @@ -13,9 +13,21 @@ downloading and building the source. x265 is free to use under the `GNU GPL `_ and is also available under a commercial `license `_ -## Modifications ## +================= +Modifications +================= + Here the x265 code was modified to support a lowpass subband approximation for the DCT. During performance tests this approximation had a gain of around 10% in performance. Thus allowing encoding time to be reduced at the same rate. It also produced very small loss for 23 <= Qp =< 25 and minimal loss for Qp > 25, compared to the standard DCT. + +================= +Usage +================= + +:: + +    ./x265 input_file -o output_file --qp 25 --lowpass-dct +