Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementation of lowpass subband dct approximation with new param --lowpass-dct #3

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion readme.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,23 @@ source HEVC encoder. See the developer wiki for instructions for
downloading and building the source.

x265 is free to use under the `GNU GPL <http://www.gnu.org/licenses/gpl-2.0.html>`_
and is also available under a commercial `license <http://x265.org>`_
and is also available under a commercial `license <http://x265.org>`_

=================
Modifications
=================

Here the x265 code was modified to support a lowpass subband approximation for the DCT.
In performance tests this approximation yielded a gain of around 10% in performance,
allowing encoding time to be reduced at the same rate.

It also produced a very small loss for 23 <= QP <= 25 and minimal loss for QP > 25, compared to the standard DCT.

=================
Usage
=================

::

    ./x265 input_file -o output_file --qp 25 --lowpass-dct

2 changes: 1 addition & 1 deletion source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ option(NATIVE_BUILD "Target the build CPU" OFF)
option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
# X265_BUILD must be incremented each time the public API is changed
set(X265_BUILD 134)
set(X265_BUILD 135)
configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
"${PROJECT_BINARY_DIR}/x265.def")
configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
Expand Down
2 changes: 1 addition & 1 deletion source/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ endif(WIN32)
add_library(common OBJECT
${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${ALTIVEC_PRIMITIVES} ${WINXP}
primitives.cpp primitives.h
pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
pixel.cpp dct.cpp lowpassdct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
constants.cpp constants.h
cpu.cpp cpu.h version.cpp
threading.cpp threading.h
Expand Down
233 changes: 233 additions & 0 deletions source/common/lowpassdct.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
/*****************************************************************************
* Copyright (C) 2017 Montez Claros
*
* Authors: Montez Claros <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/

#include "common.h"
#include "primitives.h"
//#include "contexts.h" // costCoeffNxN_c
//#include "threading.h" // CLZ

using namespace X265_NS;

/* Snapshot of the encoder primitive table, copied by setupLowPassPrimitives()
 * before it overrides the 8x8, 16x16 and 32x32 dct entries. The low-pass DCT
 * functions in this file invoke the half-size dct through this copy rather
 * than through the table they were installed into. */
static EncoderPrimitives s_rootPrimitives;

/* Low-pass approximation of the 8x8 forward DCT.
 *
 * The 8x8 input is reduced to 4x4 by summing each 2x2 cell and shifting the
 * sum right by 2, the 4x4 dct saved in s_rootPrimitives is applied to the
 * reduced block, and the 16 resulting coefficients are placed in the top-left
 * 4x4 corner of the 8x8 output. All other output coefficients are zero.
 *
 * src        input samples, read as 8 rows of 8 values
 * dst        output coefficients, 64 contiguous values (row stride 8)
 * srcStride  distance, in elements, between consecutive rows of src */
static void lowPassDct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, avgBlock[4 * 4]);
    // 32-bit accumulator, consistent with lowPassDct16_c / lowPassDct32_c,
    // so the running total is never narrowed while it is being accumulated.
    int32_t totalSum = 0;
    int16_t sum = 0;

    // Calculate average of 2x2 cells
    for (int i = 0; i < 4; i++)
    {
        for (int j = 0; j < 4; j++)
        {
            sum = src[2 * i * srcStride + 2 * j] + src[2 * i * srcStride + 2 * j + 1]
                + src[(2 * i + 1) * srcStride + 2 * j] + src[(2 * i + 1) * srcStride + 2 * j + 1];
            totalSum += sum; // used below to rebuild the DC coefficient
            avgBlock[i * 4 + j] = sum >> 2;
        }
    }

    // dct4 on the down-sampled block
    s_rootPrimitives.cu[BLOCK_4x4].dct(avgBlock, coef, 4);

    // Zero the whole 8x8 output, then copy the 4x4 result into its top-left corner
    memset(dst, 0, 64 * sizeof(int16_t));
    for (int i = 0; i < 4; i++)
    {
        memcpy(&dst[i * 8], &coef[i * 4], 4 * sizeof(int16_t));
    }

    // Replace the DC coefficient with one derived from the un-rounded block sum.
    // Multiplication is used instead of "<< 1" because left-shifting a negative
    // signed value is undefined behaviour before C++20; the stored value is the same.
    // NOTE(review): the factor 2 presumably matches the DC scaling of the exact
    // 8x8 DCT at 8-bit depth only - confirm for HIGH_BIT_DEPTH builds.
    dst[0] = static_cast<int16_t>(totalSum * 2);
}

/* Low-pass approximation of the 16x16 forward DCT.
 *
 * Each 2x2 cell of the 16x16 input is summed and shifted right by 2 to form
 * an 8x8 block, which is transformed with the 8x8 dct saved in
 * s_rootPrimitives. The 8x8 result fills the top-left quadrant of the 16x16
 * output; every other coefficient is zero. dst[0] is then overwritten with
 * the sum of all input samples shifted right by 1.
 *
 * src        input samples, read as 16 rows of 16 values
 * dst        output coefficients, 256 contiguous values (row stride 16)
 * srcStride  distance, in elements, between consecutive rows of src */
static void lowPassDct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
    const int halfSize = 8;
    const int fullSize = 16;

    ALIGN_VAR_32(int16_t, coef[halfSize * halfSize]);
    ALIGN_VAR_32(int16_t, avgBlock[halfSize * halfSize]);
    int32_t blockSum = 0;

    // Down-sample: one output sample per 2x2 input cell
    for (int row = 0; row < halfSize; row++)
    {
        const int16_t* upper = src + (2 * row) * srcStride;
        const int16_t* lower = src + (2 * row + 1) * srcStride;
        int16_t* avgRow = avgBlock + row * halfSize;

        for (int col = 0; col < halfSize; col++)
        {
            const int16_t cellSum = upper[2 * col] + upper[2 * col + 1]
                                  + lower[2 * col] + lower[2 * col + 1];
            blockSum += cellSum;
            avgRow[col] = cellSum >> 2;
        }
    }

    // dct8 on the down-sampled block
    s_rootPrimitives.cu[BLOCK_8x8].dct(avgBlock, coef, halfSize);

    // Clear the full output, then drop the 8x8 result into the top-left quadrant
    memset(dst, 0, fullSize * fullSize * sizeof(int16_t));
    int16_t* outRow = dst;
    const int16_t* inRow = coef;
    for (int row = 0; row < halfSize; row++, outRow += fullSize, inRow += halfSize)
    {
        memcpy(outRow, inRow, halfSize * sizeof(int16_t));
    }

    // DC coefficient comes from the un-rounded total instead of the dct8 output
    dst[0] = static_cast<int16_t>(blockSum >> 1);
}

/* Low-pass approximation of the 32x32 forward DCT.
 *
 * Each 2x2 cell of the 32x32 input is summed and shifted right by 2 to form
 * a 16x16 block, which is transformed with the 16x16 dct saved in
 * s_rootPrimitives. The 16x16 result fills the top-left quadrant of the
 * 32x32 output; every other coefficient is zero. dst[0] is then overwritten
 * with the sum of all input samples shifted right by 3.
 *
 * src        input samples, read as 32 rows of 32 values
 * dst        output coefficients, 1024 contiguous values (row stride 32)
 * srcStride  distance, in elements, between consecutive rows of src */
static void lowPassDct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
    const int halfSize = 16;
    const int fullSize = 32;

    ALIGN_VAR_32(int16_t, coef[halfSize * halfSize]);
    ALIGN_VAR_32(int16_t, avgBlock[halfSize * halfSize]);
    int32_t blockSum = 0;

    // Down-sample: one output sample per 2x2 input cell
    for (int row = 0; row < halfSize; row++)
    {
        const int16_t* upper = src + (2 * row) * srcStride;
        const int16_t* lower = src + (2 * row + 1) * srcStride;
        int16_t* avgRow = avgBlock + row * halfSize;

        for (int col = 0; col < halfSize; col++)
        {
            const int16_t cellSum = upper[2 * col] + upper[2 * col + 1]
                                  + lower[2 * col] + lower[2 * col + 1];
            blockSum += cellSum;
            avgRow[col] = cellSum >> 2;
        }
    }

    // dct16 on the down-sampled block
    s_rootPrimitives.cu[BLOCK_16x16].dct(avgBlock, coef, halfSize);

    // Clear the full output, then drop the 16x16 result into the top-left quadrant
    memset(dst, 0, fullSize * fullSize * sizeof(int16_t));
    int16_t* outRow = dst;
    const int16_t* inRow = coef;
    for (int row = 0; row < halfSize; row++, outRow += fullSize, inRow += halfSize)
    {
        memcpy(outRow, inRow, halfSize * sizeof(int16_t));
    }

    // DC coefficient comes from the un-rounded total instead of the dct16 output
    dst[0] = static_cast<int16_t>(blockSum >> 3);
}

/*
static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
{
#if HIGH_BIT_DEPTH
X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > (X265_DEPTH - 8)), "dequant invalid scale %d\n", scale);
#else
// NOTE: maximum of scale is (72 * 256)
X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale);
#endif
X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
X265_CHECK((num % 8) == 0, "dequant num %d not multiple of 8\n", num);
X265_CHECK(shift <= 10, "shift too large %d\n", shift);
X265_CHECK(((intptr_t)coef & 31) == 0, "dequant coef buffer not aligned\n");

int add, coeffQ;

add = 1 << (shift - 1);

for (int n = 0; n < num; n++)
{
coeffQ = (quantCoef[n] * scale + add) >> shift;
coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
}
}

static void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
{
X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);

int add, coeffQ;

shift += 4;

if (shift > per)
{
add = 1 << (shift - per - 1);

for (int n = 0; n < num; n++)
{
coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
}
}
else
{
for (int n = 0; n < num; n++)
{
coeffQ = x265_clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ << (per - shift));
}
}
}

static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
{
X265_CHECK(qBits >= 8, "qBits less than 8\n");
X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
int qBits8 = qBits - 8;
uint32_t numSig = 0;

for (int blockpos = 0; blockpos < numCoeff; blockpos++)
{
int level = coef[blockpos];
int sign = (level < 0 ? -1 : 1);

int tmplevel = abs(level) * quantCoeff[blockpos];
level = ((tmplevel + add) >> qBits);
deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
if (level)
++numSig;
level *= sign;
qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
}

return numSig;
}

static uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
{
X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n");

uint32_t numSig = 0;

for (int blockpos = 0; blockpos < numCoeff; blockpos++)
{
int level = coef[blockpos];
int sign = (level < 0 ? -1 : 1);

int tmplevel = abs(level) * quantCoeff[blockpos];
level = ((tmplevel + add) >> qBits);
if (level)
++numSig;
level *= sign;

// TODO: when we limit range to [-32767, 32767], we can get more performance with output change
// But nquant is a little percent in rdoQuant, so I keep old dynamic range for compatible
qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
}

return numSig;
}
*/

namespace X265_NS {
// x265 private namespace

/* Redirect the 8x8, 16x16 and 32x32 forward-dct entries of the primitive
 * table p to the low-pass approximations defined in this file.
 *
 * A copy of p is stored in s_rootPrimitives before any entry is changed,
 * because the approximations call the half-size dct through that copy.
 * The quant/dequant entries of p are left untouched. */
void setupLowPassPrimitives(EncoderPrimitives& p)
{
    // Snapshot first, so the saved table still holds the incoming dct pointers
    s_rootPrimitives = p;

    p.cu[BLOCK_32x32].dct = lowPassDct32_c;
    p.cu[BLOCK_16x16].dct = lowPassDct16_c;
    p.cu[BLOCK_8x8].dct = lowPassDct8_c;
}
}
6 changes: 5 additions & 1 deletion source/common/param.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,9 @@ void x265_param_default(x265_param* param)
param->bUseAnalysisFile = 1;
param->csvfpt = NULL;
param->forceFlush = 0;

/* DCT Approximations */
param->bLowPassDct = 0;
}

int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
Expand Down Expand Up @@ -926,7 +929,8 @@ int x265_param_parse(x265_param* p, const char* name, const char* value)
OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2;
OPT("min-luma") p->minLuma = (uint16_t)atoi(value);
OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
OPT("uhd-bd") p->uhdBluray = atobool(value);
OPT("lowpass-dct") p->bLowPassDct = atobool(value);
OPT("uhd-bd") p->uhdBluray = atobool(value);
else
bExtraParams = true;

Expand Down
4 changes: 4 additions & 0 deletions source/common/primitives.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ void setupIntraPrimitives_c(EncoderPrimitives &p);
void setupLoopFilterPrimitives_c(EncoderPrimitives &p);
void setupSaoPrimitives_c(EncoderPrimitives &p);
void setupSeaIntegralPrimitives_c(EncoderPrimitives &p);
void setupLowPassPrimitives(EncoderPrimitives& p);

void setupCPrimitives(EncoderPrimitives &p)
{
Expand Down Expand Up @@ -255,6 +256,9 @@ void x265_setup_primitives(x265_param *param)
}
#endif

if (param->bLowPassDct)
setupLowPassPrimitives(primitives);

setupAliasPrimitives(primitives);
}

Expand Down
5 changes: 5 additions & 0 deletions source/x265.h
Original file line number Diff line number Diff line change
Expand Up @@ -1485,6 +1485,11 @@ typedef struct x265_param

/* Enable skipping split RD analysis when sum of split CU rdCost larger than none split CU rdCost for Intra CU */
int bEnableSplitRdSkip;

/* Use low-pass truncated dct approximation
* This DCT approximation is less computationally intensive and gives results close to
* standard DCT for QP >= 23 */
int bLowPassDct;
} x265_param;

/* x265_param_alloc:
Expand Down
2 changes: 2 additions & 0 deletions source/x265cli.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ static const struct option long_options[] =
{ "force-flush", required_argument, NULL, 0 },
{ "splitrd-skip", no_argument, NULL, 0 },
{ "no-splitrd-skip", no_argument, NULL, 0 },
{ "lowpass-dct", no_argument, NULL, 0 },
{ 0, 0, 0, 0 },
{ 0, 0, 0, 0 },
{ 0, 0, 0, 0 },
Expand Down Expand Up @@ -541,6 +542,7 @@ static void showHelp(x265_param *param)
H1("-r/--recon <filename> Reconstructed raw image YUV or Y4M output file name\n");
H1(" --recon-depth <integer> Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
H1(" --recon-y4m-exec <string> pipe reconstructed frames to Y4M viewer, ex:\"ffplay -i pipe:0 -autoexit\"\n");
H0(" --lowpass-dct Use low-pass subband dct approximation. Default %s\n", OPT(param->bLowPassDct));
H1("\nExecutable return codes:\n");
H1(" 0 - encode successful\n");
H1(" 1 - unable to parse command line\n");
Expand Down