From 8b9fbd46c68c3d19566b1dd53267a2ac33281e35 Mon Sep 17 00:00:00 2001
From: Montes Claros <mont3z.claro5@gmail.com>
Date: Thu, 12 Oct 2017 21:12:39 -0700
Subject: [PATCH 1/2] Implementation of low-pass subband dct approximation.

---
 readme.rst                   |   9 +-
 source/CMakeLists.txt        |   2 +-
 source/common/CMakeLists.txt |   2 +-
 source/common/lowpassdct.cpp | 233 +++++++++++++++++++++++++++++++++++
 source/common/param.cpp      |   6 +-
 source/common/primitives.cpp |   4 +
 source/x265.h                |   5 +
 source/x265cli.h             |   2 +
 8 files changed, 259 insertions(+), 4 deletions(-)
 create mode 100644 source/common/lowpassdct.cpp

diff --git a/readme.rst b/readme.rst
index e698899403..85f2e55aed 100644
--- a/readme.rst
+++ b/readme.rst
@@ -11,4 +11,11 @@ source HEVC encoder. See the developer wiki for instructions for
 downloading and building the source.
 
 x265 is free to use under the `GNU GPL <http://www.gnu.org/licenses/gpl-2.0.html>`_ 
-and is also available under a commercial `license <http://x265.org>`_ 
+and is also available under a commercial `license <http://x265.org>`_
+
+## Modifications ##
+Here the x265 code was modified to support a lowpass subband approximation for the DCT.
+During performance tests this approximation had a gain of around 10% in performance. 
+Thus allowing encoding time to be reduced at the same rate.
+
+It also produced very small loss for 23 <= Qp =< 25 and minimal loss for Qp > 25, compared to the standard DCT.
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 1a8f23a016..ecc303a709 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -29,7 +29,7 @@ option(NATIVE_BUILD "Target the build CPU" OFF)
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 134)
+set(X265_BUILD 135)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 541abe6d51..7f2e01b846 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -131,7 +131,7 @@ endif(WIN32)
 add_library(common OBJECT
     ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${ALTIVEC_PRIMITIVES} ${WINXP}
     primitives.cpp primitives.h
-    pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
+    pixel.cpp dct.cpp lowpassdct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
     constants.cpp constants.h
     cpu.cpp cpu.h version.cpp
     threading.cpp threading.h
diff --git a/source/common/lowpassdct.cpp b/source/common/lowpassdct.cpp
new file mode 100644
index 0000000000..7130bf936c
--- /dev/null
+++ b/source/common/lowpassdct.cpp
@@ -0,0 +1,233 @@
+/*****************************************************************************
+ * Copyright (C) 2017 Montez Claros 
+ *
+ * Authors: Montez Claros <mont3z.claros@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+//#include "contexts.h"   // costCoeffNxN_c
+//#include "threading.h"  // CLZ
+
+using namespace X265_NS;
+
+/* original set of encoder primitives */
+static EncoderPrimitives s_rootPrimitives;
+
+/* Low-pass approximation of the 8x8 forward DCT: src is averaged in 2x2 cells
+ * down to 4x4, transformed with the saved 4x4 DCT and stored in the top-left
+ * quadrant of dst (row stride 8); every other output coefficient is zero. */
+static void lowPassDct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+{
+    ALIGN_VAR_32(int16_t, coef[4 * 4]);
+    ALIGN_VAR_32(int16_t, avgBlock[4 * 4]);
+    int32_t totalSum = 0; // 32-bit like the 16x16/32x32 variants: 16 cell sums cannot wrap
+
+    // Calculate average of 2x2 cells
+    for (int i = 0; i < 4; i++)
+        for (int j = 0; j < 4; j++)
+        {
+            int16_t sum = src[2*i*srcStride + 2*j] + src[2*i*srcStride + 2*j + 1]
+                    + src[(2*i+1)*srcStride + 2*j] + src[(2*i+1)*srcStride + 2*j + 1];
+            totalSum += sum;
+            avgBlock[i*4 + j] = sum >> 2;
+        }
+
+    // dct4 on the averaged block, through the table saved before the override
+    s_rootPrimitives.cu[BLOCK_4x4].dct(avgBlock, coef, 4);
+    memset(dst, 0, 64 * sizeof(int16_t));
+    for (int i = 0; i < 4; i++)
+        memcpy(&dst[i * 8], &coef[i * 4], 4 * sizeof(int16_t));
+
+    // DC term from the unrounded sum of all samples; multiply, the sum may be negative
+    dst[0] = static_cast<int16_t>(totalSum * 2);
+}
+
+/* Low-pass approximation of the 16x16 forward DCT: src is averaged in 2x2 cells
+ * down to 8x8, transformed with the saved 8x8 DCT and stored in the top-left
+ * quadrant of dst (row stride 16); every other output coefficient is zero. */
+static void lowPassDct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+{
+    ALIGN_VAR_32(int16_t, coef[8 * 8]);
+    ALIGN_VAR_32(int16_t, avgBlock[8 * 8]);
+    int32_t blockSum = 0;
+
+    for (int row = 0; row < 8; row++)
+        for (int col = 0; col < 8; col++)
+        {
+            const int16_t* cell = src + 2 * row * srcStride + 2 * col; // top-left sample of the 2x2 cell
+            int16_t cellSum = cell[0] + cell[1] + cell[srcStride] + cell[srcStride + 1];
+            blockSum += cellSum;
+            avgBlock[row * 8 + col] = cellSum >> 2;
+        }
+
+    s_rootPrimitives.cu[BLOCK_8x8].dct(avgBlock, coef, 8); // dct8 from the table saved before the override
+    memset(dst, 0, 256 * sizeof(int16_t));
+    for (int row = 0; row < 8; row++)
+        memcpy(dst + row * 16, coef + row * 8, 8 * sizeof(int16_t));
+    dst[0] = static_cast<int16_t>(blockSum >> 1); // DC term from the unrounded block sum
+}
+
+/* Low-pass approximation of the 32x32 forward DCT: src is averaged in 2x2 cells
+ * down to 16x16, transformed with the saved 16x16 DCT and stored in the top-left
+ * quadrant of dst (row stride 32); every other output coefficient is zero. */
+static void lowPassDct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+{
+    ALIGN_VAR_32(int16_t, coef[16 * 16]);
+    ALIGN_VAR_32(int16_t, avgBlock[16 * 16]);
+    int32_t blockSum = 0;
+
+    for (int row = 0; row < 16; row++)
+        for (int col = 0; col < 16; col++)
+        {
+            const int16_t* cell = src + 2 * row * srcStride + 2 * col; // top-left sample of the 2x2 cell
+            int16_t cellSum = cell[0] + cell[1] + cell[srcStride] + cell[srcStride + 1];
+            blockSum += cellSum;
+            avgBlock[row * 16 + col] = cellSum >> 2;
+        }
+
+    s_rootPrimitives.cu[BLOCK_16x16].dct(avgBlock, coef, 16); // dct16 from the table saved before the override
+    memset(dst, 0, 1024 * sizeof(int16_t));
+    for (int row = 0; row < 16; row++)
+        memcpy(dst + row * 32, coef + row * 16, 16 * sizeof(int16_t));
+    dst[0] = static_cast<int16_t>(blockSum >> 3); // DC term from the unrounded block sum
+}
+
+/*
+static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
+{
+#if HIGH_BIT_DEPTH
+    X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > (X265_DEPTH - 8)), "dequant invalid scale %d\n", scale);
+#else
+    // NOTE: maximum of scale is (72 * 256)
+    X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale);
+#endif
+    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
+    X265_CHECK((num % 8) == 0, "dequant num %d not multiple of 8\n", num);
+    X265_CHECK(shift <= 10, "shift too large %d\n", shift);
+    X265_CHECK(((intptr_t)coef & 31) == 0, "dequant coef buffer not aligned\n");
+
+    int add, coeffQ;
+
+    add = 1 << (shift - 1);
+
+    for (int n = 0; n < num; n++)
+    {
+        coeffQ = (quantCoef[n] * scale + add) >> shift;
+        coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
+    }
+}
+
+static void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
+{
+    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
+
+    int add, coeffQ;
+
+    shift += 4;
+
+    if (shift > per)
+    {
+        add = 1 << (shift - per - 1);
+
+        for (int n = 0; n < num; n++)
+        {
+            coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
+            coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
+        }
+    }
+    else
+    {
+        for (int n = 0; n < num; n++)
+        {
+            coeffQ   = x265_clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
+            coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ << (per - shift));
+        }
+    }
+}
+
+static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
+{
+    X265_CHECK(qBits >= 8, "qBits less than 8\n");
+    X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
+    int qBits8 = qBits - 8;
+    uint32_t numSig = 0;
+
+    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
+    {
+        int level = coef[blockpos];
+        int sign  = (level < 0 ? -1 : 1);
+
+        int tmplevel = abs(level) * quantCoeff[blockpos];
+        level = ((tmplevel + add) >> qBits);
+        deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
+        if (level)
+            ++numSig;
+        level *= sign;
+        qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
+    }
+
+    return numSig;
+}
+
+static uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
+{
+    X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
+    X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
+    X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n");
+
+    uint32_t numSig = 0;
+
+    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
+    {
+        int level = coef[blockpos];
+        int sign  = (level < 0 ? -1 : 1);
+
+        int tmplevel = abs(level) * quantCoeff[blockpos];
+        level = ((tmplevel + add) >> qBits);
+        if (level)
+            ++numSig;
+        level *= sign;
+
+        // TODO: when we limit range to [-32767, 32767], we can get more performance with output change
+        //       But nquant is a little percent in rdoQuant, so I keep old dynamic range for compatible
+        qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
+    }
+
+    return numSig;
+}
+*/
+
+namespace X265_NS {
+// x265 private namespace
+
+/* Installs the low-pass DCT approximations for 8x8, 16x16 and 32x32 blocks.
+ * The incoming table is copied first because every wrapper forwards to the
+ * previously installed DCT of half its size through s_rootPrimitives.
+ * The quant, nquant and dequant primitives are left untouched. */
+void setupLowPassPrimitives(EncoderPrimitives& p)
+{
+    s_rootPrimitives = p; // snapshot must be taken before the overrides below
+
+    p.cu[BLOCK_32x32].dct = lowPassDct32_c;
+    p.cu[BLOCK_16x16].dct = lowPassDct16_c;
+    p.cu[BLOCK_8x8].dct   = lowPassDct8_c;
+}
+}
diff --git a/source/common/param.cpp b/source/common/param.cpp
index b12004e3bc..b4afdf3de2 100644
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -287,6 +287,9 @@ void x265_param_default(x265_param* param)
     param->bUseAnalysisFile = 1;
     param->csvfpt = NULL;
     param->forceFlush = 0;
+
+    /* DCT Approximations */
+	param->bLowPassDct = 0;
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -926,7 +929,8 @@ int x265_param_parse(x265_param* p, const char* name, const char* value)
     OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2;
     OPT("min-luma") p->minLuma = (uint16_t)atoi(value);
     OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
-    OPT("uhd-bd") p->uhdBluray = atobool(value);
+	OPT("lowpass-dct") p->bLowPassDct = atobool(value);
+	OPT("uhd-bd") p->uhdBluray = atobool(value);
     else
         bExtraParams = true;
 
diff --git a/source/common/primitives.cpp b/source/common/primitives.cpp
index 211dc2f487..d71ce7b74f 100644
--- a/source/common/primitives.cpp
+++ b/source/common/primitives.cpp
@@ -58,6 +58,7 @@ void setupIntraPrimitives_c(EncoderPrimitives &p);
 void setupLoopFilterPrimitives_c(EncoderPrimitives &p);
 void setupSaoPrimitives_c(EncoderPrimitives &p);
 void setupSeaIntegralPrimitives_c(EncoderPrimitives &p);
+void setupLowPassPrimitives(EncoderPrimitives& p);
 
 void setupCPrimitives(EncoderPrimitives &p)
 {
@@ -255,6 +256,9 @@ void x265_setup_primitives(x265_param *param)
         }
 #endif
 
+        if (param->bLowPassDct)
+            setupLowPassPrimitives(primitives); 
+
         setupAliasPrimitives(primitives);
     }
 
diff --git a/source/x265.h b/source/x265.h
index 4c94b07a8c..637daa9b1e 100644
--- a/source/x265.h
+++ b/source/x265.h
@@ -1485,6 +1485,11 @@ typedef struct x265_param
 
     /* Enable skipping split RD analysis when sum of split CU rdCost larger than none split CU rdCost for Intra CU */
     int       bEnableSplitRdSkip;
+
+    /* Use the low-pass truncated DCT approximation.
+     * This DCT approximation is less computationally intensive and gives results close to
+     * the standard DCT for QP >= 23 */
+    int       bLowPassDct;
 } x265_param;
 
 /* x265_param_alloc:
diff --git a/source/x265cli.h b/source/x265cli.h
index 13b28b4da4..91a363dfcc 100644
--- a/source/x265cli.h
+++ b/source/x265cli.h
@@ -282,6 +282,7 @@ static const struct option long_options[] =
     { "force-flush",    required_argument, NULL, 0 },
     { "splitrd-skip",         no_argument, NULL, 0 },
     { "no-splitrd-skip",      no_argument, NULL, 0 },
+    { "lowpass-dct",          no_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -541,6 +542,7 @@ static void showHelp(x265_param *param)
     H1("-r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name\n");
     H1("   --recon-depth <integer>       Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
     H1("   --recon-y4m-exec <string>     pipe reconstructed frames to Y4M viewer, ex:\"ffplay -i pipe:0 -autoexit\"\n");
+    H0("   --lowpass-dct                 Use low-pass subband dct approximation. Default %s\n", OPT(param->bLowPassDct));
     H1("\nExecutable return codes:\n");
     H1("    0 - encode successful\n");
     H1("    1 - unable to parse command line\n");

From 9358c368ab46926df569747ba2f2c2f03fab19cf Mon Sep 17 00:00:00 2001
From: Montes Claros <mont3z.claro5@gmail.com>
Date: Thu, 12 Oct 2017 21:22:15 -0700
Subject: [PATCH 2/2] Fix readme for lowpass dct

---
 readme.rst | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/readme.rst b/readme.rst
index 85f2e55aed..cc54adc084 100644
--- a/readme.rst
+++ b/readme.rst
@@ -13,9 +13,21 @@ downloading and building the source.
 x265 is free to use under the `GNU GPL <http://www.gnu.org/licenses/gpl-2.0.html>`_ 
 and is also available under a commercial `license <http://x265.org>`_
 
-## Modifications ##
+=================
+Modifications 
+=================
+
 Here the x265 code was modified to support a lowpass subband approximation for the DCT.
 During performance tests this approximation had a gain of around 10% in performance. 
 Thus allowing encoding time to be reduced at the same rate.
 
 It also produced very small loss for 23 <= Qp =< 25 and minimal loss for Qp > 25, compared to the standard DCT.
+
+=================
+Usage 
+=================
+
+::
+
+    ./x265 input_file -o output_file --qp 25 --lowpass-dct
+