Add better Clenshaw #29

ianhbell · 2021-01-12T03:00:30Z

See chebfun's implementation: https://github.com/chebfun/chebfun/blob/master/%40chebtech/clenshaw.m

ianhbell · 2021-01-12T14:03:30Z

This is not so much faster:

template<int Norder, typename VecType>
double y_Clenshaw_x2_paired(const double x, const VecType& c, const double xmin, const double xmax) {
    auto xscaled2 = (2 * x - (xmax + xmin)) / (xmax - xmin) * 2.0; // x*2
    double u_kp1 = 0, u_kp2 = 0;
    for (int k = Norder; k >= 2; k -= 2) {
        // Do the recurrent calculation
        u_kp2 = c[k] + xscaled2 * u_kp1 - u_kp2;
        u_kp1 = c[k-1] + xscaled2 * u_kp2 - u_kp1;
    }
    if (Norder % 2) {
        auto tmp = u_kp1;
        u_kp1 = c[1] + xscaled2 * u_kp1 - u_kp2;
        u_kp2 = tmp;
    }
    return c[0] + 0.5*xscaled2*u_kp1 - u_kp2;
}

jowr · 2021-01-13T08:02:39Z

I have used this code and I recall getting a substantial increase in speed for fixed size polynomials:

template<typename T>
T T0(const T& x) {
	return 1.0;
};

template<> inline
Eigen::ArrayXd T0(const Eigen::ArrayXd& x) {
Eigen::ArrayXd res;
res.resizeLike(x);
res.setConstant(1.0);
return res;
};

template<typename T>
T T1(const T& x) {
	return x;
};

template<typename T>
T T2(const T& x) {
	return (2.0 * x*x) - 1.0;
};

template<typename T>
T Tn(std::size_t n, const T& x) {
	if (n == 0) return T0(x);
	else if (n == 1) return T1(x);
	else if (n == 2) return T2(x);

	//Potentially faster than the recursive version
	//return (2.0 * x * Tn(n - 1, x)) - Tn(n - 2, x);

	T tnm1 = T2(x);
	T tnm2 = T1(x);
	T tn = tnm1;

	for (std::size_t l = 3; l < n + 1; l++) {
		tn = (2.0 * x * tnm1) - tnm2;
		tnm2 = tnm1;
		tnm1 = tn;
	}

	return tn;
};

template<typename T>
void Tn_series(int n, const T& x, Eigen::Array<T, Eigen::Dynamic, 1>& res) {
	res.resize(0);
	if (n < 0) return;
	res.resize(n + 1);

	res(0, 0) = T0(x);
	if (n == 0) return;

	res(1, 0) = T1(x);
	if (n == 1) return;

	res(2, 0) = T2(x);
	if (n == 2) return;

	//Potentially faster than the recursive version
	//return (2.0 * x * Tn(n - 1, x)) - Tn(n - 2, x);

	T tnm1 = res(2, 0);
	T tnm2 = res(1, 0);
	T tn = tnm1;

	for (int i = 3; i < n + 1; i++) {
		tn = (2.0 * x * tnm1) - tnm2;
		tnm2 = tnm1;
		tnm1 = tn;
		res(i, 0) = tn;
	}

	return;
};

template<typename T>
void Tn_series(int n, const Eigen::Array<T, Eigen::Dynamic, 1>& x, Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>& res) {
	res.resize(0, 0);
	if (n < 0) return;
	try {
		res.resize(x.rows(), n + 1);
	} catch (...) {
		res.resize(0, 0);
		return;
	}

	res.col(0) = T0(x);
	if (n == 0) return;

	res.col(1) = T1(x);
	if (n == 1) return;

	res.col(2) = T2(x);
	if (n == 2) return;

	//Potentially faster than the recursive version
	//res.col(i) = (2.0 * x * Tn(n - 1, x)) - Tn(n - 2, x);

	Eigen::Array<T, Eigen::Dynamic, 1> tnm1 = res.col(2);
	Eigen::Array<T, Eigen::Dynamic, 1> tnm2 = res.col(1);
	Eigen::Array<T, Eigen::Dynamic, 1> tn = tnm1;

	for (int i = 3; i < n + 1; i++) {
		tn = (2.0 * x * tnm1) - tnm2;
		tnm2 = tnm1;
		tnm1 = tn;
		res.col(i) = tn;
	}
	return;
};

The actual calculation was embedded in a class, but you get the idea from this:

virtual const ErrCodes _evaluate_matrix(const double& x_, double& y) const {
    if (_coeffs.rows() <  1) { y = NAN; return error; }
    if (_coeffs.cols() != 1) { y = NAN; return error; }
    //if (x_ < -1) { y = NAN; return error; }
    //if (x_ >  1) { y = NAN; return error; }

    const double x = (x_ - _x_start - _x_halfwidth) / _x_halfwidth;
    const int x_degree = _coeffs.rows() - 1;

    ObservationType res;
    Tn_series(x_degree, x, res);
    y = (res*_coeffs).sum();
    return noError;
}

const ErrCodes _evaluate_matrix(const ObservationType& x_, ObservationType& y) const {
    if (_coeffs.rows() <  1) { y.setConstant(NAN); return error; }
    if (_coeffs.cols() != 1) { y.setConstant(NAN); return error; }
    //if ((x < -1).any()) { y.setConstant(NAN); return error; }
    //if ((x > 1).any()) { y.setConstant(NAN); return error; }

    try {
        y.resizeLike(x_);
        y.setConstant(NAN);
    } catch (...) {
        y.resize(0, 1);
        return error;
    }

    const ObservationType x = (x_ - _x_start - _x_halfwidth) / _x_halfwidth;
    const int x_degree = _coeffs.rows() - 1;

    CoefficientType x_res;
    Tn_series(x_degree, x, x_res);

    //y = x_res.matrix() * _coeffs.matrix();
    for (int i = 0; i < x_res.rows(); i++) {
        y(i) = (x_res.row(i) * _coeffs.transpose()).sum();
    }
    return noError;
}

ianhbell · 2021-01-13T18:27:20Z

I think the point here is that the bonus you get is a consequence of the fact that you are leveraging parallelism at the level of matrix x vector products, a place where Eigen has been extensively optimized. If you were to do a single input, I expect you would see a penalty due to the fact that you have the additional array overhead.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add better Clenshaw #29

Add better Clenshaw #29

ianhbell commented Jan 12, 2021

ianhbell commented Jan 12, 2021

jowr commented Jan 13, 2021

ianhbell commented Jan 13, 2021

Add better Clenshaw #29

Add better Clenshaw #29

Comments

ianhbell commented Jan 12, 2021

ianhbell commented Jan 12, 2021

jowr commented Jan 13, 2021

ianhbell commented Jan 13, 2021