-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #128 from miguelraz/dotprodexample
add dot_product example
- Loading branch information
Showing
2 changed files
with
182 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
### `stdsimd` examples | ||
|
||
This crate is a port of example uses of `stdsimd`, mostly taken from the `packed_simd` crate. | ||
|
||
The examples contain, as in the case of `dot_product.rs`, multiple ways of solving the problem, in order to show idiomatic uses of SIMD and iteration of performance designs. | ||
|
||
Run the tests with the command | ||
|
||
``` | ||
cargo run --example dot_product | ||
``` | ||
|
||
and verify the code for `dot_product.rs` on your machine. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
// Code taken from the `packed_simd` crate | ||
// Run this code with `cargo test --example dot_product` | ||
//use std::iter::zip; | ||
|
||
#![feature(array_chunks)] | ||
#![feature(slice_as_chunks)] | ||
// Add these imports to use the stdsimd library | ||
#![feature(portable_simd)] | ||
use core_simd::simd::*; | ||
|
||
// This is your barebones dot product implementation: | ||
// Take 2 vectors, multiply them element wise and *then* | ||
// go along the resulting array and add up the result. | ||
// In the next example we will see if there | ||
// is any difference to adding and multiplying in tandem. | ||
pub fn dot_prod_scalar_0(a: &[f32], b: &[f32]) -> f32 { | ||
assert_eq!(a.len(), b.len()); | ||
|
||
a.iter().zip(b.iter()).map(|(a, b)| a * b).sum() | ||
} | ||
|
||
// When dealing with SIMD, it is very important to think about the amount | ||
// of data movement and when it happens. We're going over simple computation examples here, and yet | ||
// it is not trivial to understand what may or may not contribute to performance | ||
// changes. Eventually, you will need tools to inspect the generated assembly and confirm your | ||
// hypothesis and benchmarks - we will mention them later on. | ||
// With the use of `fold`, we're doing a multiplication, | ||
// and then adding it to the sum, one element from both vectors at a time. | ||
pub fn dot_prod_scalar_1(a: &[f32], b: &[f32]) -> f32 { | ||
assert_eq!(a.len(), b.len()); | ||
a.iter() | ||
.zip(b.iter()) | ||
.fold(0.0, |a, zipped| a + zipped.0 * zipped.1) | ||
} | ||
|
||
// We now move on to the SIMD implementations: notice the following constructs: | ||
// `array_chunks::<4>`: mapping this over the vector will let use construct SIMD vectors | ||
// `f32x4::from_array`: construct the SIMD vector from a slice | ||
// `(a * b).reduce_sum()`: Multiply both f32x4 vectors together, and then reduce them. | ||
// This approach essentially uses SIMD to produce a vector of length N/4 of all the products, | ||
// and then add those with `sum()`. This is suboptimal. | ||
// TODO: ASCII diagrams | ||
pub fn dot_prod_simd_0(a: &[f32], b: &[f32]) -> f32 { | ||
assert_eq!(a.len(), b.len()); | ||
// TODO handle remainder when a.len() % 4 != 0 | ||
a.array_chunks::<4>() | ||
.map(|&a| f32x4::from_array(a)) | ||
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b))) | ||
.map(|(a, b)| (a * b).reduce_sum()) | ||
.sum() | ||
} | ||
|
||
// There's some simple ways to improve the previous code: | ||
// 1. Make a `zero` `f32x4` SIMD vector that we will be accumulating into | ||
// So that there is only one `sum()` reduction when the last `f32x4` has been processed | ||
// 2. Exploit Fused Multiply Add so that the multiplication, addition and sinking into the reduciton | ||
// happen in the same step. | ||
// If the arrays are large, minimizing the data shuffling will lead to great perf. | ||
// If the arrays are small, handling the remainder elements when the length isn't a multiple of 4 | ||
// Can become a problem. | ||
pub fn dot_prod_simd_1(a: &[f32], b: &[f32]) -> f32 { | ||
assert_eq!(a.len(), b.len()); | ||
// TODO handle remainder when a.len() % 4 != 0 | ||
a.array_chunks::<4>() | ||
.map(|&a| f32x4::from_array(a)) | ||
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b))) | ||
.fold(f32x4::splat(0.0), |acc, zipped| acc + zipped.0 * zipped.1) | ||
.reduce_sum() | ||
} | ||
|
||
// A lot of knowledgeable use of SIMD comes from knowing specific instructions that are | ||
// available - let's try to use the `mul_add` instruction, which is the fused-multiply-add we were looking for. | ||
use std_float::StdFloat; | ||
pub fn dot_prod_simd_2(a: &[f32], b: &[f32]) -> f32 { | ||
assert_eq!(a.len(), b.len()); | ||
// TODO handle remainder when a.len() % 4 != 0 | ||
let mut res = f32x4::splat(0.0); | ||
a.array_chunks::<4>() | ||
.map(|&a| f32x4::from_array(a)) | ||
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b))) | ||
.for_each(|(a, b)| { | ||
res = a.mul_add(b, res); | ||
}); | ||
res.reduce_sum() | ||
} | ||
|
||
// Finally, we will write the same operation but handling the loop remainder. | ||
const LANES: usize = 4; | ||
pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 { | ||
assert_eq!(a.len(), b.len()); | ||
|
||
let (a_extra, a_chunks) = a.as_rchunks(); | ||
let (b_extra, b_chunks) = b.as_rchunks(); | ||
|
||
// These are always true, but for emphasis: | ||
assert_eq!(a_chunks.len(), b_chunks.len()); | ||
assert_eq!(a_extra.len(), b_extra.len()); | ||
|
||
let mut sums = [0.0; LANES]; | ||
for ((x, y), d) in std::iter::zip(a_extra, b_extra).zip(&mut sums) { | ||
*d = x * y; | ||
} | ||
|
||
let mut sums = f32x4::from_array(sums); | ||
std::iter::zip(a_chunks, b_chunks).for_each(|(x, y)| { | ||
sums += f32x4::from_array(*x) * f32x4::from_array(*y); | ||
}); | ||
|
||
sums.reduce_sum() | ||
} | ||
|
||
// Finally, we present an iterator version for handling remainders in a scalar fashion at the end of the loop. | ||
// Unfortunately, this is allocating 1 `XMM` register on the order of `~len(a)` - we'll see how we can get around it in the | ||
// next example. | ||
pub fn dot_prod_simd_4(a: &[f32], b: &[f32]) -> f32 { | ||
let mut sum = a | ||
.array_chunks::<4>() | ||
.map(|&a| f32x4::from_array(a)) | ||
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b))) | ||
.map(|(a, b)| a * b) | ||
.fold(f32x4::splat(0.0), std::ops::Add::add) | ||
.reduce_sum(); | ||
let remain = a.len() - (a.len() % 4); | ||
sum += a[remain..] | ||
.iter() | ||
.zip(&b[remain..]) | ||
.map(|(a, b)| a * b) | ||
.sum::<f32>(); | ||
sum | ||
} | ||
|
||
// This version allocates a single `XMM` register for accumulation, and the folds don't allocate on top of that. | ||
// Notice the the use of `mul_add`, which can do a multiply and an add operation ber iteration. | ||
pub fn dot_prod_simd_5(a: &[f32], b: &[f32]) -> f32 { | ||
a.array_chunks::<4>() | ||
.map(|&a| f32x4::from_array(a)) | ||
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b))) | ||
.fold(f32x4::splat(0.), |acc, (a, b)| a.mul_add(b, acc)) | ||
.reduce_sum() | ||
} | ||
|
||
fn main() { | ||
// Empty main to make cargo happy | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
#[test] | ||
fn smoke_test() { | ||
use super::*; | ||
let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; | ||
let b: Vec<f32> = vec![-8.0, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0]; | ||
let x: Vec<f32> = [0.5; 1003].to_vec(); | ||
let y: Vec<f32> = [2.0; 1003].to_vec(); | ||
|
||
// Basic check | ||
assert_eq!(0.0, dot_prod_scalar_0(&a, &b)); | ||
assert_eq!(0.0, dot_prod_scalar_1(&a, &b)); | ||
assert_eq!(0.0, dot_prod_simd_0(&a, &b)); | ||
assert_eq!(0.0, dot_prod_simd_1(&a, &b)); | ||
assert_eq!(0.0, dot_prod_simd_2(&a, &b)); | ||
assert_eq!(0.0, dot_prod_simd_3(&a, &b)); | ||
assert_eq!(0.0, dot_prod_simd_4(&a, &b)); | ||
assert_eq!(0.0, dot_prod_simd_5(&a, &b)); | ||
|
||
// We can handle vectors that are non-multiples of 4 | ||
assert_eq!(1003.0, dot_prod_simd_3(&x, &y)); | ||
} | ||
} |