diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs index f6f5903684..10cbee5e6f 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs @@ -10,12 +10,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components { internal partial struct Block8x8F { - /// - /// Transpose the block into the destination block. + /// + /// Fallback method to transpose a block into the destination block on non AVX supported CPUs. /// /// The destination block [MethodImpl(InliningOptions.ShortMethod)] - public void TransposeInto(ref Block8x8F d) + public void TransposeIntoFallback(ref Block8x8F d) { d.V0L.X = V0L.X; d.V1L.X = V0L.Y; diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt index 6ee0540213..f47d9106ee 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt @@ -23,12 +23,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components { internal partial struct Block8x8F { - /// - /// Transpose the block into the destination block. + /// + /// Fallback method to transpose a block into the destination block on non AVX supported CPUs. /// /// The destination block [MethodImpl(InliningOptions.ShortMethod)] - public void TransposeInto(ref Block8x8F d) + public void TransposeIntoFallback(ref Block8x8F d) { <# PushIndent(" "); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index b7835d6706..547e116230 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -6,6 +6,10 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using System.Text; // ReSharper disable InconsistentNaming @@ -596,5 +600,98 @@ private static void GuardBlockIndex(int idx) DebugGuard.MustBeLessThan(idx, Size, nameof(idx)); DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx)); } + + /// + /// Transpose the block into the destination block. + /// + /// The destination block + [MethodImpl(InliningOptions.ShortMethod)] + public void TransposeInto(ref Block8x8F d) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported) + { + this.TransposeIntoAvx(ref d); + } + else +#endif + { + this.TransposeIntoFallback(ref d); + } + } + +#if SUPPORTS_RUNTIME_INTRINSICS + /// + /// AVX-only variant for executing . + /// + /// + [MethodImpl(InliningOptions.ShortMethod)] + public void TransposeIntoAvx(ref Block8x8F d) + { + Vector256 r0 = Avx.InsertVector128( + Unsafe.As>(ref this.V0L).ToVector256(), + Unsafe.As>(ref this.V4L), + 1); + + Vector256 r1 = Avx.InsertVector128( + Unsafe.As>(ref this.V1L).ToVector256(), + Unsafe.As>(ref this.V5L), + 1); + + Vector256 r2 = Avx.InsertVector128( + Unsafe.As>(ref this.V2L).ToVector256(), + Unsafe.As>(ref this.V6L), + 1); + + Vector256 r3 = Avx.InsertVector128( + Unsafe.As>(ref this.V3L).ToVector256(), + Unsafe.As>(ref this.V7L), + 1); + + Vector256 r4 = Avx.InsertVector128( + Unsafe.As>(ref this.V0R).ToVector256(), + Unsafe.As>(ref this.V4R), + 1); + + Vector256 r5 = Avx.InsertVector128( + Unsafe.As>(ref this.V1R).ToVector256(), + Unsafe.As>(ref this.V5R), + 1); + + Vector256 r6 = Avx.InsertVector128( + Unsafe.As>(ref this.V2R).ToVector256(), + Unsafe.As>(ref this.V6R), + 1); + + Vector256 r7 = Avx.InsertVector128( + Unsafe.As>(ref this.V3R).ToVector256(), + Unsafe.As>(ref this.V7R), + 1); + + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackLow(r2, r3); + Vector256 v = Avx.Shuffle(t0, t2, 0x4E); + Unsafe.As>(ref d.V0L) = Avx.Blend(t0, v, 0xCC); + Unsafe.As>(ref d.V1L) = Avx.Blend(t2, v, 0x33); + + Vector256 t4 = Avx.UnpackLow(r4, r5); + Vector256 t6 = Avx.UnpackLow(r6, r7); + v = Avx.Shuffle(t4, t6, 0x4E); + Unsafe.As>(ref d.V4L) = Avx.Blend(t4, v, 0xCC); + Unsafe.As>(ref d.V5L) = Avx.Blend(t6, v, 0x33); + + Vector256 t1 = Avx.UnpackHigh(r0, r1); + Vector256 t3 = Avx.UnpackHigh(r2, r3); + v = Avx.Shuffle(t1, t3, 0x4E); + Unsafe.As>(ref d.V2L) = Avx.Blend(t1, v, 0xCC); + Unsafe.As>(ref d.V3L) = Avx.Blend(t3, v, 0x33); + + Vector256 t5 = Avx.UnpackHigh(r4, r5); + Vector256 t7 = Avx.UnpackHigh(r6, r7); + v = Avx.Shuffle(t5, t7, 0x4E); + Unsafe.As>(ref d.V6L) = Avx.Blend(t5, v, 0xCC); + Unsafe.As>(ref d.V7L) = Avx.Blend(t7, v, 0x33); + } +#endif } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index ee06f2bdeb..d0b373609b 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -1,4 +1,4 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. using System.Numerics; @@ -50,8 +50,6 @@ internal static class FastFloatingPointDCT /// Temporary block provided by the caller public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp) { - // TODO: Transpose is a bottleneck now. We need full AVX support to optimize it: - // https://github.com/dotnet/corefx/issues/22940 src.TransposeInto(ref temp); IDCT8x4_LeftPart(ref temp, ref dest); @@ -340,4 +338,4 @@ public static void TransformFDCT( dest.MultiplyInplace(C_0_125); } } -} \ No newline at end of file +} diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs new file mode 100644 index 0000000000..ae1b23df92 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs @@ -0,0 +1,45 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Formats.Jpeg.Components; + +namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations +{ + public class Block8x8F_Transpose + { + private static readonly Block8x8F Source = Create8x8FloatData(); + + [Benchmark(Baseline=true)] + public void TransposeIntoVector4() + { + var dest = default(Block8x8F); + Source.TransposeIntoFallback(ref dest); + } + +#if SUPPORTS_RUNTIME_INTRINSICS + [Benchmark] + public void TransposeIntoAvx() + { + var dest = default(Block8x8F); + Source.TransposeIntoAvx(ref dest); + } +#endif + + private static Block8x8F Create8x8FloatData() + { + var result = new float[64]; + for (int i = 0; i < 8; i++) + { + for (int j = 0; j < 8; j++) + { + result[(i * 8) + j] = (i * 10) + j; + } + } + + var source = default(Block8x8F); + source.LoadFrom(result); + return source; + } + } +} diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index 722521f98d..73a68063c0 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -163,7 +163,7 @@ public void Load_Store_IntArray() } [Fact] - public void TransposeInto() + public void TransposeIntoFallback() { float[] expected = Create8x8FloatData(); ReferenceImplementations.Transpose8x8(expected); @@ -172,7 +172,7 @@ public void TransposeInto() source.LoadFrom(Create8x8FloatData()); var dest = default(Block8x8F); - source.TransposeInto(ref dest); + source.TransposeIntoFallback(ref dest); float[] actual = new float[64]; dest.ScaledCopyTo(actual); @@ -180,6 +180,26 @@ public void TransposeInto() Assert.Equal(expected, actual); } +#if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void TransposeIntoAvx() + { + float[] expected = Create8x8FloatData(); + ReferenceImplementations.Transpose8x8(expected); + + var source = default(Block8x8F); + source.LoadFrom(Create8x8FloatData()); + + var dest = default(Block8x8F); + source.TransposeIntoAvx(ref dest); + + float[] actual = new float[64]; + dest.ScaledCopyTo(actual); + + Assert.Equal(expected, actual); + } +#endif + private class BufferHolder { public Block8x8F Buffer;