Support for row major hsMatrix

Reorders the matrix multiplications in the shaders so that they consume row-major hsMatrix44 data directly, which eliminates the swap (transpose) step in hsMatrix2SIMD. This change has not yet filtered down to the dynamic effects, which get their uniforms through a giant buffer provided by the engine itself.
H-uru · Feb 27, 2023 · daef67c
1 parent b3f999c
commit daef67c
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 75 deletions.
diff --git a/Sources/Plasma/FeatureLib/pfMetalPipeline/ShaderSrc/FixedPipelineShaders.metal b/Sources/Plasma/FeatureLib/pfMetalPipeline/ShaderSrc/FixedPipelineShaders.metal
@@ -265,9 +265,9 @@ vertex ColorInOut pipelineVertexShader(Vertex in [[stage_in]],
     half3 LAmbient = half3(0.0, 0.0, 0.0);
     half3 LDiffuse = half3(0.0, 0.0, 0.0);
 
-    const float3 Ndirection = normalize(uniforms.localToWorldMatrix * float4(in.normal, 0.0)).xyz;
+    const float3 Ndirection = normalize(float4(in.normal, 0.0) * uniforms.localToWorldMatrix).xyz;
 
-    float4 position = (uniforms.localToWorldMatrix * float4(in.position, 1.0));
+    float4 position = (float4(in.position, 1.0) * uniforms.localToWorldMatrix);
     if(temp_hasOnlyWeight1) {
         const float4 position2 = blendMatrix1 * float4(in.position, 1.0);
         position = (in.weight1 * position) + ((1.0f - in.weight1) * position2);
@@ -317,7 +317,7 @@ vertex ColorInOut pipelineVertexShader(Vertex in [[stage_in]],
                                  abs(uniforms.invVtxAlpha - MDiffuse.a));
 
     out.vtxColor = half4(material.rgb, abs(uniforms.invVtxAlpha - MDiffuse.a));
-    const float4 vCamPosition = uniforms.worldToCameraMatrix * position;
+    const float4 vCamPosition = position * uniforms.worldToCameraMatrix;
     //out.vCamNormal = uniforms.worldToCameraMatrix * (uniforms.localToWorldMatrix * float4(in.position, 0.0));
 
     //Fog
@@ -333,13 +333,13 @@ vertex ColorInOut pipelineVertexShader(Vertex in [[stage_in]],
     }
     out.fogColor.rgb = uniforms.fogColor;
 
-    const float4 normal = uniforms.worldToCameraMatrix * (uniforms.localToWorldMatrix * float4(in.normal, 0.0));
+    const float4 normal = (uniforms.localToWorldMatrix * float4(in.normal, 0.0)) * uniforms.worldToCameraMatrix;
 
     for(size_t layer=0; layer<num_layers; layer++) {
         (&out.texCoord1)[layer] = uniforms.sampleLocation(layer, &in.texCoord1, normal, vCamPosition);
     }
 
-    out.position = uniforms.projectionMatrix * vCamPosition;
+    out.position = vCamPosition * uniforms.projectionMatrix;
 
     return out;
 }
@@ -354,35 +354,35 @@ float3 VertexUniforms::sampleLocation(size_t index, thread float3 *texCoords, co
     //Note: If we want to require newer versions of Metal/newer hardware we could pass function pointers instead of doing these ifs.
     if (flags & (kMiscUseReflectionXform | kMiscUseRefractionXform)) {
         matrix = cameraToWorldMatrix;
-        matrix[3][0] = matrix[3][1] = matrix[3][2] = 0;
+        matrix[0][3] = matrix[1][3] = matrix[2][3] = 0;
 
         // This is just a rotation about X of Pi/2 (y = z, z = -y),
         // followed by flipping Z to reflect back towards us (z = -z).
 
         // swap mat[1][0] and mat[2][0]
         float temp;
-        temp = matrix[0][1];
-        matrix[0][1] = matrix[0][2];
-        matrix[0][2] = temp;
+        temp = matrix[1][0];
+        matrix[1][0] = matrix[2][0];
+        matrix[2][0] = temp;
 
         // swap mat[1][1] and mat[2][1]
         temp = matrix[1][1];
-        matrix[1][1] = matrix[1][2];
-        matrix[1][2] = temp;
+        matrix[1][1] = matrix[2][1];
+        matrix[2][1] = temp;
 
         // swap mat[1][2] and mat[2][2]
-        temp = matrix[2][1];
-        matrix[2][1] = matrix[2][2];
+        temp = matrix[1][2];
+        matrix[1][2] = matrix[2][2];
         matrix[2][2] = temp;
 
         if (flags & kMiscUseRefractionXform) {
             // Same as reflection, but then matrix = matrix * scaleMatNegateZ.
 
             // mat[0][2] = -mat[0][2];
-            matrix[2][0] = -matrix[2][0];
+            matrix[0][2] = -matrix[0][2];
 
             // mat[1][2] = -mat[1][2];
-            matrix[2][1] = -matrix[2][1];
+            matrix[1][2] = -matrix[1][2];
 
             // mat[2][2] = -mat[2][2];
             matrix[2][2] = -matrix[2][2];
@@ -398,10 +398,10 @@ float3 VertexUniforms::sampleLocation(size_t index, thread float3 *texCoords, co
         matrix_float4x4 scaleMatrix = matrix_float4x4(1.0);
 
         // hsVector3 camTrans(0.5f, 0.5f, 0.f);
-        scaleMatrix[3][0] = 0.5f;
-        scaleMatrix[3][1] = -0.5f;
+        scaleMatrix[0][3] = 0.5f;
+        scaleMatrix[1][3] = -0.5f;
 
-        matrix = scaleMatrix * translationMatrix;
+        matrix = translationMatrix * scaleMatrix;
 
         // The scale and trans move us from NDC to Screen space. We need to swap
         // the Z and W coordinates so that the texture projection will divide by W
@@ -410,50 +410,50 @@ float3 VertexUniforms::sampleLocation(size_t index, thread float3 *texCoords, co
 
         // swap mat[2][2] and mat[3][2]
         temp = matrix[2][2];
-        matrix[2][2] = matrix[2][3];
-        matrix[2][3] = temp;
+        matrix[2][2] = matrix[3][2];
+        matrix[3][2] = temp;
 
         // swap mat[2][3] and mat[3][3]
-        temp = matrix[3][2];
-        matrix[3][2] = matrix[3][3];
+        temp = matrix[2][3];
+        matrix[2][3] = matrix[3][3];
         matrix[3][3] = temp;
 
         // Multiply by the projection matrix
-        matrix = matrix * projectionMatrix;
+        matrix = projectionMatrix * matrix;
     } else if (flags & kMiscProjection) {
         matrix_float4x4 cam2World = cameraToWorldMatrix;
         if( !(UVWSrc & kUVWPosition) ) {
-            cam2World.columns[3][0] = 0;
-            cam2World.columns[3][1] = 0;
-            cam2World.columns[3][2] = 0;
+            cam2World.columns[0][3] = 0;
+            cam2World.columns[1][3] = 0;
+            cam2World.columns[2][3] = 0;
         }
 
-        matrix = matrix * cam2World;
+        matrix = cam2World * matrix;
     }
 
     float4 sampleCoord;
 
     switch (UVWSrc) {
     case kUVWNormal:
         {
-            sampleCoord = matrix * normal;
+            sampleCoord = normal * matrix;
         }
         break;
     case kUVWPosition:
         {
-            sampleCoord = matrix * camPosition;
+            sampleCoord = camPosition * matrix;
         }
         break;
     case kUVWReflect:
         {
-            sampleCoord = matrix * reflect(normalize(camPosition), normalize(normal));
+            sampleCoord = reflect(normalize(camPosition), normalize(normal)) * matrix;
         }
         break;
     default:
         {
             const int index = UVWSrc & 0x0F;
             if (index < num_uvs) {
-                sampleCoord = matrix * float4(texCoords[index], 1.0);
+                sampleCoord = float4(texCoords[index], 1.0) * matrix;
             } else {
                 //The DX engine will use a UV co-ord of 0,0 if the index is out of range
                 sampleCoord = float4(0.0);
@@ -649,13 +649,13 @@ vertex ShadowCasterInOut shadowVertexShader(Vertex in [[stage_in]],
 {
     ShadowCasterInOut out;
 
-    const float4 vCamPosition = uniforms.worldToCameraMatrix * (uniforms.localToWorldMatrix * float4(in.position, 1.0));
+    const float4 vCamPosition = (float4(in.position, 1.0) * uniforms.localToWorldMatrix) * uniforms.worldToCameraMatrix;
 
     const float4x4 matrix = uniforms.uvTransforms[0].transform;
 
-    out.texCoord1 = (matrix * vCamPosition).xyz;
+    out.texCoord1 = (vCamPosition * matrix).xyz;
 
-    out.position = uniforms.projectionMatrix * vCamPosition;
+    out.position =  vCamPosition * uniforms.projectionMatrix;
 
     return out;
 }

diff --git a/Sources/Plasma/FeatureLib/pfMetalPipeline/ShaderSrc/PlateShaders.metal b/Sources/Plasma/FeatureLib/pfMetalPipeline/ShaderSrc/PlateShaders.metal
@@ -77,9 +77,11 @@ vertex ColorInOut plateVertexShader(PlateVertex in [[stage_in]],
     ColorInOut out;
 
     float4 position = float4(in.position, 0.0, 1.0);
-    position = uniforms.projectionMatrix * position;
-    out.position =  (uniforms.localToWorldMatrix * position);
+    position =  position * uniforms.projectionMatrix;
+    out.position =  ( position * uniforms.localToWorldMatrix);
+    out.position.y *= -1.0f;
     out.texCoord = (float4(in.texCoord, 1.0) * uniforms.uvTransforms[0].transform).xyz;
+    out.texCoord.y = 1.0 - out.texCoord.y;
     out.normal = float4(0.0, 0.0, 1.0, 0.0);
 
     return out;

diff --git a/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalDevice.cpp b/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalDevice.cpp
@@ -62,39 +62,15 @@ You can contact Cyan Worlds, Inc. by email [email protected]
 
 #include "plMetalPipelineState.h"
 
-matrix_float4x4* hsMatrix2SIMD(const hsMatrix44& src, matrix_float4x4* dst, bool swapOrder)
+matrix_float4x4* hsMatrix2SIMD(const hsMatrix44& src, matrix_float4x4* dst)
 {
     if (src.fFlags & hsMatrix44::kIsIdent)
     {
         memcpy(dst, &matrix_identity_float4x4, sizeof(float) * 16);
     }
     else
     {
-        //SIMD is column major, hsMatrix44 is row major.
-        //We need to flip.
-        if(swapOrder) {
-            dst->columns[0][0] = src.fMap[0][0];
-            dst->columns[1][0] = src.fMap[0][1];
-            dst->columns[2][0] = src.fMap[0][2];
-            dst->columns[3][0] = src.fMap[0][3];
-
-            dst->columns[0][1] = src.fMap[1][0];
-            dst->columns[1][1] = src.fMap[1][1];
-            dst->columns[2][1] = src.fMap[1][2];
-            dst->columns[3][1] = src.fMap[1][3];
-
-            dst->columns[0][2] = src.fMap[2][0];
-            dst->columns[1][2] = src.fMap[2][1];
-            dst->columns[2][2] = src.fMap[2][2];
-            dst->columns[3][2] = src.fMap[2][3];
-
-            dst->columns[0][3] = src.fMap[3][0];
-            dst->columns[1][3] = src.fMap[3][1];
-            dst->columns[2][3] = src.fMap[3][2];
-            dst->columns[3][3] = src.fMap[3][3];
-        } else {
-            memcpy(dst, &src.fMap, sizeof(matrix_float4x4));
-        }
+        memcpy(dst, &src.fMap, sizeof(matrix_float4x4));
     }
 
     return dst;
@@ -957,13 +933,13 @@ void plMetalDevice::SetWorldToCameraMatrix(const hsMatrix44& src)
     hsMatrix2SIMD(inv, &fMatrixC2W);
 }
 
-void plMetalDevice::SetLocalToWorldMatrix(const hsMatrix44& src, bool swapOrder)
+void plMetalDevice::SetLocalToWorldMatrix(const hsMatrix44& src)
 {
     hsMatrix44 inv;
     src.GetInverse(&inv);
 
-    hsMatrix2SIMD(src, &fMatrixL2W, swapOrder);
-    hsMatrix2SIMD(inv, &fMatrixW2L, swapOrder);
+    hsMatrix2SIMD(src, &fMatrixL2W);
+    hsMatrix2SIMD(inv, &fMatrixW2L);
 }
 
 void plMetalDevice::CreateNewCommandBuffer(CA::MetalDrawable* drawable)

diff --git a/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalDevice.h b/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalDevice.h
@@ -67,7 +67,8 @@ class plCubicEnvironmap;
 class plLayerInterface;
 class plMetalPipelineState;
 
-matrix_float4x4* hsMatrix2SIMD(const hsMatrix44& src, matrix_float4x4* dst, bool swapOrder = true);
+//NOTE: Results of this will be row major
+matrix_float4x4* hsMatrix2SIMD(const hsMatrix44& src, matrix_float4x4* dst);
 
 class plMetalDevice
 {
@@ -149,7 +150,7 @@ class plMetalDevice
 
     void SetProjectionMatrix(const hsMatrix44& src);
     void SetWorldToCameraMatrix(const hsMatrix44& src);
-    void SetLocalToWorldMatrix(const hsMatrix44& src, bool swapOrder = true);
+    void SetLocalToWorldMatrix(const hsMatrix44& src);
 
     void PopulateTexture(plMetalDevice::TextureRef *tRef, plMipmap *img, uint slice);
     uint ConfigureAllowedLevels(plMetalDevice::TextureRef *tRef, plMipmap *mipmap);

diff --git a/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalPipeline.cpp b/Sources/Plasma/FeatureLib/pfMetalPipeline/plMetalPipeline.cpp
@@ -2554,15 +2554,10 @@ void plMetalPipeline::IDrawPlate(plPlate* plate)
     fDevice.CurrentRenderCommandEncoder()->setDepthStencilState(fDevice.fNoZReadOrWriteStencilState);
     fState.fCurrentDepthStencilState = fDevice.fNoZReadOrWriteStencilState;
 
-    //column major layout
     simd_float4x4 projMat = matrix_identity_float4x4;
-    //projMat.columns[2][3] = 1.0f;
-    //projMat.columns[3][1] = -0.5f;
-    projMat.columns[3][2] = 0.0f;
-    projMat.columns[1][1] = 1.0f;
 
     /// Set up the transform directly
-    fDevice.SetLocalToWorldMatrix(plate->GetTransform(), false);
+    fDevice.SetLocalToWorldMatrix(plate->GetTransform());
 
     IPushPiggyBacks(material);
 
@@ -4440,8 +4435,8 @@ void plMetalPipeline::IBlendVertBuffer(plSpan* span, hsMatrix44* matrixPalette,
             hsMatrix2SIMD(matrixPalette[indices & 0xFF], &simdMatrix);
             if (weights[j]) {
                 //Note: This bit is different than GL/DirectX. It's using acclerate so this is also accelerated on ARM through NEON or maybe even the Neural Engine.
-                destPt_buf += weights[j] * simd_mul(simdMatrix, *(simd_float4 *)pt_buf);
-                destNorm_buf += weights[j] * simd_mul(simdMatrix, *(simd_float4 *)vec_buf);
+                destPt_buf +=  simd_mul(*(simd_float4 *)pt_buf, simdMatrix) * weights[j];
+                destNorm_buf += simd_mul(*(simd_float4 *)vec_buf, simdMatrix) * weights[j];
             }
                 //ISkinVertexSSE41(matrixPalette[indices & 0xFF], weights[j], pt_buf, destPt_buf, vec_buf, destNorm_buf);
             indices >>= 8;