ksys/phys: Rewrite toMtx34 without using intrinsics

Performing the element assignments in this specific pattern is sufficient to get the desired (bad) codegen when combined with hkVector4f's operator() (which was added recently).
2022-01-13 12:51:03 +01:00 · 2022-01-13 12:51:03 +01:00 · cd75ca724a
parent 278b088bd1
commit cd75ca724a
1 changed files with 11 additions and 35 deletions
--- a/src/KingSystem/Physics/physConversions.h
+++ b/src/KingSystem/Physics/physConversions.h
@ -5,10 +5,6 @@
 #include <math/seadQuat.h>
 #include <math/seadVector.h>

-#ifdef __aarch64__
-#include <arm_neon.h>
-#endif
-
 namespace ksys::phys {

 inline void toVec3(sead::Vector3f* out, const hkVector4f& vec) {
@ -49,38 +45,18 @@ inline void toMtx34(sead::Matrix34f* out, const hkTransformf& transform) {
    const hkRotationf& rotate = transform.getRotation();
    const hkVector4f& translate = transform.getTranslation();

-    hkVector4f row0, row1, row2;
+    hkVector4f mtx[3];
+    for (int j = 0; j < 3; ++j) {
+        for (int i = 0; i < 3; ++i) {
+            mtx[i][j] = rotate(i, j);
+        }
+    }
+    for (int i = 0; i < 3; ++i)
+        mtx[i][3] = translate(i);

-#ifdef __aarch64__
-    // XXX: this leads to really poor codegen (compared to using getRows, which
-    // is optimised into Neon zip/transpose instructions). Is Nintendo to blame
-    // for this bad usage of Neon intrinsics, or did Havok mess up their Neon getRows?
-
-    row0.v = vld1q_lane_f32(&rotate(0, 0), row0.v, 0);
-    row1.v = vld1q_lane_f32(&rotate(1, 0), row1.v, 0);
-    row2.v = vld1q_lane_f32(&rotate(2, 0), row2.v, 0);
-
-    row0.v = vld1q_lane_f32(&rotate(0, 1), row0.v, 1);
-    row1.v = vld1q_lane_f32(&rotate(1, 1), row1.v, 1);
-    row2.v = vld1q_lane_f32(&rotate(2, 1), row2.v, 1);
-
-    row0.v = vld1q_lane_f32(&rotate(0, 2), row0.v, 2);
-    row1.v = vld1q_lane_f32(&rotate(1, 2), row1.v, 2);
-    row2.v = vld1q_lane_f32(&rotate(2, 2), row2.v, 2);
-
-    row0.v = vld1q_lane_f32(&translate(0), row0.v, 3);
-    row1.v = vld1q_lane_f32(&translate(1), row1.v, 3);
-    row2.v = vld1q_lane_f32(&translate(2), row2.v, 3);
-#else
-    rotate.getRows(row0, row1, row2);
-    row0[3] = translate[0];
-    row1[3] = translate[1];
-    row2[3] = translate[2];
-#endif
-
-    row0.store<4>(out->m[0]);
-    row1.store<4>(out->m[1]);
-    row2.store<4>(out->m[2]);
+    mtx[0].store<4>(out->m[0]);
+    mtx[1].store<4>(out->m[1]);
+    mtx[2].store<4>(out->m[2]);
 }

 }  // namespace ksys::phys