math/Matrix: Add NEON implementation of 3x4 multiplication

This commit is contained in:
Léo Lam 2022-03-23 23:59:52 +01:00
parent 1a419d0497
commit 0ae95f04f9
No known key found for this signature in database
GPG Key ID: 0DF30F9081000741
1 changed files with 37 additions and 1 deletions

View File

@ -5,7 +5,6 @@
#endif // cafe
#ifdef __aarch64__
// For Matrix34CalcCommon<f32>::copy (access FP/SIMD registers: Q0, ...)
#include <arm_neon.h>
#endif
@ -995,6 +994,43 @@ void Matrix34CalcCommon<T>::multiply(Base& o, const Base& a, const Base& b)
o.m[2][3] = a31 * b14 + a32 * b24 + a33 * b34 + a34;
}
#ifdef __aarch64__
template <>
inline void Matrix34CalcCommon<f32>::multiply(Base& o, const Base& a, const Base& b)
{
auto a0 = vld1q_f32(a.m[0]);
auto a1 = vld1q_f32(a.m[1]);
auto a2 = vld1q_f32(a.m[2]);
auto b0 = vld1q_f32(b.m[0]);
auto b1 = vld1q_f32(b.m[1]);
auto b2 = vld1q_f32(b.m[2]);
float32x4_t c0, c1, c2;
c0 = vmulq_laneq_f32(b0, a0, 0);
c0 = vfmaq_laneq_f32(c0, b1, a0, 1);
c0 = vfmaq_laneq_f32(c0, b2, a0, 2);
// XXX: why do something so convoluted when copying lane 3 from A to C would suffice?
// `vcopyq_laneq_f32(vmovq_n_f32(0), 3, a0, 3)` is equivalent and generates better code.
c0 += vcopyq_laneq_f32(vmovq_n_f32(0), 3, vmovq_n_f32(a0[3]), 1);
c1 = vmulq_laneq_f32(b0, a1, 0);
c1 = vfmaq_laneq_f32(c1, b1, a1, 1);
c1 = vfmaq_laneq_f32(c1, b2, a1, 2);
c1 += vcopyq_laneq_f32(vmovq_n_f32(0), 3, vmovq_n_f32(a1[3]), 1);
c2 = vmulq_laneq_f32(b0, a2, 0);
c2 = vfmaq_laneq_f32(c2, b1, a2, 1);
c2 = vfmaq_laneq_f32(c2, b2, a2, 2);
c2 += vcopyq_laneq_f32(vmovq_n_f32(0), 3, vmovq_n_f32(a2[3]), 1);
vst1q_f32(o.m[0], c0);
vst1q_f32(o.m[1], c1);
vst1q_f32(o.m[2], c2);
}
#endif
#ifdef cafe
template <>