J3DSkinDeform work (#2896)

* Match four inline asm functions with manual regalloc

* J3DSkinDeform::changeFastSkinDL: Clean up var names, slightly improve match

* Syntax

* More var names
This commit is contained in:
LagoLunatic 2025-11-30 16:39:56 -05:00 committed by GitHub
parent 20e9957356
commit 434415337d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 136 additions and 199 deletions

View File

@ -111,177 +111,118 @@ inline void J3DPSMtx33CopyFrom34(register MtxP src, register Mtx3P dst) {
#endif
}
// regalloc issues
// Multiplies the vector `vec` by the matrix `mtx` with paired-single instructions and
// writes the three results to `dst`.
//
// Each matrix row is read with two pair loads (byte offsets 0/8, 16/24 and 32/40 of `mtx`).
// `vec` is read as one pair at offset 0 plus a single element at offset 8. For every row the
// code does ps_mul on the first pair, ps_madd with the second pair, and ps_sum0 to fold the
// two lanes into one value, which is stored as a single element at dst+0, dst+4 and dst+8.
//
// NOTE(review): the single-element load of vec+8 (W operand = 1) presumably leaves 1.0 in the
// second lane, so the fourth value of each row is added unscaled -- confirm against the
// paired-single load documentation.
// NOTE(review): this listing shows the instruction sequence twice: first using the named
// register variables declared below, then using hard-coded f0-f12. The commit message
// describes the change as "manual regalloc", so the two copies are presumably the before and
// after of that change, not code meant to run back to back.
//
// The asm body is only compiled when __MWERKS__ is defined; otherwise the function is empty.
inline void J3DPSMulMtxVec(register MtxP mtx, register Vec* vec, register Vec* dst) {
// Register variables referenced only by the first copy of the sequence.
register f32 fr12;
register f32 fr11;
register f32 fr10;
register f32 fr9;
register f32 fr8;
register f32 fr6;
register f32 fra6;
register f32 fr5;
register f32 fra5;
register f32 fra4;
register f32 fr4;
register f32 fr3;
register f32 fr2;
register f32 fra2;
register f32 fr01;
register f32 fr00;
#ifdef __MWERKS__
asm {
// Copy 1: operands are the named register variables.
psq_l fr00, 0(vec), 0, 0
psq_l fr2, 0(mtx), 0, 0
psq_l fr01, 8(vec), 1, 0
ps_mul fr4, fr2, fr00
psq_l fr3, 8(mtx), 0, 0
ps_madd fr5, fr3, fr01, fr4
psq_l fr8, 16(mtx), 0, 0
ps_sum0 fr6, fr5, fr6, fr5
psq_l fr9, 24(mtx), 0, 0
ps_mul fr10, fr8, fr00
psq_st fr6, 0(dst), 1, 0
ps_madd fr11, fr9, fr01, fr10
psq_l fra2, 32(mtx), 0, 0
ps_sum0 fr12, fr11, fr12, fr11
psq_l fr3, 40(mtx), 0, 0
ps_mul fra4, fra2, fr00
psq_st fr12, 4(dst), 1, 0
ps_madd fra5, fr3, fr01, fra4
ps_sum0 fra6, fra5, fra6, fra5
psq_st fra6, 8(dst), 1, 0
// Copy 2: the same instruction order with registers chosen by hand. f0/f1 hold the
// vector, f2/f3 and f8/f9 hold matrix row halves, f4-f6 and f10-f12 hold row temporaries.
psq_l f0, 0(vec), 0, 0
psq_l f2, 0(mtx), 0, 0
psq_l f1, 8(vec), 1, 0
ps_mul f4, f2, f0
psq_l f3, 8(mtx), 0, 0
ps_madd f5, f3, f1, f4
psq_l f8, 16(mtx), 0, 0
ps_sum0 f6, f5, f6, f5
psq_l f9, 24(mtx), 0, 0
ps_mul f10, f8, f0
psq_st f6, 0(dst), 1, 0
ps_madd f11, f9, f1, f10
psq_l f2, 32(mtx), 0, 0
ps_sum0 f12, f11, f12, f11
psq_l f3, 40(mtx), 0, 0
ps_mul f4, f2, f0
psq_st f12, 4(dst), 1, 0
ps_madd f5, f3, f1, f4
ps_sum0 f6, f5, f6, f5
psq_st f6, 8(dst), 1, 0
}
#endif
}
// regalloc issues
// S16Vec overload: multiplies the vector `vec` by the matrix `mtx` with paired-single
// instructions and writes the three results to `dst`.
//
// The matrix is read exactly as in the Vec overload (pair loads at byte offsets 0/8, 16/24
// and 32/40, last operand 0). The vector loads and the result stores use 7 as their last
// operand instead of 0, and their offsets step by 2 bytes per component (vec+0 pair, vec+4
// single; stores at dst+0, dst+2, dst+4).
//
// NOTE(review): the trailing 7 presumably selects quantization register GQR7, which callers
// are expected to have configured for 16-bit conversion beforehand -- confirm.
// NOTE(review): the single-element load of vec+4 (W operand = 1) presumably leaves 1.0 in the
// second lane -- confirm against the paired-single load documentation.
// NOTE(review): this listing shows the instruction sequence twice: first using the named
// register variables declared below, then using hard-coded f0-f12 ("manual regalloc" per the
// commit message). They are presumably the before and after of that change.
//
// The asm body is only compiled when __MWERKS__ is defined; otherwise the function is empty.
inline void J3DPSMulMtxVec(register MtxP mtx, register S16Vec* vec, register S16Vec* dst) {
// Register variables referenced only by the first copy of the sequence.
register f32 fr12;
register f32 fr11;
register f32 fr10;
register f32 fr9;
register f32 fr8;
register f32 fr6;
register f32 fra6;
register f32 fr5;
register f32 fra5;
register f32 fra4;
register f32 fr4;
register f32 fr3;
register f32 fr2;
register f32 fra2;
register f32 fr01;
register f32 fr00;
#ifdef __MWERKS__
asm {
// Copy 1: operands are the named register variables.
psq_l fr00, 0(vec), 0, 7
psq_l fr2, 0(mtx), 0, 0
psq_l fr01, 4(vec), 1, 7
ps_mul fr4, fr2, fr00
psq_l fr3, 8(mtx), 0, 0
ps_madd fr5, fr3, fr01, fr4
psq_l fr8, 16(mtx), 0, 0
ps_sum0 fr6, fr5, fr6, fr5
psq_l fr9, 24(mtx), 0, 0
ps_mul fr10, fr8, fr00
psq_st fr6, 0(dst), 1, 7
ps_madd fr11, fr9, fr01, fr10
psq_l fra2, 32(mtx), 0, 0
ps_sum0 fr12, fr11, fr12, fr11
psq_l fr3, 40(mtx), 0, 0
ps_mul fra4, fra2, fr00
psq_st fr12, 2(dst), 1, 7
ps_madd fra5, fr3, fr01, fra4
ps_sum0 fra6, fra5, fra6, fra5
psq_st fra6, 4(dst), 1, 7
// Copy 2: the same instruction order with registers chosen by hand.
psq_l f0, 0(vec), 0, 7
psq_l f2, 0(mtx), 0, 0
psq_l f1, 4(vec), 1, 7
ps_mul f4, f2, f0
psq_l f3, 8(mtx), 0, 0
ps_madd f5, f3, f1, f4
psq_l f8, 16(mtx), 0, 0
ps_sum0 f6, f5, f6, f5
psq_l f9, 24(mtx), 0, 0
ps_mul f10, f8, f0
psq_st f6, 0(dst), 1, 7
ps_madd f11, f9, f1, f10
psq_l f2, 32(mtx), 0, 0
ps_sum0 f12, f11, f12, f11
psq_l f3, 40(mtx), 0, 0
ps_mul f4, f2, f0
psq_st f12, 2(dst), 1, 7
ps_madd f5, f3, f1, f4
ps_sum0 f6, f5, f6, f5
psq_st f6, 4(dst), 1, 7
}
#endif
}
// regalloc issues
// Mtx3P overload: multiplies the f32 vector `vec` by the matrix `mtx` with paired-single
// instructions and writes the three results to `dst`.
//
// Rows are read at byte offsets 0, 12 and 24 of `mtx`: a pair load for the first two
// elements and a single-element load (W operand = 1) at +8, +20 and +32 for the third.
// `vec` is read as a pair at offset 0 and a single element at offset 8. Before any
// multiplication, a pair loaded from the address of PSMulUnit01 is added to the register
// holding the single vec element. Each row is then ps_mul, ps_madd, ps_sum0, and the result is
// stored as one element at dst+0, dst+4 and dst+8.
//
// NOTE(review): PSMulUnit01 is defined outside this view. The add presumably adjusts the
// second lane left by the W=1 load so that it does not contribute to the sum -- confirm its
// contents.
// NOTE(review): this listing shows the instruction sequence twice: first using the named
// variables declared below (`punit`, `unit`, fr*), then using hard-coded r6, f13 and f0-f12
// ("manual regalloc" per the commit message). They are presumably the before and after of that
// change.
//
// The asm body is only compiled when __MWERKS__ is defined; otherwise the function is empty.
inline void J3DPSMulMtxVec(register Mtx3P mtx, register Vec* vec, register Vec* dst) {
// Register variables referenced only by the first copy of the sequence.
register f32* punit;
register f32 unit;
register f32 fr12;
register f32 fr11;
register f32 fr10;
register f32 fr9;
register f32 fr8;
register f32 fr6;
register f32 fr5;
register f32 fr4;
register f32 fr3;
register f32 fr2;
register f32 fr01;
register f32 fr00;
#ifdef __MWERKS__
asm {
// Copy 1: operands are the named register variables. lis/addi build the address of
// PSMulUnit01 in `punit`.
lis punit, PSMulUnit01@ha
psq_l fr00, 0(vec), 0, 0
addi punit, punit, PSMulUnit01@l
psq_l fr2, 0(mtx), 0, 0
psq_l unit, 0(punit), 0, 0
psq_l fr01, 8(vec), 1, 0
ps_add fr01, unit, fr01
psq_l fr3, 8(mtx), 1, 0
ps_mul fr4, fr2, fr00
psq_l fr8, 12(mtx), 0, 0
ps_madd fr5, fr3, fr01, fr4
ps_sum0 fr6, fr5, fr6, fr5
psq_l fr9, 20(mtx), 1, 0
ps_mul fr10, fr8, fr00
psq_st fr6, 0(dst), 1, 0
ps_madd fr11, fr9, fr01, fr10
psq_l fr2, 24(mtx), 0, 0
ps_sum0 fr12, fr11, fr12, fr11
psq_l fr3, 32(mtx), 1, 0
ps_mul fr4, fr2, fr00
psq_st fr12, 4(dst), 1, 0
ps_madd fr5, fr3, fr01, fr4
ps_sum0 fr6, fr5, fr6, fr5
psq_st fr6, 8(dst), 1, 0
// Copy 2: the same instruction order with registers chosen by hand (r6 holds the
// PSMulUnit01 address, f13 the pair loaded from it).
lis r6, PSMulUnit01@ha
psq_l f0, 0(vec), 0, 0
addi r6, r6, PSMulUnit01@l
psq_l f2, 0(mtx), 0, 0
psq_l f13, 0(r6), 0, 0
psq_l f1, 8(vec), 1, 0
ps_add f1, f13, f1
psq_l f3, 8(mtx), 1, 0
ps_mul f4, f2, f0
psq_l f8, 12(mtx), 0, 0
ps_madd f5, f3, f1, f4
ps_sum0 f6, f5, f6, f5
psq_l f9, 20(mtx), 1, 0
ps_mul f10, f8, f0
psq_st f6, 0(dst), 1, 0
ps_madd f11, f9, f1, f10
psq_l f2, 24(mtx), 0, 0
ps_sum0 f12, f11, f12, f11
psq_l f3, 32(mtx), 1, 0
ps_mul f4, f2, f0
psq_st f12, 4(dst), 1, 0
ps_madd f5, f3, f1, f4
ps_sum0 f6, f5, f6, f5
psq_st f6, 8(dst), 1, 0
}
#endif
}
// regalloc issues
// Mtx3P / S16Vec overload: multiplies the vector `vec` by the matrix `mtx` with
// paired-single instructions and writes the three results to `dst`.
//
// Matrix access matches the Mtx3P / Vec overload: rows at byte offsets 0, 12 and 24, each read
// as a pair load plus a single-element load (W operand = 1). The vector loads and result
// stores use 7 as their last operand and step by 2 bytes per component (vec+0 pair, vec+4
// single; stores at dst+0, dst+2, dst+4). A pair loaded from the address of PSMulUnit01 is
// added to the register holding the single vec element before any multiplication.
//
// NOTE(review): the trailing 7 presumably selects quantization register GQR7, configured by
// the caller for 16-bit conversion -- confirm.
// NOTE(review): PSMulUnit01 is defined outside this view. The add presumably adjusts the
// second lane left by the W=1 load so that it does not contribute to the sum -- confirm its
// contents.
// NOTE(review): this listing shows the instruction sequence twice. The first copy uses the
// named variables declared below and reuses fr2-fr6 for every row. The second copy uses
// hard-coded registers, with f8-f12 for the middle row ("manual regalloc" per the commit
// message). They are presumably the before and after of that change.
//
// The asm body is only compiled when __MWERKS__ is defined; otherwise the function is empty.
inline void J3DPSMulMtxVec(register Mtx3P mtx, register S16Vec* vec, register S16Vec* dst) {
// Register variables referenced only by the first copy of the sequence.
register f32* punit;
register f32 unit;
register f32 fr6;
register f32 fr5;
register f32 fr4;
register f32 fr3;
register f32 fr2;
register f32 fr01;
register f32 fr00;
#ifdef __MWERKS__
asm {
// Copy 1: operands are the named register variables. lis/addi build the address of
// PSMulUnit01 in `punit`.
lis punit, PSMulUnit01@ha
psq_l fr00, 0(vec), 0, 7
addi punit, punit, PSMulUnit01@l
psq_l fr2, 0(mtx), 0, 0
psq_l unit, 0(punit), 0, 0
psq_l fr01, 4(vec), 1, 7
ps_add fr01, unit, fr01
psq_l fr3, 8(mtx), 1, 0
ps_mul fr4, fr2, fr00
psq_l fr2, 12(mtx), 0, 0
ps_madd fr5, fr3, fr01, fr4
ps_sum0 fr6, fr5, fr6, fr5
psq_l fr3, 20(mtx), 1, 0
ps_mul fr4, fr2, fr00
psq_st fr6, 0(dst), 1, 7
ps_madd fr5, fr3, fr01, fr4
psq_l fr2, 24(mtx), 0, 0
ps_sum0 fr6, fr5, fr6, fr5
psq_l fr3, 32(mtx), 1, 0
ps_mul fr4, fr2, fr00
psq_st fr6, 2(dst), 1, 7
ps_madd fr5, fr3, fr01, fr4
ps_sum0 fr6, fr5, fr6, fr5
psq_st fr6, 4(dst), 1, 7
// Copy 2: the same instruction order with registers chosen by hand (r6 holds the
// PSMulUnit01 address, f13 the pair loaded from it).
lis r6, PSMulUnit01@ha
psq_l f0, 0(vec), 0, 7
addi r6, r6, PSMulUnit01@l
psq_l f2, 0(mtx), 0, 0
psq_l f13, 0(r6), 0, 0
psq_l f1, 4(vec), 1, 7
ps_add f1, f13, f1
psq_l f3, 8(mtx), 1, 0
ps_mul f4, f2, f0
psq_l f8, 12(mtx), 0, 0
ps_madd f5, f3, f1, f4
ps_sum0 f6, f5, f6, f5
psq_l f9, 20(mtx), 1, 0
ps_mul f10, f8, f0
psq_st f6, 0(dst), 1, 7
ps_madd f11, f9, f1, f10
psq_l f2, 24(mtx), 0, 0
ps_sum0 f12, f11, f12, f11
psq_l f3, 32(mtx), 1, 0
ps_mul f4, f2, f0
psq_st f12, 2(dst), 1, 7
ps_madd f5, f3, f1, f4
ps_sum0 f6, f5, f6, f5
psq_st f6, 4(dst), 1, 7
}
#endif
}

View File

@ -200,15 +200,15 @@ public:
// Returns the result of getLastLink() cast to a JSUTree<T>*.
JSUTree<T>* getLastChild() const { return (JSUTree<T>*)this->getLastLink(); }
// Returns mNext cast to a JSUTree<T>*.
// NOTE(review): shown in two spellings (JSUPtrLink::mNext and this->mNext); presumably the
// before and after of the same accessor -- only one can exist in the real class.
JSUTree<T>* getNextChild() const { return (JSUTree<T>*)JSUPtrLink::mNext; }
JSUTree<T>* getNextChild() const { return (JSUTree<T>*)this->mNext; }
// Returns mPrev cast to a JSUTree<T>*.
// NOTE(review): shown in two spellings (JSUPtrLink::mPrev and this->mPrev); presumably the
// before and after of the same accessor -- only one can exist in the real class.
JSUTree<T>* getPrevChild() const { return (JSUTree<T>*)JSUPtrLink::mPrev; }
JSUTree<T>* getPrevChild() const { return (JSUTree<T>*)this->mPrev; }
// Returns the value of getNumLinks() as the number of children.
u32 getNumChildren() const { return this->getNumLinks(); }
// Returns mObject cast to T*.
// NOTE(review): shown in two spellings (JSUPtrLink::mObject and this->mObject); presumably
// the before and after of the same accessor -- only one can exist in the real class.
T* getObject() const { return (T*)JSUPtrLink::mObject; }
T* getObject() const { return (T*)this->mObject; }
// Returns mList cast to a JSUTree<T>*.
// NOTE(review): shown in two spellings (JSUPtrLink::mList and this->mList); presumably the
// before and after of the same accessor -- only one can exist in the real class.
JSUTree<T>* getParent() const { return (JSUTree<T>*)JSUPtrLink::mList; }
JSUTree<T>* getParent() const { return (JSUTree<T>*)this->mList; }
};
/**

View File

@ -212,7 +212,6 @@ u16 J3DSkinDeform::sWorkArea_MtxReg[1024];
/* 8032CF44-8032D378 327884 0434+00 0/0 1/1 0/0 .text
* initMtxIndexArray__13J3DSkinDeformFP12J3DModelData */
// NONMATCHING - matches debug, not retail
int J3DSkinDeform::initMtxIndexArray(J3DModelData* pModelData) {
J3D_ASSERT_NULLPTR(507, pModelData != NULL);
if (mPosData != NULL && mNrmData != NULL) {
@ -367,75 +366,76 @@ int J3DSkinDeform::initMtxIndexArray(J3DModelData* pModelData) {
/* 8032D378-8032D5C4 327CB8 024C+00 0/0 1/1 0/0 .text
* changeFastSkinDL__13J3DSkinDeformFP12J3DModelData */
// NONMATCHING - regalloc, display list access issues
// NONMATCHING - instruction ordering/optimization issue; matches debug.
// For the function to match, the compiler needs to delay adding +3 to dl until the end of the while loop,
// but instead it puts the +3 at the start of the for loop and reworks the other instructions.
// A 99.93% match on retail is possible by moving where dl is incremented, but that seems fake: it breaks debug and introduces an operand swap on src.
// Rewrites the display lists of every shape in `pModelData` in place so that each vertex no
// longer carries its leading GX_VA_PNMTXIDX byte, then removes GX_VA_PNMTXIDX from every
// shape's vertex descriptor list and calls makeVcdVatCmd() on the shape.
//
// NOTE(review): this listing is a rendered diff. Statements that were renamed appear twice:
// first with the old local names (local_30, local_34, local_44, puVar10, uVar9, local_5c,
// local_60, pShape, ...) and then with the new ones (pnmtxIdxOffs, vtxSize, dl, dst, vtxCount,
// desc, descDst, shape, ...). The comments below are attached to the new-name statements.
void J3DSkinDeform::changeFastSkinDL(J3DModelData* pModelData) {
J3D_ASSERT_NULLPTR(740, pModelData != NULL);
// Pass 1: compact the display lists of every shape that uses GX_VA_PNMTXIDX.
for (u16 i = 0; i < pModelData->getShapeNum(); i++) {
// Bytes one attribute occupies in a vertex, indexed by vtxDesc->type.
// presumably ordered {GX_NONE, GX_DIRECT, GX_INDEX8, GX_INDEX16} -- TODO confirm
u32 kSize[4] = {0,1,1,2};
int local_30 = -1;
int local_34 = 0;
// Byte offset of GX_VA_PNMTXIDX inside a vertex, or -1 when the shape has none.
int pnmtxIdxOffs = -1;
// Running total of bytes per vertex, summed over the descriptor list.
int vtxSize = 0;
J3DShape* pShapeNode = pModelData->getShapeNodePointer(i);
for (GXVtxDescList* vtxDesc = pShapeNode->getVtxDesc(); vtxDesc->attr != GX_VA_NULL; vtxDesc++) {
if (vtxDesc->attr == GX_VA_PNMTXIDX) {
local_30 = local_34;
pnmtxIdxOffs = vtxSize;
}
local_34 += kSize[vtxDesc->type];
vtxSize += kSize[vtxDesc->type];
}
if (local_30 != -1) {
// Shapes without GX_VA_PNMTXIDX are left untouched.
if (pnmtxIdxOffs != -1) {
for (u16 j = 0; j < (u16)pShapeNode->getMtxGroupNum(); j++) {
u8* pDList = pShapeNode->getShapeDraw(j)->getDisplayList();
u8* local_44 = pDList;
u8* puVar10 = pDList;
while (local_44 - pDList < pShapeNode->getShapeDraw(j)->getDisplayListSize()) {
u8 command = *local_44;
local_44++;
*puVar10++ = command;
if (command != GX_TRIANGLEFAN && command != GX_TRIANGLESTRIP)
// `dl` reads the original stream and `dst` writes the compacted stream into the same
// buffer; dst never gets ahead of dl because every vertex shrinks by one byte.
u8* displayListStart = pShapeNode->getShapeDraw(j)->getDisplayList();
u8* dl = displayListStart;
u8* dst = displayListStart;
while ((dl - displayListStart) < pShapeNode->getShapeDraw(j)->getDisplayListSize()) {
// Copy the command byte through unchanged.
u8 cmd = *dl;
dl++;
*dst++ = cmd;
// Any command other than a triangle fan or strip ends the scan of this list.
if (cmd != GX_TRIANGLEFAN && cmd != GX_TRIANGLESTRIP)
break;
int uVar9 = *(u16*)local_44;
local_44 += 2;
*(u16*)puVar10 = uVar9;
puVar10 += 2;
for (int local_4c = 0; local_4c < uVar9; local_4c++) {
u8* dst = &local_44[local_34 * local_4c];
memcpy(puVar10, dst + 1, local_34 - 1);
// FAKEMATCH
#if DEBUG || VERSION == VERSION_WII_USA_R0 || VERSION == VERSION_WII_USA_R2
puVar10 += local_34 - 1;
#else
puVar10 = (local_34 + puVar10) - 1;
#endif
// A 16-bit vertex count follows the command byte and is copied through unchanged.
int vtxCount = *(u16*)dl;
dl += 2;
*(u16*)dst = vtxCount;
dst += 2;
for (int k = 0; k < vtxCount; k++) {
u8* src = &dl[vtxSize * k];
// NOTE(review): the copy always skips the first byte of the vertex; pnmtxIdxOffs is only
// compared against -1, so this presumably relies on GX_VA_PNMTXIDX being at offset 0.
memcpy(dst, src + 1, (int)(vtxSize - 1)); // The -1 is to remove GX_VA_PNMTXIDX
dst += (int)(vtxSize - 1);
}
local_44 += local_34 * uVar9;
// Skip the whole block of original vertices that was just copied.
dl += vtxSize * vtxCount;
}
int dlistSize = ((int)puVar10 - (int)pDList + 0x1f) & ~0x1f;
while ((int)puVar10 - (int)pDList < pShapeNode->getShapeDraw(j)->getDisplayListSize()) {
*puVar10++ = 0;
// Compacted length rounded up to a multiple of 32 bytes.
int dlistSize = ((int)dst - (int)displayListStart + 0x1f) & ~0x1f;
// Zero-fill from the end of the compacted data to the end of the original list.
while ((int)dst - (int)displayListStart < pShapeNode->getShapeDraw(j)->getDisplayListSize()) {
*dst++ = 0;
}
pShapeNode->getShapeDraw(j)->setDisplayListSize(dlistSize);
DCStoreRange(pDList, pShapeNode->getShapeDraw(j)->getDisplayListSize());
// Called after setDisplayListSize(), so the range length is read back from the shape draw.
// presumably DCStoreRange writes the modified bytes back from the data cache -- TODO confirm
DCStoreRange(displayListStart, pShapeNode->getShapeDraw(j)->getDisplayListSize());
}
}
}
// Pass 2: drop GX_VA_PNMTXIDX from every shape's vertex descriptor list.
for (u16 i = 0; i < pModelData->getShapeNum(); i++) {
J3DShape* pShape = pModelData->getShapeNodePointer(i);
GXVtxDescList* local_5c = pShape->getVtxDesc();
GXVtxDescList* local_60 = local_5c;
for (; local_5c->attr != GX_VA_NULL; local_5c++) {
if (local_5c->attr != GX_VA_PNMTXIDX) {
local_60->attr = local_5c->attr;
local_60->type = local_5c->type;
local_60++;
J3DShape* shape = pModelData->getShapeNodePointer(i);
// `desc` walks the list while `descDst` trails it, so kept entries are packed toward the
// front of the same array.
GXVtxDescList* desc = shape->getVtxDesc();
GXVtxDescList* descDst = desc;
for (; desc->attr != GX_VA_NULL; desc++) {
if (desc->attr != GX_VA_PNMTXIDX) {
descDst->attr = desc->attr;
descDst->type = desc->type;
descDst++;
}
}
local_60->attr = GX_VA_NULL;
local_60->type = GX_NONE;
pShape->makeVcdVatCmd();
// Write the terminator after the last kept entry.
descDst->attr = GX_VA_NULL;
descDst->type = GX_NONE;
// presumably regenerates the shape's vertex-format commands from the edited list -- TODO confirm
shape->makeVcdVatCmd();
}
}
@ -576,7 +576,6 @@ void J3DSkinDeform::deformFastVtxNrm_F32(J3DVertexBuffer* pVtxBuffer, J3DMtxBuff
/* 8032DB50-8032DC74 328490 0124+00 1/1 0/0 0/0 .text
* deformVtxPos_F32__13J3DSkinDeformCFP15J3DVertexBufferP12J3DMtxBuffer */
// NONMATCHING - J3DPSMulMtxVec regalloc
void J3DSkinDeform::deformVtxPos_F32(J3DVertexBuffer* pVtxBuffer, J3DMtxBuffer* pMtxBuffer) const {
Mtx* anmMtx = NULL;
Mtx* anmMtxs[2];
@ -601,7 +600,6 @@ void J3DSkinDeform::deformVtxPos_F32(J3DVertexBuffer* pVtxBuffer, J3DMtxBuffer*
/* 8032DC74-8032DDB8 3285B4 0144+00 1/1 0/0 0/0 .text
* deformVtxPos_S16__13J3DSkinDeformCFP15J3DVertexBufferP12J3DMtxBuffer */
// NONMATCHING - J3DPSMulMtxVec regalloc
void J3DSkinDeform::deformVtxPos_S16(J3DVertexBuffer* pVtxBuffer, J3DMtxBuffer* pMtxBuffer) const {
Mtx* anmMtx = NULL;
Mtx* anmMtxs[2];
@ -628,7 +626,6 @@ void J3DSkinDeform::deformVtxPos_S16(J3DVertexBuffer* pVtxBuffer, J3DMtxBuffer*
/* 8032DDB8-8032DEBC 3286F8 0104+00 1/1 0/0 0/0 .text
* deformVtxNrm_F32__13J3DSkinDeformCFP15J3DVertexBuffer */
// NONMATCHING - J3DPSMulMtxVec regalloc
void J3DSkinDeform::deformVtxNrm_F32(J3DVertexBuffer* pVtxBuffer) const {
pVtxBuffer->swapTransformedVtxNrm();
int nrmNum = pVtxBuffer->getVertexData()->getNrmNum();
@ -645,7 +642,6 @@ void J3DSkinDeform::deformVtxNrm_F32(J3DVertexBuffer* pVtxBuffer) const {
/* 8032DEBC-8032DFDC 3287FC 0120+00 1/1 0/0 0/0 .text
* deformVtxNrm_S16__13J3DSkinDeformCFP15J3DVertexBuffer */
// NONMATCHING - J3DPSMulMtxVec regalloc
void J3DSkinDeform::deformVtxNrm_S16(J3DVertexBuffer* pVtxBuffer) const {
int vtxNrmFrac = pVtxBuffer->getVertexData()->getVtxNrmFrac();
J3DGQRSetup7(vtxNrmFrac, 7, vtxNrmFrac, 7);