From 434415337d268e405b09f6b72935a7e918e4cfe7 Mon Sep 17 00:00:00 2001 From: LagoLunatic Date: Sun, 30 Nov 2025 16:39:56 -0500 Subject: [PATCH] J3DSkinDeform work (#2896) * Match four inline asm functions with manual regalloc * J3DSkinDeform::changeFastSkinDL: Clean up var names, slightly improve match * Syntax * More var names --- include/JSystem/J3DGraphBase/J3DTransform.h | 235 +++++++----------- include/JSystem/JSupport/JSUList.h | 8 +- .../J3DGraphAnimator/J3DSkinDeform.cpp | 92 ++++--- 3 files changed, 136 insertions(+), 199 deletions(-) diff --git a/include/JSystem/J3DGraphBase/J3DTransform.h b/include/JSystem/J3DGraphBase/J3DTransform.h index 242b3b03f6d..5ae91a7bb72 100644 --- a/include/JSystem/J3DGraphBase/J3DTransform.h +++ b/include/JSystem/J3DGraphBase/J3DTransform.h @@ -111,177 +111,118 @@ inline void J3DPSMtx33CopyFrom34(register MtxP src, register Mtx3P dst) { #endif } -// regalloc issues inline void J3DPSMulMtxVec(register MtxP mtx, register Vec* vec, register Vec* dst) { - register f32 fr12; - register f32 fr11; - register f32 fr10; - register f32 fr9; - register f32 fr8; - register f32 fr6; - register f32 fra6; - register f32 fr5; - register f32 fra5; - register f32 fra4; - register f32 fr4; - register f32 fr3; - register f32 fr2; - register f32 fra2; - register f32 fr01; - register f32 fr00; #ifdef __MWERKS__ asm { - psq_l fr00, 0(vec), 0, 0 - psq_l fr2, 0(mtx), 0, 0 - psq_l fr01, 8(vec), 1, 0 - ps_mul fr4, fr2, fr00 - psq_l fr3, 8(mtx), 0, 0 - ps_madd fr5, fr3, fr01, fr4 - psq_l fr8, 16(mtx), 0, 0 - ps_sum0 fr6, fr5, fr6, fr5 - psq_l fr9, 24(mtx), 0, 0 - ps_mul fr10, fr8, fr00 - psq_st fr6, 0(dst), 1, 0 - ps_madd fr11, fr9, fr01, fr10 - psq_l fra2, 32(mtx), 0, 0 - ps_sum0 fr12, fr11, fr12, fr11 - psq_l fr3, 40(mtx), 0, 0 - ps_mul fra4, fra2, fr00 - psq_st fr12, 4(dst), 1, 0 - ps_madd fra5, fr3, fr01, fra4 - ps_sum0 fra6, fra5, fra6, fra5 - psq_st fra6, 8(dst), 1, 0 + psq_l f0, 0(vec), 0, 0 + psq_l f2, 0(mtx), 0, 0 + psq_l f1, 8(vec), 1, 0 + ps_mul f4, f2, f0 + psq_l f3, 8(mtx), 0, 0 + ps_madd f5, f3, f1, f4 + psq_l f8, 16(mtx), 0, 0 + ps_sum0 f6, f5, f6, f5 + psq_l f9, 24(mtx), 0, 0 + ps_mul f10, f8, f0 + psq_st f6, 0(dst), 1, 0 + ps_madd f11, f9, f1, f10 + psq_l f2, 32(mtx), 0, 0 + ps_sum0 f12, f11, f12, f11 + psq_l f3, 40(mtx), 0, 0 + ps_mul f4, f2, f0 + psq_st f12, 4(dst), 1, 0 + ps_madd f5, f3, f1, f4 + ps_sum0 f6, f5, f6, f5 + psq_st f6, 8(dst), 1, 0 } #endif } -// regalloc issues inline void J3DPSMulMtxVec(register MtxP mtx, register S16Vec* vec, register S16Vec* dst) { - register f32 fr12; - register f32 fr11; - register f32 fr10; - register f32 fr9; - register f32 fr8; - register f32 fr6; - register f32 fra6; - register f32 fr5; - register f32 fra5; - register f32 fra4; - register f32 fr4; - register f32 fr3; - register f32 fr2; - register f32 fra2; - register f32 fr01; - register f32 fr00; #ifdef __MWERKS__ asm { - psq_l fr00, 0(vec), 0, 7 - psq_l fr2, 0(mtx), 0, 0 - psq_l fr01, 4(vec), 1, 7 - ps_mul fr4, fr2, fr00 - psq_l fr3, 8(mtx), 0, 0 - ps_madd fr5, fr3, fr01, fr4 - psq_l fr8, 16(mtx), 0, 0 - ps_sum0 fr6, fr5, fr6, fr5 - psq_l fr9, 24(mtx), 0, 0 - ps_mul fr10, fr8, fr00 - psq_st fr6, 0(dst), 1, 7 - ps_madd fr11, fr9, fr01, fr10 - psq_l fra2, 32(mtx), 0, 0 - ps_sum0 fr12, fr11, fr12, fr11 - psq_l fr3, 40(mtx), 0, 0 - ps_mul fra4, fra2, fr00 - psq_st fr12, 2(dst), 1, 7 - ps_madd fra5, fr3, fr01, fra4 - ps_sum0 fra6, fra5, fra6, fra5 - psq_st fra6, 4(dst), 1, 7 + psq_l f0, 0(vec), 0, 7 + psq_l f2, 0(mtx), 0, 0 + psq_l f1, 4(vec), 1, 7 + ps_mul f4, f2, f0 + psq_l f3, 8(mtx), 0, 0 + ps_madd f5, f3, f1, f4 + psq_l f8, 16(mtx), 0, 0 + ps_sum0 f6, f5, f6, f5 + psq_l f9, 24(mtx), 0, 0 + ps_mul f10, f8, f0 + psq_st f6, 0(dst), 1, 7 + ps_madd f11, f9, f1, f10 + psq_l f2, 32(mtx), 0, 0 + ps_sum0 f12, f11, f12, f11 + psq_l f3, 40(mtx), 0, 0 + ps_mul f4, f2, f0 + psq_st f12, 2(dst), 1, 7 + ps_madd f5, f3, f1, f4 + ps_sum0 f6, f5, f6, f5 + psq_st f6, 4(dst), 1, 7 } #endif } -// regalloc issues inline void J3DPSMulMtxVec(register Mtx3P mtx, register Vec* vec, register Vec* dst) { - register f32* punit; - register f32 unit; - register f32 fr12; - register f32 fr11; - register f32 fr10; - register f32 fr9; - register f32 fr8; - register f32 fr6; - register f32 fr5; - register f32 fr4; - register f32 fr3; - register f32 fr2; - register f32 fr01; - register f32 fr00; #ifdef __MWERKS__ asm { - lis punit, PSMulUnit01@ha - psq_l fr00, 0(vec), 0, 0 - addi punit, punit, PSMulUnit01@l - psq_l fr2, 0(mtx), 0, 0 - psq_l unit, 0(punit), 0, 0 - psq_l fr01, 8(vec), 1, 0 - ps_add fr01, unit, fr01 - psq_l fr3, 8(mtx), 1, 0 - ps_mul fr4, fr2, fr00 - psq_l fr8, 12(mtx), 0, 0 - ps_madd fr5, fr3, fr01, fr4 - ps_sum0 fr6, fr5, fr6, fr5 - psq_l fr9, 20(mtx), 1, 0 - ps_mul fr10, fr8, fr00 - psq_st fr6, 0(dst), 1, 0 - ps_madd fr11, fr9, fr01, fr10 - psq_l fr2, 24(mtx), 0, 0 - ps_sum0 fr12, fr11, fr12, fr11 - psq_l fr3, 32(mtx), 1, 0 - ps_mul fr4, fr2, fr00 - psq_st fr12, 4(dst), 1, 0 - ps_madd fr5, fr3, fr01, fr4 - ps_sum0 fr6, fr5, fr6, fr5 - psq_st fr6, 8(dst), 1, 0 + lis r6, PSMulUnit01@ha + psq_l f0, 0(vec), 0, 0 + addi r6, r6, PSMulUnit01@l + psq_l f2, 0(mtx), 0, 0 + psq_l f13, 0(r6), 0, 0 + psq_l f1, 8(vec), 1, 0 + ps_add f1, f13, f1 + psq_l f3, 8(mtx), 1, 0 + ps_mul f4, f2, f0 + psq_l f8, 12(mtx), 0, 0 + ps_madd f5, f3, f1, f4 + ps_sum0 f6, f5, f6, f5 + psq_l f9, 20(mtx), 1, 0 + ps_mul f10, f8, f0 + psq_st f6, 0(dst), 1, 0 + ps_madd f11, f9, f1, f10 + psq_l f2, 24(mtx), 0, 0 + ps_sum0 f12, f11, f12, f11 + psq_l f3, 32(mtx), 1, 0 + ps_mul f4, f2, f0 + psq_st f12, 4(dst), 1, 0 + ps_madd f5, f3, f1, f4 + ps_sum0 f6, f5, f6, f5 + psq_st f6, 8(dst), 1, 0 } #endif } -// regalloc issues inline void J3DPSMulMtxVec(register Mtx3P mtx, register S16Vec* vec, register S16Vec* dst) { - register f32* punit; - register f32 unit; - register f32 fr6; - register f32 fr5; - register f32 fr4; - register f32 fr3; - register f32 fr2; - register f32 fr01; - register f32 fr00; #ifdef __MWERKS__ asm { - lis punit, PSMulUnit01@ha - psq_l fr00, 0(vec), 0, 7 - addi punit, punit, PSMulUnit01@l - psq_l fr2, 0(mtx), 0, 0 - psq_l unit, 0(punit), 0, 0 - psq_l fr01, 4(vec), 1, 7 - ps_add fr01, unit, fr01 - psq_l fr3, 8(mtx), 1, 0 - ps_mul fr4, fr2, fr00 - psq_l fr2, 12(mtx), 0, 0 - ps_madd fr5, fr3, fr01, fr4 - ps_sum0 fr6, fr5, fr6, fr5 - psq_l fr3, 20(mtx), 1, 0 - ps_mul fr4, fr2, fr00 - psq_st fr6, 0(dst), 1, 7 - ps_madd fr5, fr3, fr01, fr4 - psq_l fr2, 24(mtx), 0, 0 - ps_sum0 fr6, fr5, fr6, fr5 - psq_l fr3, 32(mtx), 1, 0 - ps_mul fr4, fr2, fr00 - psq_st fr6, 2(dst), 1, 7 - ps_madd fr5, fr3, fr01, fr4 - ps_sum0 fr6, fr5, fr6, fr5 - psq_st fr6, 4(dst), 1, 7 + lis r6, PSMulUnit01@ha + psq_l f0, 0(vec), 0, 7 + addi r6, r6, PSMulUnit01@l + psq_l f2, 0(mtx), 0, 0 + psq_l f13, 0(r6), 0, 0 + psq_l f1, 4(vec), 1, 7 + ps_add f1, f13, f1 + psq_l f3, 8(mtx), 1, 0 + ps_mul f4, f2, f0 + psq_l f8, 12(mtx), 0, 0 + ps_madd f5, f3, f1, f4 + ps_sum0 f6, f5, f6, f5 + psq_l f9, 20(mtx), 1, 0 + ps_mul f10, f8, f0 + psq_st f6, 0(dst), 1, 7 + ps_madd f11, f9, f1, f10 + psq_l f2, 24(mtx), 0, 0 + ps_sum0 f12, f11, f12, f11 + psq_l f3, 32(mtx), 1, 0 + ps_mul f4, f2, f0 + psq_st f12, 2(dst), 1, 7 + ps_madd f5, f3, f1, f4 + ps_sum0 f6, f5, f6, f5 + psq_st f6, 4(dst), 1, 7 } #endif } diff --git a/include/JSystem/JSupport/JSUList.h b/include/JSystem/JSupport/JSUList.h index 06ccfd0f181..8e0f044d3a7 100644 --- a/include/JSystem/JSupport/JSUList.h +++ b/include/JSystem/JSupport/JSUList.h @@ -200,15 +200,15 @@ public: JSUTree* getLastChild() const { return (JSUTree*)this->getLastLink(); } - JSUTree* getNextChild() const { return (JSUTree*)JSUPtrLink::mNext; } + JSUTree* getNextChild() const { return (JSUTree*)this->mNext; } - JSUTree* getPrevChild() const { return (JSUTree*)JSUPtrLink::mPrev; } + JSUTree* getPrevChild() const { return (JSUTree*)this->mPrev; } u32 getNumChildren() const { return this->getNumLinks(); } - T* getObject() const { return (T*)JSUPtrLink::mObject; } + T* getObject() const { return (T*)this->mObject; } - JSUTree* getParent() const { return (JSUTree*)JSUPtrLink::mList; } + JSUTree* getParent() const { return (JSUTree*)this->mList; } }; /** diff --git a/src/JSystem/J3DGraphAnimator/J3DSkinDeform.cpp b/src/JSystem/J3DGraphAnimator/J3DSkinDeform.cpp index 4575a914a0b..8c3efdd0fa4 100644 --- a/src/JSystem/J3DGraphAnimator/J3DSkinDeform.cpp +++ b/src/JSystem/J3DGraphAnimator/J3DSkinDeform.cpp @@ -212,7 +212,6 @@ u16 J3DSkinDeform::sWorkArea_MtxReg[1024]; /* 8032CF44-8032D378 327884 0434+00 0/0 1/1 0/0 .text * initMtxIndexArray__13J3DSkinDeformFP12J3DModelData */ -// NONMATCHING - matches debug, not retail int J3DSkinDeform::initMtxIndexArray(J3DModelData* pModelData) { J3D_ASSERT_NULLPTR(507, pModelData != NULL); if (mPosData != NULL && mNrmData != NULL) { @@ -367,75 +366,76 @@ int J3DSkinDeform::initMtxIndexArray(J3DModelData* pModelData) { /* 8032D378-8032D5C4 327CB8 024C+00 0/0 1/1 0/0 .text * changeFastSkinDL__13J3DSkinDeformFP12J3DModelData */ -// NONMATCHING - regalloc, display list access issues +// NONMATCHING - instruction ordering/optimization issue, matches debug +// the compiler needs to delay adding +3 to dl until the end of the while loop for the function to match +// but instead it puts the +3 at the start of the for loop and reworks the other instructions +// can get a 99.93% match on retail by moving where dl is incremented, but it seems fake as it breaks debug, and introduces an operand swap on src void J3DSkinDeform::changeFastSkinDL(J3DModelData* pModelData) { J3D_ASSERT_NULLPTR(740, pModelData != NULL); for (u16 i = 0; i < pModelData->getShapeNum(); i++) { u32 kSize[4] = {0,1,1,2}; - int local_30 = -1; - int local_34 = 0; + int pnmtxIdxOffs = -1; + int vtxSize = 0; + J3DShape* pShapeNode = pModelData->getShapeNodePointer(i); for (GXVtxDescList* vtxDesc = pShapeNode->getVtxDesc(); vtxDesc->attr != GX_VA_NULL; vtxDesc++) { if (vtxDesc->attr == GX_VA_PNMTXIDX) { - local_30 = local_34; + pnmtxIdxOffs = vtxSize; } - local_34 += kSize[vtxDesc->type]; + vtxSize += kSize[vtxDesc->type]; } - if (local_30 != -1) { + if (pnmtxIdxOffs != -1) { for (u16 j = 0; j < (u16)pShapeNode->getMtxGroupNum(); j++) { - u8* pDList = pShapeNode->getShapeDraw(j)->getDisplayList(); - u8* local_44 = pDList; - u8* puVar10 = pDList; - while (local_44 - pDList < pShapeNode->getShapeDraw(j)->getDisplayListSize()) { - u8 command = *local_44; - local_44++; - *puVar10++ = command; - if (command != GX_TRIANGLEFAN && command != GX_TRIANGLESTRIP) + u8* displayListStart = pShapeNode->getShapeDraw(j)->getDisplayList(); + u8* dl = displayListStart; + u8* dst = displayListStart; + while ((dl - displayListStart) < pShapeNode->getShapeDraw(j)->getDisplayListSize()) { + u8 cmd = *dl; + dl++; + *dst++ = cmd; + + if (cmd != GX_TRIANGLEFAN && cmd != GX_TRIANGLESTRIP) break; - int uVar9 = *(u16*)local_44; - local_44 += 2; - *(u16*)puVar10 = uVar9; - puVar10 += 2; - for (int local_4c = 0; local_4c < uVar9; local_4c++) { - u8* dst = &local_44[local_34 * local_4c]; - memcpy(puVar10, dst + 1, local_34 - 1); - // FAKEMATCH - #if DEBUG || VERSION == VERSION_WII_USA_R0 || VERSION == VERSION_WII_USA_R2 - puVar10 += local_34 - 1; - #else - puVar10 = (local_34 + puVar10) - 1; - #endif + int vtxCount = *(u16*)dl; + dl += 2; + *(u16*)dst = vtxCount; + dst += 2; + + for (int k = 0; k < vtxCount; k++) { + u8* src = &dl[vtxSize * k]; + memcpy(dst, src + 1, (int)(vtxSize - 1)); // The -1 is to remove GX_VA_PNMTXIDX + dst += (int)(vtxSize - 1); } - local_44 += local_34 * uVar9; + dl += vtxSize * vtxCount; } - int dlistSize = ((int)puVar10 - (int)pDList + 0x1f) & ~0x1f; - while ((int)puVar10 - (int)pDList < pShapeNode->getShapeDraw(j)->getDisplayListSize()) { - *puVar10++ = 0; + int dlistSize = ((int)dst - (int)displayListStart + 0x1f) & ~0x1f; + while ((int)dst - (int)displayListStart < pShapeNode->getShapeDraw(j)->getDisplayListSize()) { + *dst++ = 0; } pShapeNode->getShapeDraw(j)->setDisplayListSize(dlistSize); - DCStoreRange(pDList, pShapeNode->getShapeDraw(j)->getDisplayListSize()); + DCStoreRange(displayListStart, pShapeNode->getShapeDraw(j)->getDisplayListSize()); } } } for (u16 i = 0; i < pModelData->getShapeNum(); i++) { - J3DShape* pShape = pModelData->getShapeNodePointer(i); - GXVtxDescList* local_5c = pShape->getVtxDesc(); - GXVtxDescList* local_60 = local_5c; - for (; local_5c->attr != GX_VA_NULL; local_5c++) { - if (local_5c->attr != GX_VA_PNMTXIDX) { - local_60->attr = local_5c->attr; - local_60->type = local_5c->type; - local_60++; + J3DShape* shape = pModelData->getShapeNodePointer(i); + GXVtxDescList* desc = shape->getVtxDesc(); + GXVtxDescList* descDst = desc; + for (; desc->attr != GX_VA_NULL; desc++) { + if (desc->attr != GX_VA_PNMTXIDX) { + descDst->attr = desc->attr; + descDst->type = desc->type; + descDst++; } } - local_60->attr = GX_VA_NULL; - local_60->type = GX_NONE; - pShape->makeVcdVatCmd(); + descDst->attr = GX_VA_NULL; + descDst->type = GX_NONE; + shape->makeVcdVatCmd(); } } @@ -576,7 +576,6 @@ void J3DSkinDeform::deformFastVtxNrm_F32(J3DVertexBuffer* pVtxBuffer, J3DMtxBuff /* 8032DB50-8032DC74 328490 0124+00 1/1 0/0 0/0 .text * deformVtxPos_F32__13J3DSkinDeformCFP15J3DVertexBufferP12J3DMtxBuffer */ -// NONMATCHING - J3DPSMulMtxVec regalloc void J3DSkinDeform::deformVtxPos_F32(J3DVertexBuffer* pVtxBuffer, J3DMtxBuffer* pMtxBuffer) const { Mtx* anmMtx = NULL; Mtx* anmMtxs[2]; @@ -601,7 +600,6 @@ void J3DSkinDeform::deformVtxPos_F32(J3DVertexBuffer* pVtxBuffer, J3DMtxBuffer* /* 8032DC74-8032DDB8 3285B4 0144+00 1/1 0/0 0/0 .text * deformVtxPos_S16__13J3DSkinDeformCFP15J3DVertexBufferP12J3DMtxBuffer */ -// NONMATCHING - J3DPSMulMtxVec regalloc void J3DSkinDeform::deformVtxPos_S16(J3DVertexBuffer* pVtxBuffer, J3DMtxBuffer* pMtxBuffer) const { Mtx* anmMtx = NULL; Mtx* anmMtxs[2]; @@ -628,7 +626,6 @@ void J3DSkinDeform::deformVtxPos_S16(J3DVertexBuffer* pVtxBuffer, J3DMtxBuffer* /* 8032DDB8-8032DEBC 3286F8 0104+00 1/1 0/0 0/0 .text * deformVtxNrm_F32__13J3DSkinDeformCFP15J3DVertexBuffer */ -// NONMATCHING - J3DPSMulMtxVec regalloc void J3DSkinDeform::deformVtxNrm_F32(J3DVertexBuffer* pVtxBuffer) const { pVtxBuffer->swapTransformedVtxNrm(); int nrmNum = pVtxBuffer->getVertexData()->getNrmNum(); @@ -645,7 +642,6 @@ void J3DSkinDeform::deformVtxNrm_F32(J3DVertexBuffer* pVtxBuffer) const { /* 8032DEBC-8032DFDC 3287FC 0120+00 1/1 0/0 0/0 .text * deformVtxNrm_S16__13J3DSkinDeformCFP15J3DVertexBuffer */ -// NONMATCHING - J3DPSMulMtxVec regalloc void J3DSkinDeform::deformVtxNrm_S16(J3DVertexBuffer* pVtxBuffer) const { int vtxNrmFrac = pVtxBuffer->getVertexData()->getVtxNrmFrac(); J3DGQRSetup7(vtxNrmFrac, 7, vtxNrmFrac, 7);