From 28b60fc00cd13f9e4780b88cddbad4b4f2be4cca Mon Sep 17 00:00:00 2001 From: Tharo <17233964+Thar0@users.noreply.github.com> Date: Sat, 16 Aug 2025 14:07:25 +0100 Subject: [PATCH] Handwritten asm: bcmp, bcopy, bzero (#1817) * Handwritten asm: bcmp, bcopy, bzero * consistent and * Consistent add --- Makefile | 2 + spec/spec | 6 +- src/libultra/libc/bcmp.s | 89 ++++++++++++++++ src/libultra/libc/bcopy.s | 211 ++++++++++++++++++++++++++++++++++++++ src/libultra/libc/bzero.s | 59 +++++++++++ 5 files changed, 364 insertions(+), 3 deletions(-) create mode 100644 src/libultra/libc/bcmp.s create mode 100644 src/libultra/libc/bcopy.s create mode 100644 src/libultra/libc/bzero.s diff --git a/Makefile b/Makefile index 9db3585703..c569f8d878 100644 --- a/Makefile +++ b/Makefile @@ -447,6 +447,8 @@ $(BUILD_DIR)/src/libultra/libc/%.o: OPTFLAGS := -O2 $(BUILD_DIR)/src/libultra/gu/%.o: OPTFLAGS := -O2 $(BUILD_DIR)/src/libultra/rmon/%.o: OPTFLAGS := -O2 +$(BUILD_DIR)/src/libultra/libc/%.o: ASOPTFLAGS := -O2 + $(BUILD_DIR)/src/boot/libu64/%.o: OPTFLAGS := -O2 $(BUILD_DIR)/src/boot/libc/%.o: OPTFLAGS := -O2 diff --git a/spec/spec b/spec/spec index 3ad80ab38e..c477c58463 100644 --- a/spec/spec +++ b/spec/spec @@ -69,7 +69,7 @@ beginseg include "$(BUILD_DIR)/src/libultra/os/thread.o" include "$(BUILD_DIR)/src/libultra/os/destroythread.o" include "$(BUILD_DIR)/src/libultra/voice/voicecheckresult.o" - include "$(BUILD_DIR)/asm/boot/bzero.text.o" + include "$(BUILD_DIR)/src/libultra/libc/bzero.o" include "$(BUILD_DIR)/src/libultra/io/motor.o" include "$(BUILD_DIR)/src/libultra/io/siacs.o" include "$(BUILD_DIR)/src/libultra/io/controller.o" @@ -140,7 +140,7 @@ beginseg include "$(BUILD_DIR)/src/libultra/gu/position.o" include "$(BUILD_DIR)/src/libultra/io/epirawdma.o" include "$(BUILD_DIR)/src/libultra/io/sptaskyielded.o" - include "$(BUILD_DIR)/asm/boot/bcmp.text.o" + include "$(BUILD_DIR)/src/libultra/libc/bcmp.o" include "$(BUILD_DIR)/src/libultra/os/gettime.o" include "$(BUILD_DIR)/src/libultra/gu/rotate.o" include "$(BUILD_DIR)/src/libultra/os/setglobalintmask.o" @@ -157,7 +157,7 @@ beginseg include "$(BUILD_DIR)/src/libultra/os/getcompare.o" include "$(BUILD_DIR)/src/libultra/io/dpgetstat.o" include "$(BUILD_DIR)/src/libultra/io/dpsetstat.o" - include "$(BUILD_DIR)/asm/boot/bcopy.text.o" + include "$(BUILD_DIR)/src/libultra/libc/bcopy.o" include "$(BUILD_DIR)/src/libultra/os/resetglobalintmask.o" include "$(BUILD_DIR)/src/libultra/io/pfsdeletefile.o" include "$(BUILD_DIR)/src/libultra/gu/ortho.o" diff --git a/src/libultra/libc/bcmp.s b/src/libultra/libc/bcmp.s new file mode 100644 index 0000000000..dab1c2c672 --- /dev/null +++ b/src/libultra/libc/bcmp.s @@ -0,0 +1,89 @@ +#include "PR/asm.h" +#include "PR/regdef.h" + +.text + +LEAF(bcmp) + xor v0, a0, a1 + blt a2, 0x10, bytecmp + + and v0, v0, 3 + negu t8, a0 + bnez v0, unaligncmp + + and t8, t8, 3 + subu a2, a2, t8 + beqz t8, wordcmp + + move v0, v1 + lwl v0, (a0) + lwl v1, (a1) + addu a0, a0, t8 + addu a1, a1, t8 + bne v0, v1, cmpne + +wordcmp: + and a3, a2, ~3 + subu a2, a2, a3 + beqz a3, bytecmp + + addu a3, a3, a0 +1: + lw v0, (a0) + lw v1, (a1) + addu a0, a0, 4 + addu a1, a1, 4 + bne v0, v1, cmpne + bne a0, a3, 1b + + b bytecmp + +unaligncmp: + negu a3, a1 + and a3, a3, 3 + subu a2, a2, a3 + beqz a3, partaligncmp + + addu a3, a3, a0 +1: + lbu v0, (a0) + lbu v1, (a1) + addu a0, a0, 1 + addu a1, a1, 1 + bne v0, v1, cmpne + bne a0, a3, 1b + +partaligncmp: + and a3, a2, ~3 + subu a2, a2, a3 + beqz a3, bytecmp + + addu a3, a3, a0 +1: + lwl v0, (a0) + lwr v0, 3(a0) + lw v1, (a1) + addu a0, a0, 4 + addu a1, a1, 4 + bne v0, v1, cmpne + bne a0, a3, 1b + +bytecmp: + addu a3, a2, a0 + blez a2, cmpdone +1: + lbu v0, (a0) + lbu v1, (a1) + addu a0, a0, 1 + addu a1, a1, 1 + bne v0, v1, cmpne + bne a0, a3, 1b + +cmpdone: + move v0, zero + jr ra + +cmpne: + li v0, 1 + jr ra +END(bcmp) diff --git a/src/libultra/libc/bcopy.s b/src/libultra/libc/bcopy.s new file mode 100644 index 0000000000..3c0742048e --- /dev/null +++ b/src/libultra/libc/bcopy.s @@ -0,0 +1,211 @@ +#include "PR/asm.h" +#include "PR/regdef.h" + +.text + +LEAF(bcopy) + move a3, a1 + beqz a2, ret + beq a0, a1, ret + blt a1, a0, goforwards + add v0, a0, a2 + bge a1, v0, goforwards + b gobackwards + +goforwards: + blt a2, 0x10, forwards_bytecopy + and v0, a0, 3 + and v1, a1, 3 + beq v0, v1, forwalignable + +forwards_bytecopy: + beqz a2, ret + addu v1, a0, a2 +99: + lb v0, (a0) + addu a0, a0, 1 + sb v0, (a1) + addu a1, a1, 1 + bne a0, v1, 99b +ret: + move v0, a3 + jr ra + +forwalignable: + beqz v0, forwards_32 + beq v0, 1, forw_copy3 + beq v0, 2, forw_copy2 + + lb v0, (a0) + addu a0, a0, 1 + sb v0, (a1) + addu a1, a1, 1 + addu a2, a2, -1 + b forwards_32 + +forw_copy2: + lh v0, (a0) + addu a0, a0, 2 + sh v0, (a1) + addu a1, a1, 2 + addu a2, a2, -2 + b forwards_32 + +forw_copy3: + lb v0, (a0) + lh v1, 1(a0) + addu a0, a0, 3 + sb v0, (a1) + sh v1, 1(a1) + addu a1, a1, 3 + addu a2, a2, -3 + +forwards: +forwards_32: + blt a2, 32, forwards_16 + lw v0, 0(a0) + lw v1, 4(a0) + lw t0, 8(a0) + lw t1, 12(a0) + lw t2, 16(a0) + lw t3, 20(a0) + lw t4, 24(a0) + lw t5, 28(a0) + addu a0, a0, 32 + sw v0, 0(a1) + sw v1, 4(a1) + sw t0, 8(a1) + sw t1, 12(a1) + sw t2, 16(a1) + sw t3, 20(a1) + sw t4, 24(a1) + sw t5, 28(a1) + addu a1, a1, 32 + addu a2, a2, -32 + b forwards_32 + +forwards_16: + blt a2, 16, forwards_4 + lw v0, 0(a0) + lw v1, 4(a0) + lw t0, 8(a0) + lw t1, 12(a0) + addu a0, a0, 16 + sw v0, 0(a1) + sw v1, 4(a1) + sw t0, 8(a1) + sw t1, 12(a1) + addu a1, a1, 16 + addu a2, a2, -16 + b forwards_16 + +forwards_4: + blt a2, 4, forwards_bytecopy + + lw v0, 0(a0) + addu a0, a0, 4 + sw v0, 0(a1) + addu a1, a1, 4 + addu a2, a2, -4 + b forwards_4 + +gobackwards: + add a0, a0,a2 + add a1, a1,a2 + blt a2, 16, backwards_bytecopy + + and v0, a0, 0x3 + and v1, a1, 0x3 + beq v0, v1, backalignable + +backwards_bytecopy: + beqz a2, ret + addu a0, a0, -1 + addu a1, a1, -1 + subu v1, a0,a2 +99: + lb v0, 0(a0) + addu a0, a0, -1 + sb v0, 0(a1) + addu a1, a1, -1 + bne a0, v1,99b + + move v0, a3 + jr ra + +backalignable: + beqz v0, backwards + beq v0, 3, back_copy3 + beq v0, 2, back_copy2 + lb v0, -1(a0) + addu a0, a0, -1 + sb v0, -1(a1) + addu a1, a1, -1 + addu a2, a2, -1 + b backwards + +back_copy2: + lh v0, -2(a0) + addu a0, a0, -2 + sh v0, -2(a1) + addu a1, a1, -2 + addu a2, a2, -2 + b backwards + +back_copy3: + lb v0, -1(a0) + lh v1, -3(a0) + addu a0, a0, -3 + sb v0, -1(a1) + sh v1, -3(a1) + addu a1, a1, -3 + addu a2, a2, -3 + +backwards: +backwards_32: + blt a2, 32, backwards_16 + lw v0, -4(a0) + lw v1, -8(a0) + lw t0, -12(a0) + lw t1, -16(a0) + lw t2, -20(a0) + lw t3, -24(a0) + lw t4, -28(a0) + lw t5, -32(a0) + addu a0, a0, -32 + sw v0, -4(a1) + sw v1, -8(a1) + sw t0, -12(a1) + sw t1, -16(a1) + sw t2, -20(a1) + sw t3, -24(a1) + sw t4, -28(a1) + sw t5, -32(a1) + addu a1, a1, -32 + addu a2, a2, -32 + b backwards_32 + +backwards_16: + blt a2, 16, backwards_4 + lw v0, -4(a0) + lw v1, -8(a0) + lw t0, -12(a0) + lw t1, -16(a0) + addu a0, a0, -16 + sw v0, -4(a1) + sw v1, -8(a1) + sw t0, -12(a1) + sw t1, -16(a1) + addu a1, a1, -16 + addu a2, a2, -16 + b backwards_16 + +backwards_4: + blt a2, 4, backwards_bytecopy + lw v0, -4(a0) + addu a0, a0, -4 + sw v0, -4(a1) + addu a1, a1, -4 + addu a2, a2, -4 + b backwards_4 +END(bcopy) diff --git a/src/libultra/libc/bzero.s b/src/libultra/libc/bzero.s new file mode 100644 index 0000000000..84f305fca8 --- /dev/null +++ b/src/libultra/libc/bzero.s @@ -0,0 +1,59 @@ +#include "PR/asm.h" +#include "PR/regdef.h" + +.text + +LEAF(bzero) + negu v1, a0 + blt a1, 0xC, bytezero + + and v1, v1, 3 + subu a1, a1, v1 + beqz v1, blkzero + + swl zero, (a0) + addu a0, a0, v1 +blkzero: + /* align backwards to 0x20 */ + and a3, a1, ~(0x20 - 1) + /* If the result is zero, the amount to zero is less than 0x20 bytes */ + subu a1, a1, a3 + beqz a3, wordzero + /* zero in blocks of 0x20 at a time */ + addu a3, a3, a0 +1: + sw zero, 0(a0) + sw zero, 4(a0) + sw zero, 8(a0) + sw zero, 12(a0) + addu a0, a0, 0x20 + sw zero, -16(a0) + sw zero, -12(a0) + sw zero, -8(a0) + sw zero, -4(a0) + bne a0, a3, 1b + +wordzero: + /* align backwards to 0x4 */ + and a3, a1, ~3 + /* If the result is zero, the amount to zero is less than 0x4 bytes */ + subu a1, a1, a3 + beqz a3, bytezero + /* zero one word at a time */ + addu a3, a3, a0 +1: + addu a0, a0, 4 + sw zero, -4(a0) + bne a0, a3, 1b +bytezero: + /* test if nothing left to zero */ + blez a1, zerodone + /* zero one byte at a time */ + addu a1, a1, a0 +1: + addu a0, a0, 1 + sb zero, -1(a0) + bne a0, a1, 1b +zerodone: + jr ra +END(bzero)