Merge remote-tracking branch 'rth/tcg-arm-pull' into staging

# By Richard Henderson
# Via Richard Henderson
* rth/tcg-arm-pull:
  tcg-arm: Move the tlb addend load earlier
  tcg-arm: Remove restriction on qemu_ld output register
  tcg-arm: Return register containing tlb addend
  tcg-arm: Move load of tlb addend into tcg_out_tlb_read
  tcg-arm: Use QEMU_BUILD_BUG_ON to verify constraints on tlb
  tcg-arm: Use strd for tcg_out_arg_reg64
  tcg-arm: Rearrange slow-path qemu_ld/st
  tcg-arm: Use ldrd/strd for appropriate qemu_ld/st64

Message-id: 1380663109-14434-1-git-send-email-rth@twiddle.net
Signed-off-by: Anthony Liguori <anthony@codemonkey.ws>
This commit is contained in:
Anthony Liguori 2013-10-09 07:52:57 -07:00
commit 576e81be39
2 changed files with 215 additions and 184 deletions

View File

@ -324,21 +324,7 @@ extern uintptr_t tci_tb_ptr;
In some implementations, we pass the "logical" return address manually; In some implementations, we pass the "logical" return address manually;
in others, we must infer the logical return from the true return. */ in others, we must infer the logical return from the true return. */
#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU) #if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
# if defined(__arm__) # if defined(__aarch64__)
/* We define two insns between the return address and the branch back to
straight-line. Find and decode that branch insn. */
# define GETRA_LDST(RA) tcg_getra_ldst(RA)
static inline uintptr_t tcg_getra_ldst(uintptr_t ra)
{
int32_t b;
ra += 8; /* skip the two insns */
b = *(int32_t *)ra; /* load the branch insn */
b = (b << 8) >> (8 - 2); /* extract the displacement */
ra += 8; /* branches are relative to pc+8 */
ra += b; /* apply the displacement */
return ra;
}
# elif defined(__aarch64__)
# define GETRA_LDST(RA) tcg_getra_ldst(RA) # define GETRA_LDST(RA) tcg_getra_ldst(RA)
static inline uintptr_t tcg_getra_ldst(uintptr_t ra) static inline uintptr_t tcg_getra_ldst(uintptr_t ra)
{ {

View File

@ -175,20 +175,12 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
ct->ct |= TCG_CT_REG; ct->ct |= TCG_CT_REG;
tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1); tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
#ifdef CONFIG_SOFTMMU #ifdef CONFIG_SOFTMMU
/* r0-r2 will be overwritten when reading the tlb entry, /* r0-r2,lr will be overwritten when reading the tlb entry,
so don't use these. */ so don't use these. */
tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0); tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1); tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2); tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
#endif tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
break;
case 'L':
ct->ct |= TCG_CT_REG;
tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
#ifdef CONFIG_SOFTMMU
/* r1 is still needed to load data_reg or data_reg2,
so don't use it. */
tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
#endif #endif
break; break;
@ -207,6 +199,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
/* Avoid clashes with registers being used for helper args */ /* Avoid clashes with registers being used for helper args */
tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3); tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
#endif #endif
tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
#endif #endif
break; break;
@ -320,6 +313,9 @@ typedef enum {
INSN_STRB_REG = 0x06400000, INSN_STRB_REG = 0x06400000,
INSN_LDRD_IMM = 0x004000d0, INSN_LDRD_IMM = 0x004000d0,
INSN_LDRD_REG = 0x000000d0,
INSN_STRD_IMM = 0x004000f0,
INSN_STRD_REG = 0x000000f0,
} ARMInsn; } ARMInsn;
#define SHIFT_IMM_LSL(im) (((im) << 7) | 0x00) #define SHIFT_IMM_LSL(im) (((im) << 7) | 0x00)
@ -379,13 +375,17 @@ static inline void tcg_out_b_noaddr(TCGContext *s, int cond)
/* We pay attention here to not modify the branch target by skipping /* We pay attention here to not modify the branch target by skipping
the corresponding bytes. This ensures that caches and memory are the corresponding bytes. This ensures that caches and memory are
kept coherent during retranslation. */ kept coherent during retranslation. */
#ifdef HOST_WORDS_BIGENDIAN
tcg_out8(s, (cond << 4) | 0x0a);
s->code_ptr += 3;
#else
s->code_ptr += 3; s->code_ptr += 3;
tcg_out8(s, (cond << 4) | 0x0a); tcg_out8(s, (cond << 4) | 0x0a);
#endif }
static inline void tcg_out_bl_noaddr(TCGContext *s, int cond)
{
/* We pay attention here to not modify the branch target by skipping
the corresponding bytes. This ensures that caches and memory are
kept coherent during retranslation. */
s->code_ptr += 3;
tcg_out8(s, (cond << 4) | 0x0b);
} }
static inline void tcg_out_bl(TCGContext *s, int cond, int32_t offset) static inline void tcg_out_bl(TCGContext *s, int cond, int32_t offset)
@ -810,6 +810,30 @@ static inline void tcg_out_st32_r(TCGContext *s, int cond, TCGReg rt,
tcg_out_memop_r(s, cond, INSN_STR_REG, rt, rn, rm, 1, 1, 0); tcg_out_memop_r(s, cond, INSN_STR_REG, rt, rn, rm, 1, 1, 0);
} }
static inline void tcg_out_ldrd_8(TCGContext *s, int cond, TCGReg rt,
TCGReg rn, int imm8)
{
tcg_out_memop_8(s, cond, INSN_LDRD_IMM, rt, rn, imm8, 1, 0);
}
static inline void tcg_out_ldrd_r(TCGContext *s, int cond, TCGReg rt,
TCGReg rn, TCGReg rm)
{
tcg_out_memop_r(s, cond, INSN_LDRD_REG, rt, rn, rm, 1, 1, 0);
}
static inline void tcg_out_strd_8(TCGContext *s, int cond, TCGReg rt,
TCGReg rn, int imm8)
{
tcg_out_memop_8(s, cond, INSN_STRD_IMM, rt, rn, imm8, 1, 0);
}
static inline void tcg_out_strd_r(TCGContext *s, int cond, TCGReg rt,
TCGReg rn, TCGReg rm)
{
tcg_out_memop_r(s, cond, INSN_STRD_REG, rt, rn, rm, 1, 1, 0);
}
/* Register pre-increment with base writeback. */ /* Register pre-increment with base writeback. */
static inline void tcg_out_ld32_rwb(TCGContext *s, int cond, TCGReg rt, static inline void tcg_out_ld32_rwb(TCGContext *s, int cond, TCGReg rt,
TCGReg rn, TCGReg rm) TCGReg rn, TCGReg rm)
@ -975,34 +999,27 @@ static inline void tcg_out_st8(TCGContext *s, int cond,
tcg_out_st8_12(s, cond, rd, rn, offset); tcg_out_st8_12(s, cond, rd, rn, offset);
} }
/* The _goto case is normally between TBs within the same code buffer, /* The _goto case is normally between TBs within the same code buffer, and
* and with the code buffer limited to 16MB we shouldn't need the long * with the code buffer limited to 16MB we wouldn't need the long case.
* case. * But we also use it for the tail-call to the qemu_ld/st helpers, which does.
*
* .... except to the prologue that is in its own buffer.
*/ */
static inline void tcg_out_goto(TCGContext *s, int cond, uint32_t addr) static inline void tcg_out_goto(TCGContext *s, int cond, uint32_t addr)
{ {
int32_t val; int32_t disp = addr - (tcg_target_long) s->code_ptr;
if ((addr & 1) == 0 && disp - 8 < 0x01fffffd && disp - 8 > -0x01fffffd) {
tcg_out_b(s, cond, disp);
return;
}
tcg_out_movi32(s, cond, TCG_REG_TMP, addr);
if (use_armv5t_instructions) {
tcg_out_bx(s, cond, TCG_REG_TMP);
} else {
if (addr & 1) { if (addr & 1) {
/* goto to a Thumb destination isn't supported */
tcg_abort(); tcg_abort();
} }
tcg_out_mov_reg(s, cond, TCG_REG_PC, TCG_REG_TMP);
val = addr - (tcg_target_long) s->code_ptr;
if (val - 8 < 0x01fffffd && val - 8 > -0x01fffffd)
tcg_out_b(s, cond, val);
else {
if (cond == COND_AL) {
tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4);
tcg_out32(s, addr);
} else {
tcg_out_movi32(s, cond, TCG_REG_TMP, val - 8);
tcg_out_dat_reg(s, cond, ARITH_ADD,
TCG_REG_PC, TCG_REG_PC,
TCG_REG_TMP, SHIFT_IMM_LSL(0));
}
} }
} }
@ -1057,23 +1074,29 @@ static inline void tcg_out_goto_label(TCGContext *s, int cond, int label_index)
} }
#ifdef CONFIG_SOFTMMU #ifdef CONFIG_SOFTMMU
/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
* int mmu_idx, uintptr_t ra)
*/
static const void * const qemu_ld_helpers[8] = {
helper_ret_ldub_mmu,
helper_ret_lduw_mmu,
helper_ret_ldul_mmu,
helper_ret_ldq_mmu,
/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr, helper_ret_ldsb_mmu,
int mmu_idx) */ helper_ret_ldsw_mmu,
static const void * const qemu_ld_helpers[4] = { helper_ret_ldul_mmu,
helper_ldb_mmu, helper_ret_ldq_mmu,
helper_ldw_mmu,
helper_ldl_mmu,
helper_ldq_mmu,
}; };
/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr, /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
uintxx_t val, int mmu_idx) */ * uintxx_t val, int mmu_idx, uintptr_t ra)
*/
static const void * const qemu_st_helpers[4] = { static const void * const qemu_st_helpers[4] = {
helper_stb_mmu, helper_ret_stb_mmu,
helper_stw_mmu, helper_ret_stw_mmu,
helper_stl_mmu, helper_ret_stl_mmu,
helper_stq_mmu, helper_ret_stq_mmu,
}; };
/* Helper routines for marshalling helper function arguments into /* Helper routines for marshalling helper function arguments into
@ -1117,53 +1140,62 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
if (argreg & 1) { if (argreg & 1) {
argreg++; argreg++;
} }
if (use_armv6_instructions && argreg >= 4
&& (arglo & 1) == 0 && arghi == arglo + 1) {
tcg_out_strd_8(s, COND_AL, arglo,
TCG_REG_CALL_STACK, (argreg - 4) * 4);
return argreg + 2;
} else {
argreg = tcg_out_arg_reg32(s, argreg, arglo); argreg = tcg_out_arg_reg32(s, argreg, arglo);
argreg = tcg_out_arg_reg32(s, argreg, arghi); argreg = tcg_out_arg_reg32(s, argreg, arghi);
return argreg; return argreg;
}
} }
#define TLB_SHIFT (CPU_TLB_ENTRY_BITS + CPU_TLB_BITS) #define TLB_SHIFT (CPU_TLB_ENTRY_BITS + CPU_TLB_BITS)
/* Load and compare a TLB entry, leaving the flags set. Leaves R2 pointing /* We're expecting to use an 8-bit immediate and to mask. */
to the tlb entry. Clobbers R1 and TMP. */ QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8);
static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, /* We're expecting to use an 8-bit immediate add + 8-bit ldrd offset.
int s_bits, int tlb_offset) Using the offset of the second entry in the last tlb table ensures
that we can index all of the elements of the first entry. */
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1])
> 0xffff);
/* Load and compare a TLB entry, leaving the flags set. Returns the register
containing the addend of the tlb entry. Clobbers R0, R1, R2, TMP. */
static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
int s_bits, int mem_index, bool is_load)
{ {
TCGReg base = TCG_AREG0; TCGReg base = TCG_AREG0;
int cmp_off =
(is_load
? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
: offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
/* Should generate something like the following: /* Should generate something like the following:
* pre-v7:
* shr tmp, addr_reg, #TARGET_PAGE_BITS (1) * shr tmp, addr_reg, #TARGET_PAGE_BITS (1)
* add r2, env, #off & 0xff00 * add r2, env, #high
* and r0, tmp, #(CPU_TLB_SIZE - 1) (2) * and r0, tmp, #(CPU_TLB_SIZE - 1) (2)
* add r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS (3) * add r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS (3)
* ldr r0, [r2, #off & 0xff]! (4) * ldr r0, [r2, #cmp] (4)
* tst addr_reg, #s_mask * tst addr_reg, #s_mask
* cmpeq r0, tmp, lsl #TARGET_PAGE_BITS (5) * ldr r1, [r2, #add] (5)
* * cmpeq r0, tmp, lsl #TARGET_PAGE_BITS
* v7 (not implemented yet):
* ubfx r2, addr_reg, #TARGET_PAGE_BITS, #CPU_TLB_BITS (1)
* movw tmp, #~TARGET_PAGE_MASK & ~s_mask
* movw r0, #off
* add r2, env, r2, lsl #CPU_TLB_ENTRY_BITS (2)
* bic tmp, addr_reg, tmp
* ldr r0, [r2, r0]! (3)
* cmp r0, tmp (4)
*/ */
# if CPU_TLB_BITS > 8
# error
# endif
tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS)); 0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
/* We assume that the offset is contained within 16 bits. */ /* We checked that the offset is contained within 16 bits above. */
assert((tlb_offset & ~0xffff) == 0); if (add_off > 0xfff || (use_armv6_instructions && cmp_off > 0xff)) {
if (tlb_offset > 0xff) {
tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base, tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
(24 << 7) | (tlb_offset >> 8)); (24 << 7) | (cmp_off >> 8));
tlb_offset &= 0xff;
base = TCG_REG_R2; base = TCG_REG_R2;
add_off -= cmp_off & 0xff00;
cmp_off &= 0xff;
} }
tcg_out_dat_imm(s, COND_AL, ARITH_AND, tcg_out_dat_imm(s, COND_AL, ARITH_AND,
@ -1175,14 +1207,11 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
but due to how the pointer needs setting up, ldm isn't useful. but due to how the pointer needs setting up, ldm isn't useful.
Base arm5 doesn't have ldrd, but armv5te does. */ Base arm5 doesn't have ldrd, but armv5te does. */
if (use_armv6_instructions && TARGET_LONG_BITS == 64) { if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
tcg_out_memop_8(s, COND_AL, INSN_LDRD_IMM, TCG_REG_R0, tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
TCG_REG_R2, tlb_offset, 1, 1);
} else { } else {
tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R0, tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
TCG_REG_R2, tlb_offset, 1, 1);
if (TARGET_LONG_BITS == 64) { if (TARGET_LONG_BITS == 64) {
tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R1, tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, cmp_off + 4);
TCG_REG_R2, 4, 1, 0);
} }
} }
@ -1192,6 +1221,9 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
0, addrlo, (1 << s_bits) - 1); 0, addrlo, (1 << s_bits) - 1);
} }
/* Load the tlb addend. */
tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2, add_off);
tcg_out_dat_reg(s, (s_bits ? COND_EQ : COND_AL), ARITH_CMP, 0, tcg_out_dat_reg(s, (s_bits ? COND_EQ : COND_AL), ARITH_CMP, 0,
TCG_REG_R0, TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS)); TCG_REG_R0, TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
@ -1199,6 +1231,8 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0,
TCG_REG_R1, addrhi, SHIFT_IMM_LSL(0)); TCG_REG_R1, addrhi, SHIFT_IMM_LSL(0));
} }
return TCG_REG_R2;
} }
/* Record the context of a call to the out of line helper code for the slow /* Record the context of a call to the out of line helper code for the slow
@ -1232,7 +1266,8 @@ static void add_qemu_ldst_label(TCGContext *s, int is_ld, int opc,
static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
{ {
TCGReg argreg, data_reg, data_reg2; TCGReg argreg, data_reg, data_reg2;
uint8_t *start; int opc = lb->opc;
uintptr_t func;
reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr); reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
@ -1243,46 +1278,46 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg); argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg);
} }
argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index); argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
tcg_out_call(s, (tcg_target_long) qemu_ld_helpers[lb->opc & 3]); argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
/* For armv6 we can use the canonical unsigned helpers and minimize
icache usage. For pre-armv6, use the signed helpers since we do
not have a single insn sign-extend. */
if (use_armv6_instructions) {
func = (uintptr_t)qemu_ld_helpers[opc & 3];
} else {
func = (uintptr_t)qemu_ld_helpers[opc];
if (opc & 4) {
opc = 2;
}
}
tcg_out_call(s, func);
data_reg = lb->datalo_reg; data_reg = lb->datalo_reg;
data_reg2 = lb->datahi_reg; data_reg2 = lb->datahi_reg;
switch (opc) {
start = s->code_ptr;
switch (lb->opc) {
case 0 | 4: case 0 | 4:
tcg_out_ext8s(s, COND_AL, data_reg, TCG_REG_R0); tcg_out_ext8s(s, COND_AL, data_reg, TCG_REG_R0);
break; break;
case 1 | 4: case 1 | 4:
tcg_out_ext16s(s, COND_AL, data_reg, TCG_REG_R0); tcg_out_ext16s(s, COND_AL, data_reg, TCG_REG_R0);
break; break;
case 0:
case 1:
case 2:
default: default:
tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0); tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
break; break;
case 3: case 3:
if (data_reg != TCG_REG_R1) {
tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0); tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1); tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1);
break; } else if (data_reg2 != TCG_REG_R0) {
tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1);
tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
} else {
tcg_out_mov_reg(s, COND_AL, TCG_REG_TMP, TCG_REG_R0);
tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1);
tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_TMP);
} }
/* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between
the call and the branch back to straight-line code. Note that the
moves above could be elided by register allocation, nor do we know
which code alternative we chose for extension. */
switch (s->code_ptr - start) {
case 0:
tcg_out_nop(s);
/* FALLTHRU */
case 4:
tcg_out_nop(s);
/* FALLTHRU */
case 8:
break; break;
default:
abort();
} }
tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr); tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr);
@ -1320,13 +1355,10 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
} }
argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index); argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
tcg_out_call(s, (tcg_target_long) qemu_st_helpers[lb->opc & 3]); argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
/* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between /* Tail-call to the helper, which will return to the fast path. */
the call and the branch back to straight-line code. */ tcg_out_goto(s, COND_AL, (tcg_target_long) qemu_st_helpers[lb->opc & 3]);
tcg_out_nop(s);
tcg_out_nop(s);
tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr);
} }
#endif /* SOFTMMU */ #endif /* SOFTMMU */
@ -1336,7 +1368,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
bool bswap; bool bswap;
#ifdef CONFIG_SOFTMMU #ifdef CONFIG_SOFTMMU
int mem_index, s_bits; int mem_index, s_bits;
TCGReg addr_reg2; TCGReg addr_reg2, addend;
uint8_t *label_ptr; uint8_t *label_ptr;
#endif #endif
#ifdef TARGET_WORDS_BIGENDIAN #ifdef TARGET_WORDS_BIGENDIAN
@ -1353,53 +1385,63 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
mem_index = *args; mem_index = *args;
s_bits = opc & 3; s_bits = opc & 3;
tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, addend = tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 1);
offsetof(CPUArchState, tlb_table[mem_index][0].addr_read));
/* This is a conditional BL only to load a pointer within this opcode into LR
for the slow path. We will not be using the value for a tail call. */
label_ptr = s->code_ptr; label_ptr = s->code_ptr;
tcg_out_b_noaddr(s, COND_NE); tcg_out_bl_noaddr(s, COND_NE);
tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
offsetof(CPUTLBEntry, addend)
- offsetof(CPUTLBEntry, addr_read));
switch (opc) { switch (opc) {
case 0: case 0:
tcg_out_ld8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); tcg_out_ld8_r(s, COND_AL, data_reg, addr_reg, addend);
break; break;
case 0 | 4: case 0 | 4:
tcg_out_ld8s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); tcg_out_ld8s_r(s, COND_AL, data_reg, addr_reg, addend);
break; break;
case 1: case 1:
tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, addend);
if (bswap) { if (bswap) {
tcg_out_bswap16(s, COND_AL, data_reg, data_reg); tcg_out_bswap16(s, COND_AL, data_reg, data_reg);
} }
break; break;
case 1 | 4: case 1 | 4:
if (bswap) { if (bswap) {
tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, addend);
tcg_out_bswap16s(s, COND_AL, data_reg, data_reg); tcg_out_bswap16s(s, COND_AL, data_reg, data_reg);
} else { } else {
tcg_out_ld16s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); tcg_out_ld16s_r(s, COND_AL, data_reg, addr_reg, addend);
} }
break; break;
case 2: case 2:
default: default:
tcg_out_ld32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); tcg_out_ld32_r(s, COND_AL, data_reg, addr_reg, addend);
if (bswap) { if (bswap) {
tcg_out_bswap32(s, COND_AL, data_reg, data_reg); tcg_out_bswap32(s, COND_AL, data_reg, data_reg);
} }
break; break;
case 3: case 3:
if (bswap) { {
tcg_out_ld32_rwb(s, COND_AL, data_reg2, TCG_REG_R1, addr_reg); /* Be careful not to modify data_reg and data_reg2
tcg_out_ld32_12(s, COND_AL, data_reg, TCG_REG_R1, 4); for the slow path below. */
tcg_out_bswap32(s, COND_AL, data_reg2, data_reg2); TCGReg dl = (bswap ? data_reg2 : data_reg);
tcg_out_bswap32(s, COND_AL, data_reg, data_reg); TCGReg dh = (bswap ? data_reg : data_reg2);
if (use_armv6_instructions && (dl & 1) == 0 && dh == dl + 1) {
tcg_out_ldrd_r(s, COND_AL, dl, addr_reg, addend);
} else if (dl != addend) {
tcg_out_ld32_rwb(s, COND_AL, dl, addend, addr_reg);
tcg_out_ld32_12(s, COND_AL, dh, addend, 4);
} else { } else {
tcg_out_ld32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg); tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_TMP,
tcg_out_ld32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4); addend, addr_reg, SHIFT_IMM_LSL(0));
tcg_out_ld32_12(s, COND_AL, dl, TCG_REG_TMP, 0);
tcg_out_ld32_12(s, COND_AL, dh, TCG_REG_TMP, 4);
}
if (bswap) {
tcg_out_bswap32(s, COND_AL, dh, dh);
tcg_out_bswap32(s, COND_AL, dl, dl);
}
} }
break; break;
} }
@ -1450,9 +1492,13 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
} }
break; break;
case 3: case 3:
/* TODO: use block load - if (use_armv6_instructions && !bswap
* check that data_reg2 > data_reg or the other way */ && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
if (data_reg == addr_reg) { tcg_out_ldrd_8(s, COND_AL, data_reg, addr_reg, 0);
} else if (use_armv6_instructions && bswap
&& (data_reg2 & 1) == 0 && data_reg == data_reg2 + 1) {
tcg_out_ldrd_8(s, COND_AL, data_reg2, addr_reg, 0);
} else if (data_reg == addr_reg) {
tcg_out_ld32_12(s, COND_AL, data_reg2, addr_reg, bswap ? 0 : 4); tcg_out_ld32_12(s, COND_AL, data_reg2, addr_reg, bswap ? 0 : 4);
tcg_out_ld32_12(s, COND_AL, data_reg, addr_reg, bswap ? 4 : 0); tcg_out_ld32_12(s, COND_AL, data_reg, addr_reg, bswap ? 4 : 0);
} else { } else {
@ -1474,7 +1520,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
bool bswap; bool bswap;
#ifdef CONFIG_SOFTMMU #ifdef CONFIG_SOFTMMU
int mem_index, s_bits; int mem_index, s_bits;
TCGReg addr_reg2; TCGReg addr_reg2, addend;
uint8_t *label_ptr; uint8_t *label_ptr;
#endif #endif
#ifdef TARGET_WORDS_BIGENDIAN #ifdef TARGET_WORDS_BIGENDIAN
@ -1491,51 +1537,49 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
mem_index = *args; mem_index = *args;
s_bits = opc & 3; s_bits = opc & 3;
tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, addend = tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 0);
offsetof(CPUArchState,
tlb_table[mem_index][0].addr_write));
label_ptr = s->code_ptr;
tcg_out_b_noaddr(s, COND_NE);
tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
offsetof(CPUTLBEntry, addend)
- offsetof(CPUTLBEntry, addr_write));
switch (opc) { switch (opc) {
case 0: case 0:
tcg_out_st8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, addend);
break; break;
case 1: case 1:
if (bswap) { if (bswap) {
tcg_out_bswap16st(s, COND_AL, TCG_REG_R0, data_reg); tcg_out_bswap16st(s, COND_EQ, TCG_REG_R0, data_reg);
tcg_out_st16_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1); tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, addend);
} else { } else {
tcg_out_st16_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, addend);
} }
break; break;
case 2: case 2:
default: default:
if (bswap) { if (bswap) {
tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg); tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
tcg_out_st32_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1); tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, addend);
} else { } else {
tcg_out_st32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, addend);
} }
break; break;
case 3: case 3:
if (bswap) { if (bswap) {
tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg2); tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg2);
tcg_out_st32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R1, addr_reg); tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, addend, addr_reg);
tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg); tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
tcg_out_st32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R1, 4); tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, addend, 4);
} else if (use_armv6_instructions
&& (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
tcg_out_strd_r(s, COND_EQ, data_reg, addr_reg, addend);
} else { } else {
tcg_out_st32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg); tcg_out_st32_rwb(s, COND_EQ, data_reg, addend, addr_reg);
tcg_out_st32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4); tcg_out_st32_12(s, COND_EQ, data_reg2, addend, 4);
} }
break; break;
} }
/* The conditional call must come last, as we're going to return here. */
label_ptr = s->code_ptr;
tcg_out_bl_noaddr(s, COND_NE);
add_qemu_ldst_label(s, 0, opc, data_reg, data_reg2, addr_reg, addr_reg2, add_qemu_ldst_label(s, 0, opc, data_reg, data_reg2, addr_reg, addr_reg2,
mem_index, s->code_ptr, label_ptr); mem_index, s->code_ptr, label_ptr);
#else /* !CONFIG_SOFTMMU */ #else /* !CONFIG_SOFTMMU */
@ -1576,13 +1620,14 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
} }
break; break;
case 3: case 3:
/* TODO: use block store -
* check that data_reg2 > data_reg or the other way */
if (bswap) { if (bswap) {
tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg2); tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg2);
tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addr_reg, 0); tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addr_reg, 0);
tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg); tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg);
tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addr_reg, 4); tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addr_reg, 4);
} else if (use_armv6_instructions
&& (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
tcg_out_strd_8(s, COND_AL, data_reg, addr_reg, 0);
} else { } else {
tcg_out_st32_12(s, COND_AL, data_reg, addr_reg, 0); tcg_out_st32_12(s, COND_AL, data_reg, addr_reg, 0);
tcg_out_st32_12(s, COND_AL, data_reg2, addr_reg, 4); tcg_out_st32_12(s, COND_AL, data_reg2, addr_reg, 4);
@ -1991,7 +2036,7 @@ static const TCGTargetOpDef arm_op_defs[] = {
{ INDEX_op_qemu_ld16u, { "r", "l" } }, { INDEX_op_qemu_ld16u, { "r", "l" } },
{ INDEX_op_qemu_ld16s, { "r", "l" } }, { INDEX_op_qemu_ld16s, { "r", "l" } },
{ INDEX_op_qemu_ld32, { "r", "l" } }, { INDEX_op_qemu_ld32, { "r", "l" } },
{ INDEX_op_qemu_ld64, { "L", "L", "l" } }, { INDEX_op_qemu_ld64, { "r", "r", "l" } },
{ INDEX_op_qemu_st8, { "s", "s" } }, { INDEX_op_qemu_st8, { "s", "s" } },
{ INDEX_op_qemu_st16, { "s", "s" } }, { INDEX_op_qemu_st16, { "s", "s" } },
@ -2003,7 +2048,7 @@ static const TCGTargetOpDef arm_op_defs[] = {
{ INDEX_op_qemu_ld16u, { "r", "l", "l" } }, { INDEX_op_qemu_ld16u, { "r", "l", "l" } },
{ INDEX_op_qemu_ld16s, { "r", "l", "l" } }, { INDEX_op_qemu_ld16s, { "r", "l", "l" } },
{ INDEX_op_qemu_ld32, { "r", "l", "l" } }, { INDEX_op_qemu_ld32, { "r", "l", "l" } },
{ INDEX_op_qemu_ld64, { "L", "L", "l", "l" } }, { INDEX_op_qemu_ld64, { "r", "r", "l", "l" } },
{ INDEX_op_qemu_st8, { "s", "s", "s" } }, { INDEX_op_qemu_st8, { "s", "s", "s" } },
{ INDEX_op_qemu_st16, { "s", "s", "s" } }, { INDEX_op_qemu_st16, { "s", "s", "s" } },