|
From: Greg P. <gp...@us...> - 2007-03-09 10:49:01
|
vex ppc64 generates bad code for instruction sequences like this:
li r0, 2
stdx r3, r1, r0
gcc emits code like this when manipulating packed structures
with 8-byte fields on 2-byte boundaries.
First, vex's optimizer substitutes a constant 0x2 for r0:
------ IMark(0x100000F34, 4) ------
PUT(1024) = 0x100000F34:I64
t3 = GET:I64(24)
t14 = GET:I64(8)
t13 = Add64(t14,0x2:I64)
STbe(t13) = t3
Then instruction selection chooses `std` with an index not divisible by 4:
-- STbe(Add64(GET:I64(8),0x2:I64)) = GET:I64(24)
ldz %vR22,8(%r31)
ldz %vR23,24(%r31)
std %vR23,2(%vR22)
Finally, the assembler silently strips the index&3 part,
because `std` can't encode that:
std %r6,2(%r5)
F8 C5 00 00
...but 0xF8C50000 is `std r6, 0(r5)`, which writes to the wrong address.
host-ppc/hdefs.c contains the following:
if (opc1 == 58 || opc1 == 62) { // ld/std: mode64 only
vassert(mode64);
// kludge DS form: lowest 2 bits = 00
idx &= 0xFFFC;
}
That "kludge" doesn't work. Instead, Pin_Load and Pin_Store need to
use a temporary index register in this case, like Pin_AvLdSt does.
Index: VEX/priv/host-ppc/hdefs.c
===================================================================
--- VEX/priv/host-ppc/hdefs.c (revision 1737)
+++ VEX/priv/host-ppc/hdefs.c (working copy)
@@ -1300,21 +1300,43 @@
case Pin_Load: {
Bool idxd = toBool(i->Pin.Load.src->tag == Pam_RR);
UChar sz = i->Pin.Load.sz;
- UChar c_sz = sz==1 ? 'b' : sz==2 ? 'h' : sz==4 ? 'w' : 'd';
- vex_printf("l%cz%s ", c_sz, idxd ? "x" : "" );
- ppHRegPPC(i->Pin.Load.dst);
- vex_printf(",");
- ppPPCAMode(i->Pin.Load.src);
+ if (sz == 8 && !idxd && (i->Pin.Load.src->Pam.IR.index & 3) != 0) {
+ // ld can't encode idx & 3 != 0
+ // Use `li r30, idx; ldx r_dst, r30, r_base` instead.
+ ppLoadImm(hregPPC_GPR30(mode64),
+ i->Pin.Load.src->Pam.IR.index, mode64);
+ vex_printf(" ; ldx ");
+ ppHRegPPC(i->Pin.Load.dst);
+ vex_printf(",%%r30,");
+ ppHRegPPC(i->Pin.Load.src->Pam.IR.base);
+ } else {
+ UChar c_sz = sz==1 ? 'b' : sz==2 ? 'h' : sz==4 ? 'w' : 'd';
+ vex_printf("l%cz%s ", c_sz, idxd ? "x" : "" );
+ ppHRegPPC(i->Pin.Load.dst);
+ vex_printf(",");
+ ppPPCAMode(i->Pin.Load.src);
+ }
return;
}
case Pin_Store: {
UChar sz = i->Pin.Store.sz;
Bool idxd = toBool(i->Pin.Store.dst->tag == Pam_RR);
- UChar c_sz = sz==1 ? 'b' : sz==2 ? 'h' : sz==4 ? 'w' : /*8*/ 'd';
- vex_printf("st%c%s ", c_sz, idxd ? "x" : "" );
- ppHRegPPC(i->Pin.Store.src);
- vex_printf(",");
- ppPPCAMode(i->Pin.Store.dst);
+ if (sz == 8 && !idxd && (i->Pin.Store.dst->Pam.IR.index & 3) != 0) {
+ // std can't encode idx & 3 != 0
+ // Use `li r30, idx; stdx r_src, r30, r_base` instead.
+ ppLoadImm(hregPPC_GPR30(mode64),
+ i->Pin.Store.dst->Pam.IR.index, mode64);
+ vex_printf(" ; stdx ");
+ ppHRegPPC(i->Pin.Store.src);
+ vex_printf(",%%r30,");
+ ppHRegPPC(i->Pin.Store.dst->Pam.IR.base);
+ } else {
+ UChar c_sz = sz==1 ? 'b' : sz==2 ? 'h' : sz==4 ? 'w' : /*8*/ 'd';
+ vex_printf("st%c%s ", c_sz, idxd ? "x" : "" );
+ ppHRegPPC(i->Pin.Store.src);
+ vex_printf(",");
+ ppPPCAMode(i->Pin.Store.dst);
+ }
return;
}
case Pin_Set: {
@@ -1723,10 +1745,18 @@
case Pin_Load:
addRegUsage_PPCAMode(u, i->Pin.Load.src);
addHRegUse(u, HRmWrite, i->Pin.Load.dst);
+ /* ld can't encode all offsets; use R30 as temp */
+ if ( mode64 && i->Pin.Load.src->tag == Pam_IR && i->Pin.Load.sz == 8 &&
+ (i->Pin.Load.src->Pam.IR.index & 3) != 0 )
+ addHRegUse(u, HRmWrite, hregPPC_GPR30(mode64));
return;
case Pin_Store:
addHRegUse(u, HRmRead, i->Pin.Store.src);
addRegUsage_PPCAMode(u, i->Pin.Store.dst);
+ /* std can't encode all offsets; use R30 as temp */
+ if ( mode64 && i->Pin.Store.dst->tag == Pam_IR && i->Pin.Store.sz == 8 &&
+ (i->Pin.Store.dst->Pam.IR.index & 3) != 0 )
+ addHRegUse(u, HRmWrite, hregPPC_GPR30(mode64));
return;
case Pin_Set:
addHRegUse(u, HRmWrite, i->Pin.Set.dst);
@@ -2386,10 +2416,11 @@
rA = iregNo(am->Pam.IR.base, mode64);
idx = am->Pam.IR.index;
- if (opc1 == 58 || opc1 == 62) { // ld/std: mode64 only
+ if (opc1 == 58 || opc1 == 62) {
+ // ld/std can't encode idx&3 != 0.
+ // Use `li r30, idx; ldx/stdx r_sd, r30, r_base` instead.
vassert(mode64);
- // kludge DS form: lowest 2 bits = 00
- idx &= 0xFFFC;
+ vassert((idx & 3) == 0);
}
p = mkFormD(p, opc1, rSD, rA, idx);
return p;
@@ -3035,7 +3066,16 @@
case 8: opc1 = 58; vassert(mode64); break;
default: goto bad;
}
- p = doAMode_IR(p, opc1, r_dst, am_addr, mode64);
+ if (opc1 == 58 && (am_addr->Pam.IR.index & 3) != 0) {
+ // ld can't encode idx & 3 != 0.
+ // Use `li r30, idx; ldx r_dst, r30, r_base` instead.
+ UInt r_idx = 30;
+ UInt r_base = iregNo(am_addr->Pam.IR.base, mode64);
+ p = mkLoadImm(p, r_idx, am_addr->Pam.IR.index, mode64);
+ p = mkFormX(p, 31, r_dst, r_idx, r_base, 21, 0);
+ } else {
+ p = doAMode_IR(p, opc1, r_dst, am_addr, mode64);
+ }
goto done;
case Pam_RR:
switch(sz) {
@@ -3108,7 +3148,16 @@
default:
goto bad;
}
- p = doAMode_IR(p, opc1, r_src, am_addr, mode64);
+ if (opc1 == 62 && (am_addr->Pam.IR.index & 3) != 0) {
+ // std can't encode idx & 3 != 0.
+ // Use `li r30, idx; stdx r_src, r30, r_base` instead.
+ UInt r_idx = 30;
+ UInt r_base = iregNo(am_addr->Pam.IR.base, mode64);
+ p = mkLoadImm(p, r_idx, am_addr->Pam.IR.index, mode64);
+ p = mkFormX(p, 31, r_src, r_idx, r_base, 149, 0);
+ } else {
+ p = doAMode_IR(p, opc1, r_src, am_addr, mode64);
+ }
goto done;
case Pam_RR:
switch(sz) {
@@ -3352,7 +3401,7 @@
r_idx = iregNo(i->Pin.AvLdSt.addr->Pam.RR.index, mode64);
}
- if (i->Pin.FpLdSt.isLoad) { // Load from memory (1,2,4,16)
+ if (i->Pin.AvLdSt.isLoad) { // Load from memory (1,2,4,16)
opc2 = (sz==1) ? 7 : (sz==2) ? 39 : (sz==4) ? 71 : 103;
p = mkFormX(p, 31, v_reg, r_idx, r_base, opc2, 0);
} else { // Store to memory (1,2,4,16)
--
Greg Parker gp...@us...
|
|
From: Julian S. <js...@ac...> - 2007-03-09 11:30:08
|
Greg Wow, a real live code generation bug! Cool! Thanks for chasing that down. Now I think about this, I believe Paul Mackerras pointed out some problem along these lines a while back, but it fell through the cracks. Oops. > gcc emits code like this when manipulating packed structures > with 8-byte fields on 2-byte boundaries. I'd like to make a small C test case that shows gcc doing this, but am not sure from your description what's needed. Can you show a fragment of C with a suitable type definition? J |
|
From: Julian S. <js...@ac...> - 2007-03-09 15:46:00
|
> > gcc emits code like this when manipulating packed structures
> > with 8-byte fields on 2-byte boundaries.
>
> I'd like to make a small C test case that shows gcc doing this,
> but am not sure from your description what's needed. Can you
> show a fragment of C with a suitable type definition?
Well, I can't get gcc-4.1.0 to knowingly emit a misaligned ldx/stdx
on a ppc970 running SuSE 10.1 - it produces a verbose long sequence of
shifts and byte stores instead. But with an inline asm I can reproduce
it easily enough.
J
$ file ./badness
./badness: ELF 64-bit MSB executable, cisco 7500, version 1 (SYSV), for
GNU/Linux 2.6.4, dynamically linked (uses shared libs), for GNU/Linux 2.6.4,
not stripped
$ ./badness
....1122334455667788............
$ vTRUNK --tool=none ./badness
[...]
1122334455667788................
$ cat badness.c
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
typedef
struct __attribute__ ((__packed__)) {
char before[2];
unsigned long long int w64;
char after[6];
}
T;
void foo (T* t, unsigned long long int w)
{
__asm__ __volatile__(
"stdx %0,%1,%2"
: : "b"(w), "b"(t), "b"(2) : "memory"
);
}
int main ( void )
{
T* t;
unsigned char* p;
int i;
assert(sizeof(T) == 16);
t = calloc(sizeof(T),1);
assert(t);
foo(t, 0x1122334455667788);
p = (unsigned char*)t;
for (i = 0; i < 16; i++)
if (p[i] == 0)
printf("..");
else
printf("%02x", (int)p[i]);
printf("\n");
return 0;
}
|
|
From: Greg P. <gp...@us...> - 2007-03-09 18:18:47
|
Julian Seward writes:
> > gcc emits code like this when manipulating packed structures
> > with 8-byte fields on 2-byte boundaries.
>
> I'd like to make a small C test case that shows gcc doing this,
> but am not sure from your description what's needed. Can you
> show a fragment of C with a suitable type definition?
gcc's optimizer is weird. In the test below, the assignment in
main() uses `li ; stdx`, but the assignment in fn() does not.
An asm test is probably more future-proof. I'm using gcc-4.0.1.
#include <stdint.h>
struct s {
char pad[158];
uintptr_t p;
} __attribute__((packed));
__attribute__((noinline))
void fn(struct s *s) {
s->p = 88;
}
int main() {
struct s s;
s.p = 99;
fn(&s);
return 0;
}
% cc -arch ppc64 test.c -o - -S -O3
.section __TEXT,__text,regular,pure_instructions
.section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32
.machine ppc64
.text
.align 2
.p2align 4,,15
.globl _fn
_fn:
li r0,0
li r2,88
stb r0,164(r3)
stb r2,165(r3)
stb r0,158(r3)
stb r0,159(r3)
stb r0,160(r3)
stb r0,161(r3)
stb r0,162(r3)
stb r0,163(r3)
blr
.align 2
.p2align 4,,15
.globl _main
_main:
mflr r0
li r2,99
std r0,16(r1)
stdu r1,-304(r1)
li r0,270
addi r3,r1,112
stdx r2,r1,r0
bl _fn
addi r1,r1,304
li r3,0
ld r0,16(r1)
mtlr r0
blr
.subsections_via_symbols
|
|
From: Julian S. <js...@ac...> - 2007-03-09 18:19:51
|
Fixed (r1739). Thanks for tracking it down. > That "kludge" doesn't work. Instead, Pin_Load and Pin_Store need to > use a temporary index register in this case, like Pin_AvLdSt does. Rather than fix the assembler to work around not-directly representable address modes, I fixed this in the previous stage (instruction selection) by stopping iselWordExpr_AMode constructing them in the first place. For good measure I also checked the handling of load/store amodes in the ppc64->IR conversion (front end) and that all looks correct to me. J |