You can subscribe to this list here.
| 2002 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
(1) |
Oct
(122) |
Nov
(152) |
Dec
(69) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2003 |
Jan
(6) |
Feb
(25) |
Mar
(73) |
Apr
(82) |
May
(24) |
Jun
(25) |
Jul
(10) |
Aug
(11) |
Sep
(10) |
Oct
(54) |
Nov
(203) |
Dec
(182) |
| 2004 |
Jan
(307) |
Feb
(305) |
Mar
(430) |
Apr
(312) |
May
(187) |
Jun
(342) |
Jul
(487) |
Aug
(637) |
Sep
(336) |
Oct
(373) |
Nov
(441) |
Dec
(210) |
| 2005 |
Jan
(385) |
Feb
(480) |
Mar
(636) |
Apr
(544) |
May
(679) |
Jun
(625) |
Jul
(810) |
Aug
(838) |
Sep
(634) |
Oct
(521) |
Nov
(965) |
Dec
(543) |
| 2006 |
Jan
(494) |
Feb
(431) |
Mar
(546) |
Apr
(411) |
May
(406) |
Jun
(322) |
Jul
(256) |
Aug
(401) |
Sep
(345) |
Oct
(542) |
Nov
(308) |
Dec
(481) |
| 2007 |
Jan
(427) |
Feb
(326) |
Mar
(367) |
Apr
(255) |
May
(244) |
Jun
(204) |
Jul
(223) |
Aug
(231) |
Sep
(354) |
Oct
(374) |
Nov
(497) |
Dec
(362) |
| 2008 |
Jan
(322) |
Feb
(482) |
Mar
(658) |
Apr
(422) |
May
(476) |
Jun
(396) |
Jul
(455) |
Aug
(267) |
Sep
(280) |
Oct
(253) |
Nov
(232) |
Dec
(304) |
| 2009 |
Jan
(486) |
Feb
(470) |
Mar
(458) |
Apr
(423) |
May
(696) |
Jun
(461) |
Jul
(551) |
Aug
(575) |
Sep
(134) |
Oct
(110) |
Nov
(157) |
Dec
(102) |
| 2010 |
Jan
(226) |
Feb
(86) |
Mar
(147) |
Apr
(117) |
May
(107) |
Jun
(203) |
Jul
(193) |
Aug
(238) |
Sep
(300) |
Oct
(246) |
Nov
(23) |
Dec
(75) |
| 2011 |
Jan
(133) |
Feb
(195) |
Mar
(315) |
Apr
(200) |
May
(267) |
Jun
(293) |
Jul
(353) |
Aug
(237) |
Sep
(278) |
Oct
(611) |
Nov
(274) |
Dec
(260) |
| 2012 |
Jan
(303) |
Feb
(391) |
Mar
(417) |
Apr
(441) |
May
(488) |
Jun
(655) |
Jul
(590) |
Aug
(610) |
Sep
(526) |
Oct
(478) |
Nov
(359) |
Dec
(372) |
| 2013 |
Jan
(467) |
Feb
(226) |
Mar
(391) |
Apr
(281) |
May
(299) |
Jun
(252) |
Jul
(311) |
Aug
(352) |
Sep
(481) |
Oct
(571) |
Nov
(222) |
Dec
(231) |
| 2014 |
Jan
(185) |
Feb
(329) |
Mar
(245) |
Apr
(238) |
May
(281) |
Jun
(399) |
Jul
(382) |
Aug
(500) |
Sep
(579) |
Oct
(435) |
Nov
(487) |
Dec
(256) |
| 2015 |
Jan
(338) |
Feb
(357) |
Mar
(330) |
Apr
(294) |
May
(191) |
Jun
(108) |
Jul
(142) |
Aug
(261) |
Sep
(190) |
Oct
(54) |
Nov
(83) |
Dec
(22) |
| 2016 |
Jan
(49) |
Feb
(89) |
Mar
(33) |
Apr
(50) |
May
(27) |
Jun
(34) |
Jul
(53) |
Aug
(53) |
Sep
(98) |
Oct
(206) |
Nov
(93) |
Dec
(53) |
| 2017 |
Jan
(65) |
Feb
(82) |
Mar
(102) |
Apr
(86) |
May
(187) |
Jun
(67) |
Jul
(23) |
Aug
(93) |
Sep
(65) |
Oct
(45) |
Nov
(35) |
Dec
(17) |
| 2018 |
Jan
(26) |
Feb
(35) |
Mar
(38) |
Apr
(32) |
May
(8) |
Jun
(43) |
Jul
(27) |
Aug
(30) |
Sep
(43) |
Oct
(42) |
Nov
(38) |
Dec
(67) |
| 2019 |
Jan
(32) |
Feb
(37) |
Mar
(53) |
Apr
(64) |
May
(49) |
Jun
(18) |
Jul
(14) |
Aug
(53) |
Sep
(25) |
Oct
(30) |
Nov
(49) |
Dec
(31) |
| 2020 |
Jan
(87) |
Feb
(45) |
Mar
(37) |
Apr
(51) |
May
(99) |
Jun
(36) |
Jul
(11) |
Aug
(14) |
Sep
(20) |
Oct
(24) |
Nov
(40) |
Dec
(23) |
| 2021 |
Jan
(14) |
Feb
(53) |
Mar
(85) |
Apr
(15) |
May
(19) |
Jun
(3) |
Jul
(14) |
Aug
(1) |
Sep
(57) |
Oct
(73) |
Nov
(56) |
Dec
(22) |
| 2022 |
Jan
(3) |
Feb
(22) |
Mar
(6) |
Apr
(55) |
May
(46) |
Jun
(39) |
Jul
(15) |
Aug
(9) |
Sep
(11) |
Oct
(34) |
Nov
(20) |
Dec
(36) |
| 2023 |
Jan
(79) |
Feb
(41) |
Mar
(99) |
Apr
(169) |
May
(48) |
Jun
(16) |
Jul
(16) |
Aug
(57) |
Sep
(19) |
Oct
|
Nov
|
Dec
|
| S | M | T | W | T | F | S |
|---|---|---|---|---|---|---|
|
|
|
|
|
|
|
1
|
|
2
|
3
(4) |
4
(5) |
5
(5) |
6
(3) |
7
|
8
|
|
9
|
10
(8) |
11
(13) |
12
(12) |
13
(1) |
14
(1) |
15
(5) |
|
16
|
17
(12) |
18
(7) |
19
(5) |
20
|
21
(11) |
22
(8) |
|
23
(8) |
24
(6) |
25
|
26
(2) |
27
(3) |
28
(9) |
29
|
|
30
|
31
(5) |
|
|
|
|
|
|
From: <sv...@va...> - 2011-01-21 21:00:00
|
Author: sewardj
Date: 2011-01-21 20:59:52 +0000 (Fri, 21 Jan 2011)
New Revision: 11507
Log:
Add a test for LOOPNEL. See #256669.
(Jakub Jelinek <ja...@re...>)
Added:
trunk/none/tests/amd64/loopnel.c
trunk/none/tests/amd64/loopnel.stderr.exp
trunk/none/tests/amd64/loopnel.stdout.exp
trunk/none/tests/amd64/loopnel.vgtest
Modified:
trunk/none/tests/amd64/Makefile.am
Modified: trunk/none/tests/amd64/Makefile.am
===================================================================
--- trunk/none/tests/amd64/Makefile.am 2011-01-21 18:14:32 UTC (rev 11506)
+++ trunk/none/tests/amd64/Makefile.am 2011-01-21 20:59:52 UTC (rev 11507)
@@ -43,6 +43,7 @@
insn_ssse3.stdout.exp insn_ssse3.stderr.exp insn_ssse3.vgtest \
jrcxz.stderr.exp jrcxz.stdout.exp jrcxz.vgtest \
looper.stderr.exp looper.stdout.exp looper.vgtest \
+ loopnel.stderr.exp loopnel.stdout.exp loopnel.vgtest \
lzcnt64.stderr.exp lzcnt64.stdout.exp lzcnt64.vgtest \
nibz_bennee_mmap.stderr.exp nibz_bennee_mmap.stdout.exp \
nibz_bennee_mmap.vgtest \
@@ -94,6 +95,7 @@
fcmovnu \
fxtract \
looper \
+ loopnel \
jrcxz \
shrld \
slahf-amd64
Added: trunk/none/tests/amd64/loopnel.c
===================================================================
--- trunk/none/tests/amd64/loopnel.c (rev 0)
+++ trunk/none/tests/amd64/loopnel.c 2011-01-21 20:59:52 UTC (rev 11507)
@@ -0,0 +1,11 @@
+#include <stdio.h>
+
+int
+main (void)
+{
+ long rcx = 0x200000005UL;
+ long rax = 5UL;
+ asm volatile ("1: addq $1, %0; loopnel 1b" : "+a" (rax), "+c" (rcx) : : "cc");
+ printf ("%ld %ld\n", rax, rcx);
+ return 0;
+}
Added: trunk/none/tests/amd64/loopnel.stderr.exp
===================================================================
--- trunk/none/tests/amd64/loopnel.stderr.exp (rev 0)
+++ trunk/none/tests/amd64/loopnel.stderr.exp 2011-01-21 20:59:52 UTC (rev 11507)
@@ -0,0 +1,2 @@
+
+
Added: trunk/none/tests/amd64/loopnel.stdout.exp
===================================================================
--- trunk/none/tests/amd64/loopnel.stdout.exp (rev 0)
+++ trunk/none/tests/amd64/loopnel.stdout.exp 2011-01-21 20:59:52 UTC (rev 11507)
@@ -0,0 +1 @@
+10 0
Added: trunk/none/tests/amd64/loopnel.vgtest
===================================================================
--- trunk/none/tests/amd64/loopnel.vgtest (rev 0)
+++ trunk/none/tests/amd64/loopnel.vgtest 2011-01-21 20:59:52 UTC (rev 11507)
@@ -0,0 +1 @@
+prog: loopnel
|
|
From: <sv...@va...> - 2011-01-21 20:56:25
|
Author: sewardj
Date: 2011-01-21 20:56:16 +0000 (Fri, 21 Jan 2011)
New Revision: 2085
Log:
Implement LOOPNEL (32-bit version of LOOPNE). Fixes #256669.
(Jakub Jelinek <ja...@re...>)
Modified:
trunk/priv/guest_amd64_toIR.c
Modified: trunk/priv/guest_amd64_toIR.c
===================================================================
--- trunk/priv/guest_amd64_toIR.c 2011-01-21 18:05:19 UTC (rev 2084)
+++ trunk/priv/guest_amd64_toIR.c 2011-01-21 20:56:16 UTC (rev 2085)
@@ -16401,18 +16401,33 @@
case 0xE1: /* LOOPE disp8: decrement count, jump if count != 0 && ZF==1 */
case 0xE2: /* LOOP disp8: decrement count, jump if count != 0 */
{ /* The docs say this uses rCX as a count depending on the
- address size override, not the operand one. Since we don't
- handle address size overrides, I guess that means RCX. */
+ address size override, not the operand one. */
IRExpr* zbit = NULL;
IRExpr* count = NULL;
IRExpr* cond = NULL;
HChar* xtra = NULL;
- if (have66orF2orF3(pfx) || haveASO(pfx)) goto decode_failure;
+ if (have66orF2orF3(pfx) || 1==getRexW(pfx)) goto decode_failure;
+ /* So at this point we've rejected any variants which appear to
+ be governed by the usual operand-size modifiers. Hence only
+ the address size prefix can have an effect. It changes the
+ size from 64 (default) to 32. */
d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);
delta++;
- putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
+ if (haveASO(pfx)) {
+ /* 64to32 of 64-bit get is merely a get-put improvement
+ trick. */
+ putIReg32(R_RCX, binop(Iop_Sub32,
+ unop(Iop_64to32, getIReg64(R_RCX)),
+ mkU32(1)));
+ } else {
+ putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
+ }
+ /* This is correct, both for 32- and 64-bit versions. If we're
+ doing a 32-bit dec and the result is zero then the default
+ zero extension rule will cause the upper 32 bits to be zero
+ too. Hence a 64-bit check against zero is OK. */
count = getIReg64(R_RCX);
cond = binop(Iop_CmpNE64, count, mkU64(0));
switch (opc) {
@@ -16422,19 +16437,19 @@
case 0xE1:
xtra = "e";
zbit = mk_amd64g_calculate_condition( AMD64CondZ );
- cond = mkAnd1(cond, zbit);
+ cond = mkAnd1(cond, zbit);
break;
case 0xE0:
xtra = "ne";
zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
- cond = mkAnd1(cond, zbit);
+ cond = mkAnd1(cond, zbit);
break;
default:
vassert(0);
}
stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64)) );
- DIP("loop%s 0x%llx\n", xtra, d64);
+ DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", d64);
break;
}
|
|
From: <sv...@va...> - 2011-01-21 18:14:40
|
Author: sewardj
Date: 2011-01-21 18:14:32 +0000 (Fri, 21 Jan 2011)
New Revision: 11506
Log:
Expand this test so as to cover FXSAVE and FXRSTOR, both REX.W and
non-REX.W variants.
Modified:
trunk/memcheck/tests/amd64/fxsave-amd64.c
trunk/memcheck/tests/amd64/fxsave-amd64.stdout.exp
trunk/memcheck/tests/amd64/fxsave-amd64.vgtest
Modified: trunk/memcheck/tests/amd64/fxsave-amd64.c
===================================================================
--- trunk/memcheck/tests/amd64/fxsave-amd64.c 2011-01-21 18:13:02 UTC (rev 11505)
+++ trunk/memcheck/tests/amd64/fxsave-amd64.c 2011-01-21 18:14:32 UTC (rev 11506)
@@ -1,6 +1,9 @@
#include <stdio.h>
#include <stdlib.h>
+#include "tests/asm.h"
+#include "tests/malloc.h"
+#include <string.h>
const unsigned int vec0[4]
= { 0x12345678, 0x11223344, 0x55667788, 0x87654321 };
@@ -8,8 +11,64 @@
const unsigned int vec1[4]
= { 0xABCDEF01, 0xAABBCCDD, 0xEEFF0011, 0x10FEDCBA };
+const unsigned int vecZ[4]
+ = { 0, 0, 0, 0 };
+
+__attribute__((noinline))
+void do_fxsave ( void* p, int rexw ) {
+ if (rexw) {
+ asm __volatile__("rex64/fxsave (%0)" : : "r" (p) : "memory" );
+ } else {
+ asm __volatile__("fxsave (%0)" : : "r" (p) : "memory" );
+ }
+}
+
+__attribute__((noinline))
+void do_fxrstor ( void* p, int rexw ) {
+ if (rexw) {
+ asm __volatile__("rex64/fxrstor (%0)" : : "r" (p) : "memory" );
+ } else {
+ asm __volatile__("fxrstor (%0)" : : "r" (p) : "memory" );
+ }
+}
+
+void do_zeroise ( void )
+{
+ asm __volatile__("finit");
+ asm __volatile__(
+ "fldz\n\t"
+ "fldz\n\t"
+ "fldz\n\t"
+ "fldz\n\t"
+ "fldz\n\t"
+ "fldz\n\t"
+ "fldz\n\t"
+ "fldz\n\t"
+ "finit\n");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm0");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm1");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm2");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm3");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm4");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm5");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm6");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm7");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm8");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm9");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm10");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm11");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm12");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm13");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm14");
+ asm __volatile__("movups " VG_SYM(vecZ) ", %xmm15");
+ asm __volatile__(
+ "pushq $0\n\t"
+ "ldmxcsr 0(%rsp)\n\t"
+ "addq $8,%rsp\n");
+}
+
/* set up the FP and SSE state, and then dump it. */
-void do_fxsave ( void* p )
+void do_setup_then_fxsave ( void* p, int rexw )
{
asm __volatile__("finit");
asm __volatile__("fldpi");
@@ -35,7 +94,7 @@
asm __volatile__("movaps %xmm2, %xmm13");
asm __volatile__("movaps %xmm0, %xmm14");
asm __volatile__("movaps %xmm1, %xmm15");
- asm __volatile__("fxsave (%0)" : : "r" (p) : "memory" );
+ do_fxsave(p, rexw);
}
int isFPLsbs ( int i )
@@ -52,28 +111,81 @@
return 0;
}
-int main ( int argc, char** argv )
+void show ( unsigned char* buf, int xx )
{
- int i, j;
- unsigned char* buf = malloc(512);
- int xx = 1; /* argc > 1;
- printf("Re-run with any arg to suppress least-significant\n"
- " 16 bits of FP numbers\n");
- */
- for (i = 0; i < 512; i++)
- buf[i] = 0x55;
-
- do_fxsave(buf);
- for (j = 0; j < 512; j++) {
- i = (j & 0xFFF0) + (15 - (j & 0xF));
- if ((j % 16) == 0)
- printf("%3d ", j);
+ int i;
+ for (i = 0; i < 512; i++) {
+ if ((i % 16) == 0)
+ printf("%3d ", i);
if (xx && isFPLsbs(i))
printf("xx ");
else
printf("%02x ", buf[i]);
- if (j > 0 && ((j % 16) == 15))
+ if (i > 0 && ((i % 16) == 15))
printf("\n");
}
+}
+
+
+int main ( int argc, char** argv )
+{
+ unsigned char* buf1 = memalign16(512);
+ unsigned char* buf2 = memalign16(512);
+ unsigned char* buf3 = memalign16(512);
+ int xx = argc > 1;
+ printf("Re-run with any arg to suppress least-significant\n"
+ " 16 bits of FP numbers\n");
+
+ printf("\n-------- FXSAVE non-64 (REX.W == 0) --------\n");
+
+ memset(buf1, 0x55, 512);
+ memset(buf2, 0x55, 512);
+ memset(buf3, 0x55, 512);
+
+ /* Load up x87/xmm state and dump it. */
+ do_setup_then_fxsave(buf1, 0);
+ printf("\nBEFORE\n");
+ show(buf1, xx);
+
+ /* Zeroise x87/xmm state and dump it, to show that the
+ regs have been cleared out. */
+ do_zeroise();
+ do_fxsave(buf2, 0);
+ printf("\nZEROED\n");
+ show(buf2, xx);
+
+ /* Reload x87/xmm state from buf1 and dump it in buf3. */
+ do_fxrstor(buf1, 0);
+ do_fxsave(buf3, 0);
+ printf("\nRESTORED\n");
+ show(buf3, xx);
+
+ printf("\n-------- FXSAVE 64 (REX.W == 1) --------\n\n");
+
+ memset(buf1, 0x55, 512);
+ memset(buf2, 0x55, 512);
+ memset(buf3, 0x55, 512);
+
+ /* Load up x87/xmm state and dump it. */
+ do_setup_then_fxsave(buf1, 1);
+ printf("\nBEFORE\n");
+ show(buf1, xx);
+
+ /* Zeroise x87/xmm state and dump it, to show that the
+ regs have been cleared out. */
+ do_zeroise();
+ do_fxsave(buf2, 1);
+ printf("\nZEROED\n");
+ show(buf2, xx);
+
+ /* Reload x87/xmm state from buf1 and dump it in buf3. */
+ do_fxrstor(buf1, 1);
+ do_fxsave(buf3, 1);
+ printf("\nRESTORED\n");
+ show(buf3, xx);
+
+
+ free(buf1); free(buf2); free(buf3);
+
return 0;
}
Modified: trunk/memcheck/tests/amd64/fxsave-amd64.stdout.exp
===================================================================
--- trunk/memcheck/tests/amd64/fxsave-amd64.stdout.exp 2011-01-21 18:13:02 UTC (rev 11505)
+++ trunk/memcheck/tests/amd64/fxsave-amd64.stdout.exp 2011-01-21 18:14:32 UTC (rev 11506)
@@ -1,32 +1,211 @@
- 0 00 00 00 00 00 00 00 00 00 00 00 fe 08 00 03 7f
- 16 00 00 ff ff 00 00 1f 80 00 00 00 00 00 00 00 00
- 32 00 00 00 00 00 00 3f ff 80 00 00 00 00 00 xx xx
- 48 00 00 00 00 00 00 3f ff 80 00 00 00 00 00 xx xx
- 64 00 00 00 00 00 00 40 00 c9 0f da a2 21 68 xx xx
- 80 00 00 00 00 00 00 3f fd 9a 20 9a 84 fb cf xx xx
- 96 00 00 00 00 00 00 3f fe b1 72 17 f7 d1 cf xx xx
-112 00 00 00 00 00 00 3f ff 80 00 00 00 00 00 xx xx
-128 00 00 00 00 00 00 40 00 c9 0f da a2 21 68 xx xx
-144 00 00 00 00 00 00 00 00 00 00 00 00 00 00 xx xx
-160 87 65 43 21 55 66 77 88 11 22 33 44 12 34 56 78
-176 10 fe dc ba ee ff 00 11 aa bb cc dd ab cd ef 01
+Re-run with any arg to suppress least-significant
+ 16 bits of FP numbers
+
+-------- FXSAVE non-64 (REX.W == 0) --------
+
+BEFORE
+ 0 7f 03 00 08 fe 00 00 00 00 00 00 00 00 00 00 00
+ 16 00 00 00 00 00 00 00 00 80 1f 00 00 ff ff 00 00
+ 32 xx xx 00 00 00 00 00 80 ff 3f 00 00 00 00 00 00
+ 48 xx xx 00 00 00 00 00 80 ff 3f 00 00 00 00 00 00
+ 64 xx xx 68 21 a2 da 0f c9 00 40 00 00 00 00 00 00
+ 80 xx xx cf fb 84 9a 20 9a fd 3f 00 00 00 00 00 00
+ 96 xx xx cf d1 f7 17 72 b1 fe 3f 00 00 00 00 00 00
+112 xx xx 00 00 00 00 00 80 ff 3f 00 00 00 00 00 00
+128 xx xx 68 21 a2 da 0f c9 00 40 00 00 00 00 00 00
+144 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+160 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+176 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
192 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
-208 87 65 43 21 55 66 77 88 11 22 33 44 12 34 56 78
-224 10 fe dc ba ee ff 00 11 aa bb cc dd ab cd ef 01
+208 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+224 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
240 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
-256 87 65 43 21 55 66 77 88 11 22 33 44 12 34 56 78
-272 10 fe dc ba ee ff 00 11 aa bb cc dd ab cd ef 01
-288 10 fe dc ba ee ff 00 11 aa bb cc dd ab cd ef 01
+256 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+272 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+288 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
304 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
-320 87 65 43 21 55 66 77 88 11 22 33 44 12 34 56 78
-336 10 fe dc ba ee ff 00 11 aa bb cc dd ab cd ef 01
-352 10 fe dc ba ee ff 00 11 aa bb cc dd ab cd ef 01
+320 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+336 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+352 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
368 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
-384 87 65 43 21 55 66 77 88 11 22 33 44 12 34 56 78
-400 10 fe dc ba ee ff 00 11 aa bb cc dd ab cd ef 01
+384 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+400 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
416 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
432 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
448 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
464 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
480 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
496 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+
+ZEROED
+ 0 7f 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+ 16 00 00 00 00 00 00 00 00 80 1f 00 00 ff ff 00 00
+ 32 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+ 48 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+ 64 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+ 80 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+ 96 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+112 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+128 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+144 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+160 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+176 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+192 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+208 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+224 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+240 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+256 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+272 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+288 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+304 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+320 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+336 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+352 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+368 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+384 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+400 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+416 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+432 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+448 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+464 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+480 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+496 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+
+RESTORED
+ 0 7f 03 00 08 fe 00 00 00 00 00 00 00 00 00 00 00
+ 16 00 00 00 00 00 00 00 00 80 1f 00 00 ff ff 00 00
+ 32 xx xx 00 00 00 00 00 80 ff 3f 00 00 00 00 00 00
+ 48 xx xx 00 00 00 00 00 80 ff 3f 00 00 00 00 00 00
+ 64 xx xx 68 21 a2 da 0f c9 00 40 00 00 00 00 00 00
+ 80 xx xx cf fb 84 9a 20 9a fd 3f 00 00 00 00 00 00
+ 96 xx xx cf d1 f7 17 72 b1 fe 3f 00 00 00 00 00 00
+112 xx xx 00 00 00 00 00 80 ff 3f 00 00 00 00 00 00
+128 xx xx 68 21 a2 da 0f c9 00 40 00 00 00 00 00 00
+144 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+160 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+176 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+192 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+208 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+224 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+240 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+256 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+272 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+288 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+304 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+320 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+336 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+352 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+368 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+384 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+400 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+416 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+432 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+448 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+464 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+480 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+496 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+
+-------- FXSAVE 64 (REX.W == 1) --------
+
+
+BEFORE
+ 0 7f 03 00 08 fe 00 00 00 00 00 00 00 00 00 00 00
+ 16 00 00 00 00 00 00 00 00 80 1f 00 00 ff ff 00 00
+ 32 xx xx 00 00 00 00 00 80 ff 3f 00 00 00 00 00 00
+ 48 xx xx 00 00 00 00 00 80 ff 3f 00 00 00 00 00 00
+ 64 xx xx 68 21 a2 da 0f c9 00 40 00 00 00 00 00 00
+ 80 xx xx cf fb 84 9a 20 9a fd 3f 00 00 00 00 00 00
+ 96 xx xx cf d1 f7 17 72 b1 fe 3f 00 00 00 00 00 00
+112 xx xx 00 00 00 00 00 80 ff 3f 00 00 00 00 00 00
+128 xx xx 68 21 a2 da 0f c9 00 40 00 00 00 00 00 00
+144 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+160 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+176 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+192 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+208 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+224 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+240 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+256 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+272 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+288 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+304 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+320 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+336 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+352 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+368 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+384 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+400 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+416 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+432 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+448 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+464 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+480 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+496 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+
+ZEROED
+ 0 7f 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+ 16 00 00 00 00 00 00 00 00 80 1f 00 00 ff ff 00 00
+ 32 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+ 48 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+ 64 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+ 80 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+ 96 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+112 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+128 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+144 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+160 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+176 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+192 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+208 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+224 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+240 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+256 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+272 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+288 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+304 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+320 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+336 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+352 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+368 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+384 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+400 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+416 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+432 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+448 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+464 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+480 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+496 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+
+RESTORED
+ 0 7f 03 00 08 fe 00 00 00 00 00 00 00 00 00 00 00
+ 16 00 00 00 00 00 00 00 00 80 1f 00 00 ff ff 00 00
+ 32 xx xx 00 00 00 00 00 80 ff 3f 00 00 00 00 00 00
+ 48 xx xx 00 00 00 00 00 80 ff 3f 00 00 00 00 00 00
+ 64 xx xx 68 21 a2 da 0f c9 00 40 00 00 00 00 00 00
+ 80 xx xx cf fb 84 9a 20 9a fd 3f 00 00 00 00 00 00
+ 96 xx xx cf d1 f7 17 72 b1 fe 3f 00 00 00 00 00 00
+112 xx xx 00 00 00 00 00 80 ff 3f 00 00 00 00 00 00
+128 xx xx 68 21 a2 da 0f c9 00 40 00 00 00 00 00 00
+144 xx xx 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+160 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+176 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+192 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+208 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+224 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+240 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+256 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+272 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+288 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+304 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+320 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+336 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+352 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+368 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+384 78 56 34 12 44 33 22 11 88 77 66 55 21 43 65 87
+400 01 ef cd ab dd cc bb aa 11 00 ff ee ba dc fe 10
+416 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+432 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+448 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+464 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+480 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
+496 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55
Modified: trunk/memcheck/tests/amd64/fxsave-amd64.vgtest
===================================================================
--- trunk/memcheck/tests/amd64/fxsave-amd64.vgtest 2011-01-21 18:13:02 UTC (rev 11505)
+++ trunk/memcheck/tests/amd64/fxsave-amd64.vgtest 2011-01-21 18:14:32 UTC (rev 11506)
@@ -1,2 +1,3 @@
prog: fxsave-amd64
vgopts: -q
+args: x
|
|
From: <sv...@va...> - 2011-01-21 18:13:10
|
Author: sewardj
Date: 2011-01-21 18:13:02 +0000 (Fri, 21 Jan 2011)
New Revision: 11505
Log:
Add tests for SSE4.2 CRC32{B,W,L,Q} insns.
Added:
trunk/none/tests/amd64/crc32.c
trunk/none/tests/amd64/crc32.stderr.exp
trunk/none/tests/amd64/crc32.stdout.exp
trunk/none/tests/amd64/crc32.vgtest
Modified:
trunk/none/tests/amd64/Makefile.am
Modified: trunk/none/tests/amd64/Makefile.am
===================================================================
--- trunk/none/tests/amd64/Makefile.am 2011-01-18 05:16:21 UTC (rev 11504)
+++ trunk/none/tests/amd64/Makefile.am 2011-01-21 18:13:02 UTC (rev 11505)
@@ -31,6 +31,7 @@
bug156404-amd64.vgtest bug156404-amd64.stdout.exp \
bug156404-amd64.stderr.exp \
clc.vgtest clc.stdout.exp clc.stderr.exp \
+ crc32.vgtest crc32.stdout.exp crc32.stderr.exp \
cmpxchg.vgtest cmpxchg.stdout.exp cmpxchg.stderr.exp \
faultstatus.disabled faultstatus.stderr.exp \
fcmovnu.vgtest fcmovnu.stderr.exp fcmovnu.stdout.exp \
@@ -81,7 +82,7 @@
check_PROGRAMS += lzcnt64
endif
if BUILD_SSE42_TESTS
- check_PROGRAMS += pcmpstr64 pcmpxstrx64 sse4-64
+ check_PROGRAMS += pcmpstr64 pcmpxstrx64 sse4-64 crc32
endif
# DDD: these need to be made to work on Darwin like the x86/ ones were.
Added: trunk/none/tests/amd64/crc32.c
===================================================================
--- trunk/none/tests/amd64/crc32.c (rev 0)
+++ trunk/none/tests/amd64/crc32.c 2011-01-21 18:13:02 UTC (rev 11505)
@@ -0,0 +1,213 @@
+
+#include <stdlib.h>
+#include <stdio.h>
+
+typedef unsigned int UInt;
+typedef unsigned long long int ULong;
+typedef unsigned char UChar;
+typedef unsigned short int UShort;
+
+
+/////////////////////////////////////////////////////////////////
+
+UInt do_s_crc32b ( UInt crcIn, UChar b )
+{
+ UInt i, crc = (b & 0xFF) ^ crcIn;
+ for (i = 0; i < 8; i++)
+ crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78 : 0);
+ return crc;
+}
+
+UInt do_s_crc32w ( UInt crcIn, UShort w )
+{
+ UInt i, crc = (w & 0xFFFF) ^ crcIn;
+ for (i = 0; i < 16; i++)
+ crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78 : 0);
+ return crc;
+}
+
+UInt do_s_crc32l ( UInt crcIn, UInt l )
+{
+ UInt i, crc = l ^ crcIn;
+ for (i = 0; i < 32; i++)
+ crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78 : 0);
+ return crc;
+}
+
+UInt do_s_crc32q ( UInt crcIn, ULong q )
+{
+ UInt crc = do_s_crc32l(crcIn, (UInt)q);
+ return do_s_crc32l(crc, (UInt)(q >> 32));
+}
+
+UInt do_h_crc32b ( UInt crcIn, UChar b )
+{
+ __asm__ __volatile__(
+ "crc32b %%cl,%%esi\n\t"
+ : "=S"(crcIn) : "0"(crcIn), "c"(b)
+ );
+ return crcIn;
+}
+
+UInt do_h_crc32w ( UInt crcIn, UShort w )
+{
+ __asm__ __volatile__(
+ "crc32w %%cx,%%esi\n\t"
+ : "=S"(crcIn) : "0"(crcIn), "c"(w)
+ );
+ return crcIn;
+}
+
+UInt do_h_crc32l ( UInt crcIn, UInt l )
+{
+ __asm__ __volatile__(
+ "crc32l %%ecx,%%esi\n\t"
+ : "=S"(crcIn) : "0"(crcIn), "c"(l)
+ );
+ return crcIn;
+}
+
+UInt do_h_crc32q ( UInt crcIn, ULong q )
+{
+ __asm__ __volatile__(
+ "crc32q %%rcx,%%rsi\n\t"
+ : "=S"(crcIn) : "0"(crcIn), "c"(q)
+ );
+ return crcIn;
+}
+
+////////////////
+
+UInt do_h_crc32b_mem ( UInt crcIn, UChar* a )
+{
+ __asm__ __volatile__(
+ "crc32b (%2),%%esi\n\t"
+ : "=S"(crcIn) : "0"(crcIn), "r"(a)
+ );
+ return crcIn;
+}
+
+UInt do_h_crc32w_mem ( UInt crcIn, UShort* a )
+{
+ __asm__ __volatile__(
+ "crc32w (%2),%%esi\n\t"
+ : "=S"(crcIn) : "0"(crcIn), "r"(a)
+ );
+ return crcIn;
+}
+
+UInt do_h_crc32l_mem ( UInt crcIn, UInt* a )
+{
+ __asm__ __volatile__(
+ "crc32l (%2),%%esi\n\t"
+ : "=S"(crcIn) : "0"(crcIn), "r"(a)
+ );
+ return crcIn;
+}
+
+UInt do_h_crc32q_mem ( UInt crcIn, ULong* a )
+{
+ __asm__ __volatile__(
+ "crc32q (%2),%%rsi\n\t"
+ : "=S"(crcIn) : "0"(crcIn), "r"(a)
+ );
+ return crcIn;
+}
+
+void try_simple ( void )
+{
+ UInt c0 = 0xFFFFFFFF;
+ UChar c = 0x42;
+
+ UInt cs = do_s_crc32b(c0, c);
+ UInt ch = do_h_crc32b(c0, c);
+ printf("b %08x %08x\n", cs, ch);
+
+ UShort w = 0xed78;;
+ cs = do_s_crc32w(c0, w);
+ ch = do_h_crc32w(c0, w);
+ printf("w %08x %08x\n", cs, ch);
+
+ UInt i = 0xCAFEBABE;
+ cs = do_s_crc32l(c0, i);
+ ch = do_h_crc32l(c0, i);
+ printf("l %08x %08x\n", cs, ch);
+
+ ULong q = 0x0ddC0ffeeBadF00d;
+ cs = do_s_crc32q(c0, q);
+ ch = do_h_crc32q(c0, q);
+ printf("q %08x %08x\n", cs, ch);
+}
+
+#define NMEM 1000
+void try_mem ( void )
+{
+ UInt al, i;
+ UChar* b = malloc(NMEM);
+ for (i = 0; i < NMEM; i++)
+ b[i] = (UChar)(i % 177);
+
+ for (al = 0; al < 1; al++) {
+ UInt crc = 0xFFFFFFFF;
+ for (i = 0; i <= 1000-1-al; i += 1)
+ crc = do_h_crc32b_mem( crc, &b[i+al] );
+ printf("mem b misalign %d = %08x\n", al, crc);
+ }
+
+ for (al = 0; al < 2; al++) {
+ UInt crc = 0xFFFFFFFF;
+ for (i = 0; i <= 1000-2-al; i += 2)
+ crc = do_h_crc32w_mem( crc, (UShort*)&b[i+al] );
+ printf("mem w misalign %d = %08x\n", al, crc);
+ }
+
+ for (al = 0; al < 4; al++) {
+ UInt crc = 0xFFFFFFFF;
+ for (i = 0; i <= 1000-4-al; i += 4)
+ crc = do_h_crc32l_mem( crc, (UInt*)&b[i+al] );
+ printf("mem l misalign %d = %08x\n", al, crc);
+ }
+
+ for (al = 0; al < 8; al++) {
+ UInt crc = 0xFFFFFFFF;
+ for (i = 0; i <= 1000-8-al; i += 8)
+ crc = do_h_crc32q_mem( crc, (ULong*)&b[i+al] );
+ printf("mem q misalign %d = %08x\n", al, crc);
+ }
+
+ free(b);
+}
+
+void try_misc ( void )
+{
+ ULong res = 0xAAAAAAAAAAAAAAAAULL;
+ __asm__ __volatile__(
+ "movabsq $0x5555555555555555, %%rax" "\n\t"
+ "movabsq $042, %%rbx" "\n\t"
+ "crc32b %%bl,%%rax" "\n\t"
+ "movq %%rax, %0" "\n"
+ : "=r"(res) : : "rax","rbx"
+ );
+ printf("try_misc 64bit-dst 0x%016llx\n", res);
+
+ __asm__ __volatile__(
+ "movabsq $0x5555555555555555, %%rax" "\n\t"
+ "movabsq $042, %%rbx" "\n\t"
+ "crc32b %%bl,%%eax" "\n\t"
+ "movq %%rax, %0" "\n"
+ : "=r"(res) : : "rax","rbx"
+ );
+ printf("try_misc 32bit-dst 0x%016llx\n", res);
+}
+
+/////////////////////////////////////////////////////////////////
+
+
+
+int main ( int argc, char** argv )
+{
+ try_simple();
+ try_mem();
+ try_misc();
+ return 0;
+}
Added: trunk/none/tests/amd64/crc32.stderr.exp
===================================================================
Added: trunk/none/tests/amd64/crc32.stdout.exp
===================================================================
--- trunk/none/tests/amd64/crc32.stdout.exp (rev 0)
+++ trunk/none/tests/amd64/crc32.stdout.exp 2011-01-21 18:13:02 UTC (rev 11505)
@@ -0,0 +1,21 @@
+b 0dc2c1e5 0dc2c1e5
+w 70cb7bdb 70cb7bdb
+l 9ca98638 9ca98638
+q f264a907 f264a907
+mem b misalign 0 = f502c278
+mem w misalign 0 = f502c278
+mem w misalign 1 = 0a72a365
+mem l misalign 0 = f502c278
+mem l misalign 1 = 246088f7
+mem l misalign 2 = bcf12db3
+mem l misalign 3 = 00d2a6af
+mem q misalign 0 = f502c278
+mem q misalign 1 = 5be5d059
+mem q misalign 2 = ebc9f7d0
+mem q misalign 3 = c185a801
+mem q misalign 4 = 11ada892
+mem q misalign 5 = c5a2f160
+mem q misalign 6 = 7b84c760
+mem q misalign 7 = ab827214
+try_misc 64bit-dst 0x00000000a50765b3
+try_misc 32bit-dst 0x00000000a50765b3
Added: trunk/none/tests/amd64/crc32.vgtest
===================================================================
--- trunk/none/tests/amd64/crc32.vgtest (rev 0)
+++ trunk/none/tests/amd64/crc32.vgtest 2011-01-21 18:13:02 UTC (rev 11505)
@@ -0,0 +1,3 @@
+prog: crc32
+prereq: ../../../tests/x86_amd64_features amd64-sse42
+vgopts: -q
|
|
From: <sv...@va...> - 2011-01-21 18:05:28
|
Author: sewardj
Date: 2011-01-21 18:05:19 +0000 (Fri, 21 Jan 2011)
New Revision: 2084
Log:
Implement rex.W/FXSAVE and also both variants of FXRSTOR.
Ick. I knew there was a reason I'd been putting this off.
Fixes #194402.
Modified:
trunk/priv/guest_amd64_defs.h
trunk/priv/guest_amd64_helpers.c
trunk/priv/guest_amd64_toIR.c
Modified: trunk/priv/guest_amd64_defs.h
===================================================================
--- trunk/priv/guest_amd64_defs.h 2011-01-21 18:02:54 UTC (rev 2083)
+++ trunk/priv/guest_amd64_defs.h 2011-01-21 18:05:19 UTC (rev 2084)
@@ -154,7 +154,8 @@
extern void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* );
-extern void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State*, HWord );
+extern void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State*, HWord );
+extern VexEmWarn amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State*, HWord );
extern ULong amd64g_dirtyhelper_RDTSC ( void );
Modified: trunk/priv/guest_amd64_helpers.c
===================================================================
--- trunk/priv/guest_amd64_helpers.c 2011-01-21 18:02:54 UTC (rev 2083)
+++ trunk/priv/guest_amd64_helpers.c 2011-01-21 18:05:19 UTC (rev 2084)
@@ -1454,6 +1454,68 @@
}
+/* This is used to implement both 'frstor' and 'fldenv'. The latter
+ appears to differ from the former only in that the 8 FP registers
+ themselves are not transferred into the guest state.
+
+ moveRegs: when False, the guest FP register values are left
+ untouched; the tags, FTOP, FC3210 and FPROUND are
+ written either way.
+ x87_state: the image to load from; it is accessed as an Fpu_State.
+ vex_state: the guest state that gets updated.
+
+ Returns the emulation warning that amd64g_check_fldcw produces
+ for the control word found in the image. */
+static
+VexEmWarn do_put_x87 ( Bool moveRegs,
+ /*IN*/UChar* x87_state,
+ /*OUT*/VexGuestAMD64State* vex_state )
+{
+ Int stno, preg;
+ UInt tag;
+ ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
+ UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
+ Fpu_State* x87 = (Fpu_State*)x87_state;
+ UInt ftop = (x87->env[FP_ENV_STAT] >> 11) & 7; /* bits 13:11 of the status word */
+ UInt tagw = x87->env[FP_ENV_TAG];
+ UInt fpucw = x87->env[FP_ENV_CTRL];
+ UInt c3210 = x87->env[FP_ENV_STAT] & 0x4700; /* keeps bits 14 and 10:8 of the status word */
+ VexEmWarn ew;
+ UInt fpround;
+ ULong pair;
+
+ /* Copy registers and tags */
+ for (stno = 0; stno < 8; stno++) {
+ preg = (stno + ftop) & 7; /* guest slot for stack position stno, wrapping modulo 8 */
+ tag = (tagw >> (2*preg)) & 3; /* the 2-bit tag field belonging to that slot */
+ if (tag == 3) {
+ /* register is empty */
+ /* hmm, if it's empty, does it still get written? Probably
+ safer to say it does. If we don't, memcheck could get out
+ of sync, in that it thinks all FP registers are defined by
+ this helper, but in reality some have not been updated. */
+ if (moveRegs)
+ vexRegs[preg] = 0; /* IEEE754 64-bit zero */
+ vexTags[preg] = 0; /* guest tag 0 is stored for an empty slot */
+ } else {
+ /* register is non-empty */
+ if (moveRegs)
+ convert_f80le_to_f64le( &x87->reg[10*stno], /* image registers are 10 reg[] elements apart */
+ (UChar*)&vexRegs[preg] );
+ vexTags[preg] = 1; /* guest tag 1 is stored for a non-empty slot */
+ }
+ }
+
+ /* stack pointer */
+ vex_state->guest_FTOP = ftop;
+
+ /* status word */
+ vex_state->guest_FC3210 = c3210;
+
+ /* handle the control word, setting FPROUND and detecting any
+ emulation warnings. */
+ pair = amd64g_check_fldcw ( (ULong)fpucw );
+ fpround = (UInt)pair; /* low 32 bits of the returned pair */
+ ew = (VexEmWarn)(pair >> 32); /* high 32 bits of the returned pair */
+
+ vex_state->guest_FPROUND = fpround & 3; /* only the low two bits are kept */
+
+ /* emulation warnings --> caller */
+ return ew;
+}
+
+
/* Create an x87 FPU state from the guest state, as close as
we can approximate it. */
static
@@ -1610,6 +1672,94 @@
}
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (writes guest state, reads guest mem)
+ Loads %xmm0 .. %xmm15, the x87 registers, tags and control state,
+ and SSEROUND into the guest state 'gst' from the memory image at
+ 'addr'. Returns the x87 emulation warning if do_put_x87 reports
+ one, otherwise whatever amd64g_check_ldmxcsr reports. */
+VexEmWarn amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State* gst, HWord addr )
+{
+ Fpu_State tmp;
+ VexEmWarn warnX87 = EmWarn_NONE;
+ VexEmWarn warnXMM = EmWarn_NONE;
+ UShort* addrS = (UShort*)addr; /* the image viewed as 16-bit units */
+ UChar* addrC = (UChar*)addr; /* the image viewed as bytes */
+ U128* xmm = (U128*)(addr + 160); /* the XMM images start 160 bytes into the image */
+ UShort fp_tags;
+ Int r, stno, i;
+
+ /* Restore %xmm0 .. %xmm15. If the host is big-endian, these need
+ to be byte-swapped. */
+ vassert(host_is_little_endian());
+
+# define COPY_U128(_dst,_src) \
+ do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
+ _dst[2] = _src[2]; _dst[3] = _src[3]; } \
+ while (0)
+
+ COPY_U128( gst->guest_XMM0, xmm[0] );
+ COPY_U128( gst->guest_XMM1, xmm[1] );
+ COPY_U128( gst->guest_XMM2, xmm[2] );
+ COPY_U128( gst->guest_XMM3, xmm[3] );
+ COPY_U128( gst->guest_XMM4, xmm[4] );
+ COPY_U128( gst->guest_XMM5, xmm[5] );
+ COPY_U128( gst->guest_XMM6, xmm[6] );
+ COPY_U128( gst->guest_XMM7, xmm[7] );
+ COPY_U128( gst->guest_XMM8, xmm[8] );
+ COPY_U128( gst->guest_XMM9, xmm[9] );
+ COPY_U128( gst->guest_XMM10, xmm[10] );
+ COPY_U128( gst->guest_XMM11, xmm[11] );
+ COPY_U128( gst->guest_XMM12, xmm[12] );
+ COPY_U128( gst->guest_XMM13, xmm[13] );
+ COPY_U128( gst->guest_XMM14, xmm[14] );
+ COPY_U128( gst->guest_XMM15, xmm[15] );
+
+# undef COPY_U128
+
+ /* Copy the x87 registers out of the image, into a temporary
+ Fpu_State struct. */
+ for (i = 0; i < 14; i++) tmp.env[i] = 0;
+ for (i = 0; i < 80; i++) tmp.reg[i] = 0;
+ /* fill in tmp.reg[0..7] */
+ for (stno = 0; stno < 8; stno++) {
+ UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
+ UShort* srcS = (UShort*)(&addrS[16 + 8*stno]); /* byte offset 32 + 16*stno in the image */
+ dstS[0] = srcS[0];
+ dstS[1] = srcS[1];
+ dstS[2] = srcS[2];
+ dstS[3] = srcS[3];
+ dstS[4] = srcS[4]; /* five 16-bit units copied per register */
+ }
+ /* fill in tmp.env[0..13]; entries not assigned below stay zero */
+ tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
+ tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
+
+ fp_tags = 0;
+ for (r = 0; r < 8; r++) { /* image byte 4 carries one tag bit per register */
+ if (addrC[4] & (1<<r))
+ fp_tags |= (0 << (2*r)); /* bit set: tag 0, VALID -- not really precise enough. */
+ else
+ fp_tags |= (3 << (2*r)); /* bit clear: tag 3, EMPTY (the value do_put_x87 treats as empty) */
+ }
+ tmp.env[FP_ENV_TAG] = fp_tags;
+
+ /* Now write 'tmp' into the guest state. */
+ warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
+
+ { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
+ | ((((UInt)addrS[13]) & 0xFFFF) << 16); /* 32-bit value held at byte offset 24 of the image */
+ ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
+
+ warnXMM = (VexEmWarn)(w64 >> 32); /* high half of the result: emulation warning */
+
+ gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL; /* low half of the result */
+ }
+
+ /* Prefer an X87 emwarn over an XMM one, if both exist. */
+ if (warnX87 != EmWarn_NONE)
+ return warnX87;
+ else
+ return warnXMM;
+}
+
+
/* DIRTY HELPER (writes guest state) */
/* Initialise the x87 FPU state as per 'finit'. */
void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
Modified: trunk/priv/guest_amd64_toIR.c
===================================================================
--- trunk/priv/guest_amd64_toIR.c 2011-01-21 18:02:54 UTC (rev 2083)
+++ trunk/priv/guest_amd64_toIR.c 2011-01-21 18:05:19 UTC (rev 2084)
@@ -9193,26 +9193,27 @@
thusly placed in guest-x86/toIR.c. */
/* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory.
- Note that REX.W 0F AE /0 writes a slightly different format and
- we don't handle that here. */
- if (haveNo66noF2noF3(pfx) && sz == 4
+ Note that the presence or absence of REX.W slightly affects the
+ written format: whether the saved FPU IP and DP pointers are 64
+ or 32 bits. But the helper function we call simply writes zero
+ bits in the relevant fields (which are 64 bits regardless of
+ what REX.W is) and so it's good enough (iow, equally broken) in
+ both cases. */
+ if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
&& insn[0] == 0x0F && insn[1] == 0xAE
&& !epartIsReg(insn[2]) && gregOfRexRM(pfx,insn[2]) == 0) {
IRDirty* d;
modrm = getUChar(delta+2);
- vassert(sz == 4);
vassert(!epartIsReg(modrm));
- /* REX.W must not be set. That should be assured us by sz == 4
- above. */
- vassert(!(pfx & PFX_REXW));
addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
delta += 2+alen;
+ gen_SEGV_if_not_16_aligned(addr);
- DIP("fxsave %s\n", dis_buf);
+ DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
/* Uses dirty helper:
- void amd64g_do_FXSAVE ( VexGuestAMD64State*, UInt ) */
+ void amd64g_do_FXSAVE ( VexGuestAMD64State*, ULong ) */
d = unsafeIRDirty_0_N (
0/*regparms*/,
"amd64g_dirtyhelper_FXSAVE",
@@ -9268,6 +9269,82 @@
goto decode_success;
}
+ /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory.
+ As with FXSAVE above we ignore the value of REX.W since we're
+ not bothering with the FPU DP and IP fields. */
+ if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0xAE
+ && !epartIsReg(insn[2]) && gregOfRexRM(pfx,insn[2]) == 1) {
+ IRDirty* d;
+ modrm = getUChar(delta+2);
+ vassert(!epartIsReg(modrm));
+
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+ gen_SEGV_if_not_16_aligned(addr);
+
+ DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
+
+ /* Uses dirty helper:
+ VexEmWarn amd64g_do_FXRSTOR ( VexGuestAMD64State*, ULong )
+ NOTE:
+ the VexEmWarn value is simply ignored
+ */
+ d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "amd64g_dirtyhelper_FXRSTOR",
+ &amd64g_dirtyhelper_FXRSTOR,
+ mkIRExprVec_1( mkexpr(addr) )
+ );
+ d->needsBBP = True;
+
+ /* declare we're reading memory */
+ d->mFx = Ifx_Read;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 512;
+
+ /* declare we're writing guest state */
+ d->nFxState = 7;
+
+ d->fxState[0].fx = Ifx_Write;
+ d->fxState[0].offset = OFFB_FTOP;
+ d->fxState[0].size = sizeof(UInt);
+
+ d->fxState[1].fx = Ifx_Write;
+ d->fxState[1].offset = OFFB_FPREGS;
+ d->fxState[1].size = 8 * sizeof(ULong);
+
+ d->fxState[2].fx = Ifx_Write;
+ d->fxState[2].offset = OFFB_FPTAGS;
+ d->fxState[2].size = 8 * sizeof(UChar);
+
+ d->fxState[3].fx = Ifx_Write;
+ d->fxState[3].offset = OFFB_FPROUND;
+ d->fxState[3].size = sizeof(ULong);
+
+ d->fxState[4].fx = Ifx_Write;
+ d->fxState[4].offset = OFFB_FC3210;
+ d->fxState[4].size = sizeof(ULong);
+
+ d->fxState[5].fx = Ifx_Write;
+ d->fxState[5].offset = OFFB_XMM0;
+ d->fxState[5].size = 16 * sizeof(U128);
+
+ d->fxState[6].fx = Ifx_Write;
+ d->fxState[6].offset = OFFB_SSEROUND;
+ d->fxState[6].size = sizeof(ULong);
+
+ /* Be paranoid ... this assertion tries to ensure the 16 %xmm
+ images are packed back-to-back. If not, the value of
+ d->fxState[5].size is wrong. */
+ vassert(16 == sizeof(U128));
+ vassert(OFFB_XMM15 == (OFFB_XMM0 + 15 * 16));
+
+ stmt( IRStmt_Dirty(d) );
+
+ goto decode_success;
+ }
+
/* ------ SSE decoder main ------ */
/* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
|
|
From: <sv...@va...> - 2011-01-21 18:03:02
|
Author: sewardj
Date: 2011-01-21 18:02:54 +0000 (Fri, 21 Jan 2011)
New Revision: 2083
Log:
Add alignment checking for FXSAVE/FXRSTOR.
Modified:
trunk/priv/guest_x86_toIR.c
Modified: trunk/priv/guest_x86_toIR.c
===================================================================
--- trunk/priv/guest_x86_toIR.c 2011-01-21 17:51:44 UTC (rev 2082)
+++ trunk/priv/guest_x86_toIR.c 2011-01-21 18:02:54 UTC (rev 2083)
@@ -8095,6 +8095,7 @@
addr = disAMode ( &alen, sorb, delta+2, dis_buf );
delta += 2+alen;
+ gen_SEGV_if_not_16_aligned(addr);
DIP("fxsave %s\n", dis_buf);
@@ -8165,11 +8166,15 @@
addr = disAMode ( &alen, sorb, delta+2, dis_buf );
delta += 2+alen;
+ gen_SEGV_if_not_16_aligned(addr);
DIP("fxrstor %s\n", dis_buf);
/* Uses dirty helper:
- void x86g_do_FXRSTOR ( VexGuestX86State*, UInt ) */
+ VexEmWarn x86g_do_FXRSTOR ( VexGuestX86State*, UInt )
+ NOTE:
+ the VexEmWarn value is simply ignored (unlike for FRSTOR)
+ */
d = unsafeIRDirty_0_N (
0/*regparms*/,
"x86g_dirtyhelper_FXRSTOR",
|
|
From: <sv...@va...> - 2011-01-21 17:51:54
|
Author: sewardj
Date: 2011-01-21 17:51:44 +0000 (Fri, 21 Jan 2011)
New Revision: 2082
Log:
Add support for SSE4.2 CRC32{B,W,L,Q}. Fixes #261966.
Modified:
trunk/priv/guest_amd64_defs.h
trunk/priv/guest_amd64_helpers.c
trunk/priv/guest_amd64_toIR.c
Modified: trunk/priv/guest_amd64_defs.h
===================================================================
--- trunk/priv/guest_amd64_defs.h 2011-01-19 12:21:51 UTC (rev 2081)
+++ trunk/priv/guest_amd64_defs.h 2011-01-21 17:51:44 UTC (rev 2082)
@@ -137,6 +137,10 @@
extern ULong amd64g_calculate_mmx_pmovmskb ( ULong );
extern ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo );
+extern ULong amd64g_calc_crc32b ( ULong crcIn, ULong b );
+extern ULong amd64g_calc_crc32w ( ULong crcIn, ULong w );
+extern ULong amd64g_calc_crc32l ( ULong crcIn, ULong l );
+extern ULong amd64g_calc_crc32q ( ULong crcIn, ULong q );
/* --- DIRTY HELPERS --- */
Modified: trunk/priv/guest_amd64_helpers.c
===================================================================
--- trunk/priv/guest_amd64_helpers.c 2011-01-19 12:21:51 UTC (rev 2081)
+++ trunk/priv/guest_amd64_helpers.c 2011-01-21 17:51:44 UTC (rev 2082)
@@ -2563,7 +2563,44 @@
return ((rHi8 & 0xFF) << 8) | (rLo8 & 0xFF);
}
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Folds the low 8 bits of 'b' into the running value 'crcIn': the
+   byte is XORed in, then one shift-right step is done per data bit,
+   XORing in 0x82f63b78 whenever the bit shifted out was 1.  Bits of
+   'b' above bit 7 are ignored. */
+ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
+{
+   ULong acc       = crcIn ^ (b & 0xFFULL);
+   UInt  remaining = 8;
+   while (remaining > 0) {
+      ULong lsb = acc & 1;
+      acc >>= 1;
+      if (lsb)
+         acc ^= 0x82f63b78ULL;
+      remaining--;
+   }
+   return acc;
+}
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Folds the low 16 bits of 'w' into the running value 'crcIn': the
+   halfword is XORed in, then one shift-right step is done per data
+   bit, XORing in 0x82f63b78 whenever the bit shifted out was 1.
+   Bits of 'w' above bit 15 are ignored. */
+ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
+{
+   ULong acc       = crcIn ^ (w & 0xFFFFULL);
+   UInt  remaining = 16;
+   while (remaining > 0) {
+      ULong lsb = acc & 1;
+      acc >>= 1;
+      if (lsb)
+         acc ^= 0x82f63b78ULL;
+      remaining--;
+   }
+   return acc;
+}
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Folds the low 32 bits of 'l' into the running value 'crcIn': the
+   word is XORed in, then one shift-right step is done per data bit,
+   XORing in 0x82f63b78 whenever the bit shifted out was 1.  Bits of
+   'l' above bit 31 are ignored. */
+ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
+{
+   ULong acc       = crcIn ^ (l & 0xFFFFFFFFULL);
+   UInt  remaining = 32;
+   while (remaining > 0) {
+      ULong lsb = acc & 1;
+      acc >>= 1;
+      if (lsb)
+         acc ^= 0x82f63b78ULL;
+      remaining--;
+   }
+   return acc;
+}
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Folds all 64 bits of 'q' into 'crcIn' as two 32-bit steps through
+   amd64g_calc_crc32l: first with 'q' itself, then with 'q' shifted
+   right by 32. */
+ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
+{
+   ULong upperHalf  = q >> 32;
+   ULong afterLower = amd64g_calc_crc32l(crcIn, q);
+   return amd64g_calc_crc32l(afterLower, upperHalf);
+}
+
+
/*---------------------------------------------------------------*/
/*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/
/*---------------------------------------------------------------*/
Modified: trunk/priv/guest_amd64_toIR.c
===================================================================
--- trunk/priv/guest_amd64_toIR.c 2011-01-19 12:21:51 UTC (rev 2081)
+++ trunk/priv/guest_amd64_toIR.c 2011-01-21 17:51:44 UTC (rev 2082)
@@ -750,6 +750,13 @@
toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
}
+/* True exactly when, looking only at the F2 and F3 prefix bits of
+   pfx, F2 is present and F3 is absent. */
+static Bool haveF2noF3 ( Prefix pfx )
+{
+   Bool onlyF2 = toBool(PFX_F2 == (pfx & (PFX_F3|PFX_F2)));
+   return onlyF2;
+}
+
/* Return True iff pfx has 66, F2 and F3 clear */
static Bool haveNo66noF2noF3 ( Prefix pfx )
{
@@ -15850,6 +15857,68 @@
goto decode_success;
}
+ /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
+ F2 0F 38 F1 /r = CRC32 r/m{16,32,64}, r32
+ The decoding on this is a bit unusual.
+ */
+ if (haveF2noF3(pfx)
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0xF1
+ || (insn[2] == 0xF0 && !have66(pfx)))) {
+ modrm = insn[3];
+
+ if (insn[2] == 0xF0)
+ sz = 1;
+ else
+ vassert(sz == 2 || sz == 4 || sz == 8);
+
+ IRType tyE = szToITy(sz);
+ IRTemp valE = newTemp(tyE);
+
+ if (epartIsReg(modrm)) {
+ assign(valE, getIRegE(sz, pfx, modrm));
+ delta += 3+1;
+ DIP("crc32b %s,%s\n", nameIRegE(sz, pfx, modrm),
+ nameIRegG(1==getRexW(pfx) ? 8 : 4 ,pfx, modrm));
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign(valE, loadLE(tyE, mkexpr(addr)));
+ delta += 3+alen;
+ DIP("crc32b %s,%s\n", dis_buf,
+ nameIRegG(1==getRexW(pfx) ? 8 : 4 ,pfx, modrm));
+ }
+
+ /* Somewhat funny getting/putting of the crc32 value, in order
+ to ensure that it turns into 64-bit gets and puts. However,
+ mask off the upper 32 bits so as to not get memcheck false
+ +ves around the helper call. */
+ IRTemp valG0 = newTemp(Ity_I64);
+ assign(valG0, binop(Iop_And64, getIRegG(8, pfx, modrm),
+ mkU64(0xFFFFFFFF)));
+
+ HChar* nm = NULL;
+ void* fn = NULL;
+ switch (sz) {
+ case 1: nm = "amd64g_calc_crc32b";
+ fn = &amd64g_calc_crc32b; break;
+ case 2: nm = "amd64g_calc_crc32w";
+ fn = &amd64g_calc_crc32w; break;
+ case 4: nm = "amd64g_calc_crc32l";
+ fn = &amd64g_calc_crc32l; break;
+ case 8: nm = "amd64g_calc_crc32q";
+ fn = &amd64g_calc_crc32q; break;
+ }
+ vassert(nm && fn);
+ IRTemp valG1 = newTemp(Ity_I64);
+ assign(valG1,
+ mkIRExprCCall(Ity_I64, 0/*regparm*/, nm, fn,
+ mkIRExprVec_2(mkexpr(valG0),
+ widenUto64(mkexpr(valE)))));
+
+ putIRegG(4, pfx, modrm, unop(Iop_64to32, mkexpr(valG1)));
+ goto decode_success;
+ }
+
/* ---------------------------------------------------- */
/* --- end of the SSE4 decoder --- */
/* ---------------------------------------------------- */
|
|
From: Konstantin S. <kon...@gm...> - 2011-01-21 17:40:29
|
2011/1/21 Julian Seward <js...@ac...> > > > > Can you send the results from --stats=yes ? From that we can > > > see the miss rate on VG_(tt_fast) and perhaps some other > > > significant numbers. > > > > This? > > --17273-- translate: fast SP updates identified: 0 ( --%) > > --17273-- translate: generic_known SP updates identified: 0 ( --%) > > --17273-- translate: generic_unknown SP updates identified: 0 ( --%) > > --17273-- tt/tc: 14,941,381 tt lookups requiring 97,504,544 probes > > --17273-- tt/tc: 14,941,381 fast-cache updates, 13 flushes > > --17273-- transtab: new 292,328 (4,802,899 -> 45,933,795; ratio > > 95:10) [0 scs] > > --17273-- transtab: dumped 0 (0 -> ??) > > --17273-- transtab: discarded 151 (1,798 -> ??) > > --17273-- scheduler: 736,878,221 jumps (bb entries). > > --17273-- scheduler: 9,067/37,285,698 major/minor sched events. > > --17273-- sanity: 9068 cheap, 115 expensive checks. > > --17273-- exectx: 769 lists, 0 contexts (avg 0 per list) > > --17273-- exectx: 0 searches, 0 full compares (0 per 1000) > > --17273-- exectx: 0 cmp2, 0 cmp4, 0 cmpAll > > --17273-- errormgr: 0 supplist searches, 0 comparisons during search > > --17273-- errormgr: 0 errlist searches, 0 comparisons during search > > Yes, this. But .. are these the numbers from the run that had the > unexpectedly high costs? 
Yes (unless I am very much mistaken) The profile was like this: 143136 51.6351 tsan-amd64-linux tsan-amd64-linux vgPlain_search_transtab 14517 5.2369 tsan-amd64-linux tsan-amd64-linux vgPlain_discard_translations 13297 4.7968 tsan-amd64-linux tsan-amd64-linux delete_translations_in_sector_eclass 12440 4.4876 tsan-amd64-linux tsan-amd64-linux ThreadSanitizerHandleTrace(int, TraceInfo*, unsigned long*) 11463 4.1352 tsan-amd64-linux tsan-amd64-linux vgPlain_run_innerloop__dispatch_unprofiled 10679 3.8524 anon (tgid:17291 range:0x404797a000-0x40525d2000) tsan-amd64-linux anon (tgid:17291 range:0x404797a000-0x40525d2000) 6528 2.3549 tsan-amd64-linux tsan-amd64-linux vgPlain_run_innerloop 6259 2.2579 tsan-amd64-linux tsan-amd64-linux invalidateFastCache > These numbers look normal to me: 736 million > queries in the fast cache, 14.941 million misses and lookups in the > main table (which is what VG_(search_transtab) does), and about 6 > hash probes per lookup (97,504,544 / 14,941,381). > > J > |
|
From: Julian S. <js...@ac...> - 2011-01-21 17:28:25
|
> > Can you send the results from --stats=yes ? From that we can > > see the miss rate on VG_(tt_fast) and perhaps some other > > significant numbers. > > This? > --17273-- translate: fast SP updates identified: 0 ( --%) > --17273-- translate: generic_known SP updates identified: 0 ( --%) > --17273-- translate: generic_unknown SP updates identified: 0 ( --%) > --17273-- tt/tc: 14,941,381 tt lookups requiring 97,504,544 probes > --17273-- tt/tc: 14,941,381 fast-cache updates, 13 flushes > --17273-- transtab: new 292,328 (4,802,899 -> 45,933,795; ratio > 95:10) [0 scs] > --17273-- transtab: dumped 0 (0 -> ??) > --17273-- transtab: discarded 151 (1,798 -> ??) > --17273-- scheduler: 736,878,221 jumps (bb entries). > --17273-- scheduler: 9,067/37,285,698 major/minor sched events. > --17273-- sanity: 9068 cheap, 115 expensive checks. > --17273-- exectx: 769 lists, 0 contexts (avg 0 per list) > --17273-- exectx: 0 searches, 0 full compares (0 per 1000) > --17273-- exectx: 0 cmp2, 0 cmp4, 0 cmpAll > --17273-- errormgr: 0 supplist searches, 0 comparisons during search > --17273-- errormgr: 0 errlist searches, 0 comparisons during search Yes, this. But .. are these the numbers from the run that had the unexpectedly high costs? These numbers look normal to me: 736 million queries in the fast cache, 14.941 million misses and lookups in the main table (which is what VG_(search_transtab) does), and about 6 hash probes per lookup (97,504,544 / 14,941,381). J |
|
From: Konstantin S. <kon...@gm...> - 2011-01-21 16:12:57
|
2011/1/21 Julian Seward <js...@ac...> > > I am really surprised to see this. I know that vgPlain_search_transtab > does take some time, but it's not much more than 2 or 3 %. Especially > after I put in some hacks to make it cheaper, some time around 3.6.0 > (not sure when). > > The guest->host mapping is cached in a direct-mapped cache, > VG_(tt_fast), and VG_(search_transtab) is only used when > the cache misses. But the cache typically has a 99% hit > rate, so VG_(search_transtab) should not see much action. > > The only way I can see is that you are jumping between two > pieces of code which are exactly 2^N bytes apart (for N=17, > or something like that), in the address space. > Then the > cache will miss on each reference because both addresses map > to the same line and there is no associativity and no > victim cache. > > Can you send the results from --stats=yes ? From that we can > see the miss rate on VG_(tt_fast) and perhaps some other > significant numbers. > This? --17273-- translate: fast SP updates identified: 0 ( --%) --17273-- translate: generic_known SP updates identified: 0 ( --%) --17273-- translate: generic_unknown SP updates identified: 0 ( --%) --17273-- tt/tc: 14,941,381 tt lookups requiring 97,504,544 probes --17273-- tt/tc: 14,941,381 fast-cache updates, 13 flushes --17273-- transtab: new 292,328 (4,802,899 -> 45,933,795; ratio 95:10) [0 scs] --17273-- transtab: dumped 0 (0 -> ??) --17273-- transtab: discarded 151 (1,798 -> ??) --17273-- scheduler: 736,878,221 jumps (bb entries). --17273-- scheduler: 9,067/37,285,698 major/minor sched events. --17273-- sanity: 9068 cheap, 115 expensive checks. 
--17273-- exectx: 769 lists, 0 contexts (avg 0 per list) --17273-- exectx: 0 searches, 0 full compares (0 per 1000) --17273-- exectx: 0 cmp2, 0 cmp4, 0 cmpAll --17273-- errormgr: 0 supplist searches, 0 comparisons during search --17273-- errormgr: 0 errlist searches, 0 comparisons during search > > J > > On Thursday, January 13, 2011, Konstantin Serebryany wrote: > > Hi, > > > > I am running one large test (chrome browser on a heavy JS page) under > > Memcheck and ThreadSanitizer. > > The profile for ThreadSanitizer process looks like this: > > > > 151192 56.6740 tsan-amd64-linux tsan-amd64-linux > > vgPlain_search_transtab > > 10702 4.0116 tsan-amd64-linux tsan-amd64-linux > > vgPlain_run_innerloop__dispatch_unprofiled > > 9741 3.6514 tsan-amd64-linux tsan-amd64-linux > > vgPlain_discard_translations > > > > > > Most of the time is spent in the inner loop in vgPlain_search_transtab > > > > 6 0.0024 : 3807fd75: cltq > > 33 0.0133 : 3807fd77: mov 0x4241ba(%rip),%rcx # > > 384a3f38 <n_lookup_probes> > > 8 0.0032 : 3807fd7e: imul $0x1030,%rax,%rax > > 57 0.0230 : 3807fd85: lea 0xfff1(%rcx),%rbp > > > > : 3807fd8c: mov 0x384a3fa8(%rax),%rbx > > > > 175 0.0706 : 3807fd93: mov %edx,%eax > > 10 0.0040 : 3807fd95: jmp 3807fdb7 > > <vgPlain_search_transtab+0xa7> > > > > : 3807fd97: nopw 0x0(%rax,%rax,1) > > > > 3992 1.6106 : 3807fda0: cmp %rsi,0x18(%r10) > > 27213 10.9791 : 3807fda4: je 3807fe00 > > <vgPlain_search_transtab+0xf0> > > 7304 2.9468 : 3807fda6: add $0x1,%eax > > 641 0.2586 : 3807fda9: cmp $0xfff1,%eax > > 1485 0.5991 : 3807fdae: cmove %r12d,%eax > > 7334 2.9589 : 3807fdb2: cmp %rbp,%rcx > > 420 0.1694 : 3807fdb5: je 3807fde0 > > <vgPlain_search_transtab+0xd0> > > 2269 0.9154 : 3807fdb7: movslq %eax,%r10 > > 414 0.1670 : 3807fdba: add $0x1,%rcx > > 5084 2.0511 : 3807fdbe: lea (%r10,%r10,4),%r11 > > 409 0.1650 : 3807fdc2: mov %rcx,0x42416f(%rip) # > > 384a3f38 <n_lookup_probes> > > 3501 1.4125 : 3807fdc9: lea (%r10,%r11,2),%r10 > > 697 0.2812 : 3807fdcd: lea 
(%rbx,%r10,8),%r10 > > 5691 2.2960 : 3807fdd1: mov 0x8(%r10),%r11d > > 65972 26.6165 : 3807fdd5: test %r11d,%r11d > > 6220 2.5095 : 3807fdd8: je 3807fda0 > > <vgPlain_search_transtab+0x90> > > 3302 1.3322 : 3807fdda: cmp $0x2,%r11d > > 7211 2.9093 : 3807fdde: jne 3807fda6 > > <vgPlain_search_transtab+0x96> > > 11 0.0044 : 3807fde0: add $0x1,%r13d > > 40 0.0161 : 3807fde4: add $0x4,%r14 > > 11 0.0044 : 3807fde8: cmp $0x8,%r13d > > 8 0.0032 : 3807fdec: jne 3807fd6d > > <vgPlain_search_transtab+0x5d> > > > > Memcheck profile looks a bit less scary, but still most of the time is > > spent in transtab. > > > > 34472 12.4832 memcheck-amd64-linux memcheck-amd64-linux > > delete_translations_in_sector_eclass > > 31870 11.5409 memcheck-amd64-linux memcheck-amd64-linux > > vgMemCheck_helperc_MAKE_STACK_UNINIT > > 26495 9.5945 memcheck-amd64-linux memcheck-amd64-linux > > vgPlain_search_transtab > > 26203 9.4888 memcheck-amd64-linux memcheck-amd64-linux > > vgPlain_discard_translations > > > > > > Is there any known performance trouble in transtab when running jitted > > code? > > Are there any knobs one could tweak to boost transtab? > > > > Thanks! > > --kcc > > |
|
From: Julian S. <js...@ac...> - 2011-01-21 15:00:26
|
I am really surprised to see this. I know that vgPlain_search_transtab does take some time, but it's not much more than 2 or 3 %. Especially after I put in some hacks to make it cheaper, some time around 3.6.0 (not sure when). The guest->host mapping is cached in a direct-mapped cache, VG_(tt_fast), and VG_(search_transtab) is only used when the cache misses. But the cache typically has a 99% hit rate, so VG_(search_transtab) should not see much action. The only way I can see is that you are jumping between two pieces of code which are exactly 2^N bytes apart (for N=17, or something like that), in the address space. Then the cache will miss on each reference because both addresses map to the same line and there is no associativity and no victim cache. Can you send the results from --stats=yes ? From that we can see the miss rate on VG_(tt_fast) and perhaps some other significant numbers. J On Thursday, January 13, 2011, Konstantin Serebryany wrote: > Hi, > > I am running one large test (chrome browser on a heavy JS page) under > Memcheck and ThreadSanitizer. 
> The profile for ThreadSanitizer process looks like this: > > 151192 56.6740 tsan-amd64-linux tsan-amd64-linux > vgPlain_search_transtab > 10702 4.0116 tsan-amd64-linux tsan-amd64-linux > vgPlain_run_innerloop__dispatch_unprofiled > 9741 3.6514 tsan-amd64-linux tsan-amd64-linux > vgPlain_discard_translations > > > Most of the time is spent in the inner loop in vgPlain_search_transtab > > 6 0.0024 : 3807fd75: cltq > 33 0.0133 : 3807fd77: mov 0x4241ba(%rip),%rcx # > 384a3f38 <n_lookup_probes> > 8 0.0032 : 3807fd7e: imul $0x1030,%rax,%rax > 57 0.0230 : 3807fd85: lea 0xfff1(%rcx),%rbp > > : 3807fd8c: mov 0x384a3fa8(%rax),%rbx > > 175 0.0706 : 3807fd93: mov %edx,%eax > 10 0.0040 : 3807fd95: jmp 3807fdb7 > <vgPlain_search_transtab+0xa7> > > : 3807fd97: nopw 0x0(%rax,%rax,1) > > 3992 1.6106 : 3807fda0: cmp %rsi,0x18(%r10) > 27213 10.9791 : 3807fda4: je 3807fe00 > <vgPlain_search_transtab+0xf0> > 7304 2.9468 : 3807fda6: add $0x1,%eax > 641 0.2586 : 3807fda9: cmp $0xfff1,%eax > 1485 0.5991 : 3807fdae: cmove %r12d,%eax > 7334 2.9589 : 3807fdb2: cmp %rbp,%rcx > 420 0.1694 : 3807fdb5: je 3807fde0 > <vgPlain_search_transtab+0xd0> > 2269 0.9154 : 3807fdb7: movslq %eax,%r10 > 414 0.1670 : 3807fdba: add $0x1,%rcx > 5084 2.0511 : 3807fdbe: lea (%r10,%r10,4),%r11 > 409 0.1650 : 3807fdc2: mov %rcx,0x42416f(%rip) # > 384a3f38 <n_lookup_probes> > 3501 1.4125 : 3807fdc9: lea (%r10,%r11,2),%r10 > 697 0.2812 : 3807fdcd: lea (%rbx,%r10,8),%r10 > 5691 2.2960 : 3807fdd1: mov 0x8(%r10),%r11d > 65972 26.6165 : 3807fdd5: test %r11d,%r11d > 6220 2.5095 : 3807fdd8: je 3807fda0 > <vgPlain_search_transtab+0x90> > 3302 1.3322 : 3807fdda: cmp $0x2,%r11d > 7211 2.9093 : 3807fdde: jne 3807fda6 > <vgPlain_search_transtab+0x96> > 11 0.0044 : 3807fde0: add $0x1,%r13d > 40 0.0161 : 3807fde4: add $0x4,%r14 > 11 0.0044 : 3807fde8: cmp $0x8,%r13d > 8 0.0032 : 3807fdec: jne 3807fd6d > <vgPlain_search_transtab+0x5d> > > Memcheck profile looks a bit less scary, but still most of the time is > spent in transtab. 
> > 34472 12.4832 memcheck-amd64-linux memcheck-amd64-linux > delete_translations_in_sector_eclass > 31870 11.5409 memcheck-amd64-linux memcheck-amd64-linux > vgMemCheck_helperc_MAKE_STACK_UNINIT > 26495 9.5945 memcheck-amd64-linux memcheck-amd64-linux > vgPlain_search_transtab > 26203 9.4888 memcheck-amd64-linux memcheck-amd64-linux > vgPlain_discard_translations > > > Is there any known performance trouble in transtab when running jitted > code? > Are there any knobs one could tweak to boost transtab? > > Thanks! > --kcc |