From: <sv...@va...> - 2005-12-15 23:07:50
Author: njn
Date: 2005-12-15 23:07:45 +0000 (Thu, 15 Dec 2005)
New Revision: 5356
Log:
Remove some unnecessary variables.
Modified:
branches/COMPVBITS/memcheck/mc_main.c
Modified: branches/COMPVBITS/memcheck/mc_main.c
===================================================================
--- branches/COMPVBITS/memcheck/mc_main.c 2005-12-15 22:57:35 UTC (rev 5355)
+++ branches/COMPVBITS/memcheck/mc_main.c 2005-12-15 23:07:45 UTC (rev 5356)
@@ -804,13 +804,12 @@
primary map.
*/

-static SecMap** find_secmap_binder_for_addr ( Addr aA )
+static SecMap** find_secmap_binder_for_addr ( Addr a )
{
- if (aA > MAX_PRIMARY_ADDRESS) {
- AuxMapEnt* am = find_or_alloc_in_auxmap(aA);
+ if (a > MAX_PRIMARY_ADDRESS) {
+ AuxMapEnt* am = find_or_alloc_in_auxmap(a);
return &am->sm;
} else {
- UWord a = (UWord)aA;
UWord sec_no = (UWord)(a >> 16);
# if VG_DEBUG_MEMORY >= 1
tl_assert(sec_no < N_PRIMARY_MAP);
@@ -856,7 +855,7 @@
UWord vabits8 = vabits64 & 0x3;
SizeT i;
for (i = 0; i < lenT; i++) {
- set_vabits8(aA + i, vabits8);
+ set_vabits8(a + i, vabits8);
}
return;
}
@@ -2642,21 +2641,19 @@
/* ------------------------ Size = 8 ------------------------ */

static inline __attribute__((always_inline))
-ULong mc_LOADV8 ( Addr aA, Bool isBigEndian )
+ULong mc_LOADV8 ( Addr a, Bool isBigEndian )
{
- UWord a, sm_off64, vabits64;
+ UWord sm_off64, vabits64;
SecMap* sm;

PROF_EVENT(200, "mc_LOADV8");

if (VG_DEBUG_MEMORY >= 2)
- return mc_LOADVn_slow( aA, 8, isBigEndian );
+ return mc_LOADVn_slow( a, 8, isBigEndian );

- a = (UWord)aA;
-
if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,8) )) {
PROF_EVENT(201, "mc_LOADV8-slow1");
- return (UWord)mc_LOADVn_slow( aA, 8, isBigEndian );
+ return (UWord)mc_LOADVn_slow( a, 8, isBigEndian );
}

sm = get_secmap_readable_low(a);
@@ -2688,9 +2685,9 @@


static inline __attribute__((always_inline))
-void mc_STOREV8 ( Addr aA, ULong vbytes, Bool isBigEndian )
+void mc_STOREV8 ( Addr a, ULong vbytes, Bool isBigEndian )
{
- UWord a, sm_off64, vabits64;
+ UWord sm_off64, vabits64;
SecMap* sm;

PROF_EVENT(210, "mc_STOREV8");
@@ -2698,15 +2695,13 @@
// XXX: this slow case seems to be marginally faster than the fast case!
// Investigate further.
if (VG_DEBUG_MEMORY >= 2) {
- mc_STOREVn_slow( aA, 8, vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 8, vbytes, isBigEndian );
return;
}

- a = (UWord)aA;
-
if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,8) )) {
PROF_EVENT(211, "mc_STOREV8-slow1");
- mc_STOREVn_slow( aA, 8, vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 8, vbytes, isBigEndian );
return;
}

@@ -2729,12 +2724,12 @@
} else {
/* Slow but general case -- writing partially defined bytes. */
PROF_EVENT(212, "mc_STOREV8-slow2");
- mc_STOREVn_slow( aA, 8, vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 8, vbytes, isBigEndian );
}
} else {
/* Slow but general case. */
PROF_EVENT(213, "mc_STOREV8-slow3");
- mc_STOREVn_slow( aA, 8, vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 8, vbytes, isBigEndian );
}
}

@@ -2802,23 +2797,21 @@


static inline __attribute__((always_inline))
-void mc_STOREV4 ( Addr aA, UWord vbytes, Bool isBigEndian )
+void mc_STOREV4 ( Addr a, UWord vbytes, Bool isBigEndian )
{
- UWord a, sm_off, vabits32;
+ UWord sm_off, vabits32;
SecMap* sm;

PROF_EVENT(230, "mc_STOREV4");

if (VG_DEBUG_MEMORY >= 2) {
- mc_STOREVn_slow( aA, 4, (ULong)vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 4, (ULong)vbytes, isBigEndian );
return;
}

- a = (UWord)aA;
-
if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,4) )) {
PROF_EVENT(231, "mc_STOREV4-slow1");
- mc_STOREVn_slow( aA, 4, (ULong)vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 4, (ULong)vbytes, isBigEndian );
return;
}

@@ -2839,7 +2832,7 @@
} else {
// not readable/writable, or distinguished and changing state
PROF_EVENT(232, "mc_STOREV4-slow2");
- mc_STOREVn_slow( aA, 4, (ULong)vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 4, (ULong)vbytes, isBigEndian );
}
} else if (V_BITS32_INVALID == vbytes) {
if (vabits32 == (UInt)VA_BITS32_WRITABLE) {
@@ -2849,12 +2842,12 @@
} else {
// not readable/writable, or distinguished and changing state
PROF_EVENT(233, "mc_STOREV4-slow3");
- mc_STOREVn_slow( aA, 4, (ULong)vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 4, (ULong)vbytes, isBigEndian );
}
} else {
// Partially defined word
PROF_EVENT(234, "mc_STOREV4-slow4");
- mc_STOREVn_slow( aA, 4, (ULong)vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 4, (ULong)vbytes, isBigEndian );
}
//---------------------------------------------------------------------------
#else
@@ -2873,12 +2866,12 @@
} else {
/* Slow but general case -- writing partially defined bytes. */
PROF_EVENT(232, "mc_STOREV4-slow2");
- mc_STOREVn_slow( aA, 4, (ULong)vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 4, (ULong)vbytes, isBigEndian );
}
} else {
/* Slow but general case. */
PROF_EVENT(233, "mc_STOREV4-slow3");
- mc_STOREVn_slow( aA, 4, (ULong)vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 4, (ULong)vbytes, isBigEndian );
}
#endif
//---------------------------------------------------------------------------
@@ -2899,21 +2892,19 @@
/* ------------------------ Size = 2 ------------------------ */

static inline __attribute__((always_inline))
-UWord mc_LOADV2 ( Addr aA, Bool isBigEndian )
+UWord mc_LOADV2 ( Addr a, Bool isBigEndian )
{
- UWord a, sm_off, vabits32;
+ UWord sm_off, vabits32;
SecMap* sm;

PROF_EVENT(240, "mc_LOADV2");

if (VG_DEBUG_MEMORY >= 2)
- return (UWord)mc_LOADVn_slow( aA, 2, isBigEndian );
+ return (UWord)mc_LOADVn_slow( a, 2, isBigEndian );

- a = (UWord)aA;
-
if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,2) )) {
PROF_EVENT(241, "mc_LOADV2-slow1");
- return (UWord)mc_LOADVn_slow( aA, 2, isBigEndian );
+ return (UWord)mc_LOADVn_slow( a, 2, isBigEndian );
}

sm = get_secmap_readable_low(a);
@@ -2930,7 +2921,7 @@
// XXX: could extract the vabits16 and check it first... (see
// LOADV1)... depends how common this case is.
PROF_EVENT(242, "mc_LOADV2-slow2");
- return (UWord)mc_LOADVn_slow( aA, 2, isBigEndian );
+ return (UWord)mc_LOADVn_slow( a, 2, isBigEndian );
}
}

@@ -2947,23 +2938,21 @@


static inline __attribute__((always_inline))
-void mc_STOREV2 ( Addr aA, UWord vbytes, Bool isBigEndian )
+void mc_STOREV2 ( Addr a, UWord vbytes, Bool isBigEndian )
{
- UWord a, sm_off, vabits32;
+ UWord sm_off, vabits32;
SecMap* sm;

PROF_EVENT(250, "mc_STOREV2");

if (VG_DEBUG_MEMORY >= 2) {
- mc_STOREVn_slow( aA, 2, (ULong)vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 2, (ULong)vbytes, isBigEndian );
return;
}

- a = (UWord)aA;
-
if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,2) )) {
PROF_EVENT(251, "mc_STOREV2-slow1");
- mc_STOREVn_slow( aA, 2, (ULong)vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 2, (ULong)vbytes, isBigEndian );
return;
}

@@ -2979,22 +2968,20 @@
// Convert full V-bits in register to compact 2-bit form.
// XXX: is it best to check for VALID before INVALID?
if (V_BITS16_VALID == vbytes) {
- //mc_STOREVn_slow( aA, 2, (ULong)vbytes, isBigEndian );
insert_vabits16_into_vabits32( a, VA_BITS16_READABLE,
&(sm->vabits32[sm_off]) );
} else if (V_BITS16_INVALID == vbytes) {
- //mc_STOREVn_slow( aA, 2, (ULong)vbytes, isBigEndian );
insert_vabits16_into_vabits32( a, VA_BITS16_WRITABLE,
&(sm->vabits32[sm_off]) );
} else {
/* Slow but general case -- writing partially defined bytes. */
PROF_EVENT(252, "mc_STOREV2-slow2");
- mc_STOREVn_slow( aA, 2, (ULong)vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 2, (ULong)vbytes, isBigEndian );
}
} else {
/* Slow but general case. */
PROF_EVENT(253, "mc_STOREV2-slow3");
- mc_STOREVn_slow( aA, 2, (ULong)vbytes, isBigEndian );
+ mc_STOREVn_slow( a, 2, (ULong)vbytes, isBigEndian );
}
}

@@ -3014,22 +3001,20 @@
/* Note: endianness is irrelevant for size == 1 */

VG_REGPARM(1)
-UWord MC_(helperc_LOADV1) ( Addr aA )
+UWord MC_(helperc_LOADV1) ( Addr a )
{
- UWord a, sm_off, vabits32;
+ UWord sm_off, vabits32;
SecMap* sm;

PROF_EVENT(260, "helperc_LOADV1");

# if VG_DEBUG_MEMORY >= 2
- return (UWord)mc_LOADVn_slow( aA, 1, False/*irrelevant*/ );
+ return (UWord)mc_LOADVn_slow( a, 1, False/*irrelevant*/ );
# endif

- a = (UWord)aA;
-
if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,1) )) {
PROF_EVENT(261, "helperc_LOADV1-slow1");
- return (UWord)mc_LOADVn_slow( aA, 1, False/*irrelevant*/ );
+ return (UWord)mc_LOADVn_slow( a, 1, False/*irrelevant*/ );
}

sm = get_secmap_readable_low(a);
@@ -3050,30 +3035,28 @@
else {
/* Slow but general case. */
PROF_EVENT(262, "helperc_LOADV1-slow2");
- return (UWord)mc_LOADVn_slow( aA, 1, False/*irrelevant*/ );
+ return (UWord)mc_LOADVn_slow( a, 1, False/*irrelevant*/ );
}
}
}


VG_REGPARM(2)
-void MC_(helperc_STOREV1) ( Addr aA, UWord vbyte )
+void MC_(helperc_STOREV1) ( Addr a, UWord vbyte )
{
- UWord a, sm_off, vabits32;
+ UWord sm_off, vabits32;
SecMap* sm;

PROF_EVENT(270, "helperc_STOREV1");

# if VG_DEBUG_MEMORY >= 2
- mc_STOREVn_slow( aA, 1, (ULong)vbyte, False/*irrelevant*/ );
+ mc_STOREVn_slow( a, 1, (ULong)vbyte, False/*irrelevant*/ );
return;
# endif

- a = (UWord)aA;
-
if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,1) )) {
PROF_EVENT(271, "helperc_STOREV1-slow1");
- mc_STOREVn_slow( aA, 1, (ULong)vbyte, False/*irrelevant*/ );
+ mc_STOREVn_slow( a, 1, (ULong)vbyte, False/*irrelevant*/ );
return;
}

@@ -4124,6 +4107,8 @@

tl_assert( mc_expensive_sanity_check() );

+ // {LOADV,STOREV}[8421] will all fail horribly if this isn't true.
+ tl_assert(sizeof(UWord) == sizeof(Addr));
}

VG_DETERMINE_INTERFACE_VERSION(mc_pre_clo_init)
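The assertion r5356 adds at the bottom is what justifies deleting all the intermediate "a = (UWord)aA" copies: once Addr and UWord are known to be the same width, an Addr can feed the shift-and-mask arithmetic directly. A minimal standalone sketch of the idea (the typedefs below are stand-ins, not Valgrind's real headers):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uintptr_t UWord;   /* host machine word (stand-in definition)    */
typedef uintptr_t Addr;    /* guest address, same width by design        */

int main(void)
{
    /* The invariant asserted at startup: if Addr and UWord ever differed
       in size, treating an Addr as a UWord in the fast-path bit
       arithmetic would silently truncate it. */
    assert(sizeof(UWord) == sizeof(Addr));

    Addr a = 0x12345678u;
    /* With equal widths, the address can be shifted and masked directly;
       no intermediate UWord copy is needed. */
    UWord sec_no = (UWord)(a >> 16);
    printf("primary-map index for %#lx is %#lx\n",
           (unsigned long)a, (unsigned long)sec_no);
    return 0;
}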
From: <sv...@va...> - 2005-12-15 22:57:42
Author: njn
Date: 2005-12-15 22:57:35 +0000 (Thu, 15 Dec 2005)
New Revision: 5355
Log:
Factor out the masking from {LOADV,STOREV}[1248]. Cuts 30 lines and makes
things more readable.
Modified:
branches/COMPVBITS/memcheck/mc_main.c
Modified: branches/COMPVBITS/memcheck/mc_main.c
===================================================================
--- branches/COMPVBITS/memcheck/mc_main.c 2005-12-15 22:40:32 UTC (rev 5354)
+++ branches/COMPVBITS/memcheck/mc_main.c 2005-12-15 22:57:35 UTC (rev 5355)
@@ -2632,12 +2632,19 @@
are a UWord, and for STOREV8 they are a ULong.
*/

+/* If any part of '_a' indicated by the mask is 1, either
+ '_a' is not naturally '_sz'-aligned, or it exceeds the range
+ covered by the primary map. */
+#define UNALIGNED_OR_HIGH(_a,_sz) ((_a) & MASK((_sz)))
+#define MASK(_sz) ( ~((0x10000-(_sz)) | ((N_PRIMARY_MAP-1) << 16)) )
+
+
/* ------------------------ Size = 8 ------------------------ */

static inline __attribute__((always_inline))
ULong mc_LOADV8 ( Addr aA, Bool isBigEndian )
{
- UWord mask, a, sm_off64, vabits64;
+ UWord a, sm_off64, vabits64;
SecMap* sm;

PROF_EVENT(200, "mc_LOADV8");
@@ -2645,14 +2652,9 @@
if (VG_DEBUG_MEMORY >= 2)
return mc_LOADVn_slow( aA, 8, isBigEndian );

- mask = ~((0x10000-8) | ((N_PRIMARY_MAP-1) << 16));
- a = (UWord)aA;
+ a = (UWord)aA;

- /* If any part of 'a' indicated by the mask is 1, either */
- /* 'a' is not naturally aligned, or 'a' exceeds the range */
- /* covered by the primary map. Either way we defer to the */
- /* slow-path case. */
- if (EXPECTED_NOT_TAKEN(a & mask)) {
+ if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,8) )) {
PROF_EVENT(201, "mc_LOADV8-slow1");
return (UWord)mc_LOADVn_slow( aA, 8, isBigEndian );
}
@@ -2688,7 +2690,7 @@
static inline __attribute__((always_inline))
void mc_STOREV8 ( Addr aA, ULong vbytes, Bool isBigEndian )
{
- UWord mask, a, sm_off64, vabits64;
+ UWord a, sm_off64, vabits64;
SecMap* sm;

PROF_EVENT(210, "mc_STOREV8");
@@ -2700,14 +2702,9 @@
return;
}

- mask = ~((0x10000-8) | ((N_PRIMARY_MAP-1) << 16));
- a = (UWord)aA;
+ a = (UWord)aA;

- /* If any part of 'a' indicated by the mask is 1, either */
- /* 'a' is not naturally aligned, or 'a' exceeds the range */
- /* covered by the primary map. Either way we defer to the */
- /* slow-path case. */
- if (EXPECTED_NOT_TAKEN(a & mask)) {
+ if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,8) )) {
PROF_EVENT(211, "mc_STOREV8-slow1");
mc_STOREVn_slow( aA, 8, vbytes, isBigEndian );
return;
@@ -2758,7 +2755,7 @@
static inline __attribute__((always_inline))
UWord mc_LOADV4 ( Addr a, Bool isBigEndian )
{
- UWord mask, sm_off, vabits32;
+ UWord sm_off, vabits32;
SecMap* sm;

PROF_EVENT(220, "mc_LOADV4");
@@ -2766,13 +2763,7 @@
if (VG_DEBUG_MEMORY >= 2)
return (UWord)mc_LOADVn_slow( a, 4, isBigEndian );

- mask = ~((0x10000-4) | ((N_PRIMARY_MAP-1) << 16));
-
- /* If any part of 'a' indicated by the mask is 1, either */
- /* 'a' is not naturally aligned, or 'a' exceeds the range */
- /* covered by the primary map. Either way we defer to the */
- /* slow-path case. */
- if (EXPECTED_NOT_TAKEN(a & mask)) {
+ if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,4) )) {
PROF_EVENT(221, "mc_LOADV4-slow1");
return (UWord)mc_LOADVn_slow( a, 4, isBigEndian );
}
@@ -2813,7 +2804,7 @@
static inline __attribute__((always_inline))
void mc_STOREV4 ( Addr aA, UWord vbytes, Bool isBigEndian )
{
- UWord mask, a, sm_off, vabits32;
+ UWord a, sm_off, vabits32;
SecMap* sm;

PROF_EVENT(230, "mc_STOREV4");
@@ -2823,14 +2814,9 @@
return;
}

- mask = ~((0x10000-4) | ((N_PRIMARY_MAP-1) << 16));
- a = (UWord)aA;
+ a = (UWord)aA;

- /* If any part of 'a' indicated by the mask is 1, either */
- /* 'a' is not naturally aligned, or 'a' exceeds the range */
- /* covered by the primary map. Either way we defer to the */
- /* slow-path case. */
- if (EXPECTED_NOT_TAKEN(a & mask)) {
+ if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,4) )) {
PROF_EVENT(231, "mc_STOREV4-slow1");
mc_STOREVn_slow( aA, 4, (ULong)vbytes, isBigEndian );
return;
@@ -2915,7 +2901,7 @@
static inline __attribute__((always_inline))
UWord mc_LOADV2 ( Addr aA, Bool isBigEndian )
{
- UWord mask, a, sm_off, vabits32;
+ UWord a, sm_off, vabits32;
SecMap* sm;

PROF_EVENT(240, "mc_LOADV2");
@@ -2923,14 +2909,9 @@
if (VG_DEBUG_MEMORY >= 2)
return (UWord)mc_LOADVn_slow( aA, 2, isBigEndian );

- mask = ~((0x10000-2) | ((N_PRIMARY_MAP-1) << 16));
- a = (UWord)aA;
+ a = (UWord)aA;

- /* If any part of 'a' indicated by the mask is 1, either */
- /* 'a' is not naturally aligned, or 'a' exceeds the range */
- /* covered by the primary map. Either way we defer to the */
- /* slow-path case. */
- if (EXPECTED_NOT_TAKEN(a & mask)) {
+ if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,2) )) {
PROF_EVENT(241, "mc_LOADV2-slow1");
return (UWord)mc_LOADVn_slow( aA, 2, isBigEndian );
}
@@ -2968,7 +2949,7 @@
static inline __attribute__((always_inline))
void mc_STOREV2 ( Addr aA, UWord vbytes, Bool isBigEndian )
{
- UWord mask, a, sm_off, vabits32;
+ UWord a, sm_off, vabits32;
SecMap* sm;

PROF_EVENT(250, "mc_STOREV2");
@@ -2978,14 +2959,9 @@
return;
}

- mask = ~((0x10000-2) | ((N_PRIMARY_MAP-1) << 16));
- a = (UWord)aA;
+ a = (UWord)aA;

- /* If any part of 'a' indicated by the mask is 1, either */
- /* 'a' is not naturally aligned, or 'a' exceeds the range */
- /* covered by the primary map. Either way we defer to the */
- /* slow-path case. */
- if (EXPECTED_NOT_TAKEN(a & mask)) {
+ if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,2) )) {
PROF_EVENT(251, "mc_STOREV2-slow1");
mc_STOREVn_slow( aA, 2, (ULong)vbytes, isBigEndian );
return;
@@ -3040,7 +3016,7 @@
VG_REGPARM(1)
UWord MC_(helperc_LOADV1) ( Addr aA )
{
- UWord mask, a, sm_off, vabits32;
+ UWord a, sm_off, vabits32;
SecMap* sm;

PROF_EVENT(260, "helperc_LOADV1");
@@ -3049,13 +3025,9 @@
return (UWord)mc_LOADVn_slow( aA, 1, False/*irrelevant*/ );
# endif

- mask = ~((0x10000-1) | ((N_PRIMARY_MAP-1) << 16));
- a = (UWord)aA;
+ a = (UWord)aA;

- /* If any part of 'a' indicated by the mask is 1, it means 'a'
- exceeds the range covered by the primary map. In which case we
- defer to the slow-path case. */
- if (EXPECTED_NOT_TAKEN(a & mask)) {
+ if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,1) )) {
PROF_EVENT(261, "helperc_LOADV1-slow1");
return (UWord)mc_LOADVn_slow( aA, 1, False/*irrelevant*/ );
}
@@ -3087,7 +3059,7 @@
VG_REGPARM(2)
void MC_(helperc_STOREV1) ( Addr aA, UWord vbyte )
{
- UWord mask, a, sm_off, vabits32;
+ UWord a, sm_off, vabits32;
SecMap* sm;

PROF_EVENT(270, "helperc_STOREV1");
@@ -3097,12 +3069,9 @@
return;
# endif

- mask = ~((0x10000-1) | ((N_PRIMARY_MAP-1) << 16));
- a = (UWord)aA;
- /* If any part of 'a' indicated by the mask is 1, it means 'a'
- exceeds the range covered by the primary map. In which case we
- defer to the slow-path case. */
- if (EXPECTED_NOT_TAKEN(a & mask)) {
+ a = (UWord)aA;
+
+ if (EXPECTED_NOT_TAKEN( UNALIGNED_OR_HIGH(a,1) )) {
PROF_EVENT(271, "helperc_STOREV1-slow1");
mc_STOREVn_slow( aA, 1, (ULong)vbyte, False/*irrelevant*/ );
return;
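The two macros r5355 introduces merge the alignment test and the primary-map range test into a single AND: MASK(sz) has 1s in the low bits that must be clear for an sz-aligned address, and in every bit above the range the primary map covers. A small standalone demonstration of how the mask behaves (N_PRIMARY_MAP is assumed to be 2^19 here, matching the 64-bit scheme described in r5353; the real value is configuration-dependent):

#include <stdio.h>
#include <stdint.h>

#define N_PRIMARY_MAP (1ULL << 19)

/* Same shape as the patch's macros: one AND checks both conditions. */
#define MASK(_sz) (~((0x10000ULL - (_sz)) | ((N_PRIMARY_MAP - 1) << 16)))
#define UNALIGNED_OR_HIGH(_a, _sz) ((_a) & MASK((_sz)))

int main(void)
{
   uint64_t aligned_low = 0x00001000;   /* 8-aligned, inside the 32GB range */
   uint64_t unaligned   = 0x00001004;   /* fails the 8-byte alignment test  */
   uint64_t high        = 1ULL << 40;   /* above the primary map's ceiling  */

   printf("aligned_low: %s\n", UNALIGNED_OR_HIGH(aligned_low, 8) ? "slow path" : "fast path");
   printf("unaligned:   %s\n", UNALIGNED_OR_HIGH(unaligned, 8)   ? "slow path" : "fast path");
   printf("high:        %s\n", UNALIGNED_OR_HIGH(high, 8)        ? "slow path" : "fast path");
   return 0;
}

For sz = 8 and N_PRIMARY_MAP = 2^19, MASK(8) works out to 0xFFFFFFF800000007: the low three bits catch misalignment, and bits 35 and up catch addresses beyond the 32GB the primary map covers.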
From: <sv...@va...> - 2005-12-15 22:40:36
Author: njn
Date: 2005-12-15 22:40:32 +0000 (Thu, 15 Dec 2005)
New Revision: 5354
Log:
Factor out a couple of common accesses to the primary map. Cuts 60 lines
and makes things more readable.
Modified:
branches/COMPVBITS/memcheck/mc_main.c
Modified: branches/COMPVBITS/memcheck/mc_main.c
===================================================================
--- branches/COMPVBITS/memcheck/mc_main.c 2005-12-15 21:53:17 UTC (rev 5353)
+++ branches/COMPVBITS/memcheck/mc_main.c 2005-12-15 22:40:32 UTC (rev 5354)
@@ -381,6 +381,16 @@

/* --------------- SecMap fundamentals --------------- */

+__attribute__((always_inline))
+static inline SecMap* get_secmap_readable_low ( Addr a )
+{
+ UWord pm_off = a >> 16;
+# if VG_DEBUG_MEMORY >= 1
+ tl_assert(pm_off < N_PRIMARY_MAP);
+# endif
+ return primary_map[ pm_off ];
+}
+
/* Produce the secmap for 'a', either from the primary map or by
ensuring there is an entry for it in the aux primary map. The
secmap may be a distinguished one as the caller will only want to
@@ -389,8 +399,7 @@
static SecMap* get_secmap_readable ( Addr a )
{
if (a <= MAX_PRIMARY_ADDRESS) {
- UWord pm_off = a >> 16;
- return primary_map[ pm_off ];
+ return get_secmap_readable_low(a);
} else {
AuxMapEnt* am = find_or_alloc_in_auxmap(a);
return am->sm;
@@ -404,16 +413,26 @@
static SecMap* maybe_get_secmap_for ( Addr a )
{
if (a <= MAX_PRIMARY_ADDRESS) {
- UWord pm_off = a >> 16;
- return primary_map[ pm_off ];
+ return get_secmap_readable_low(a);
} else {
AuxMapEnt* am = maybe_find_in_auxmap(a);
return am ? am->sm : NULL;
}
}

+// Produce the secmap for 'a', where 'a' is known to be in the primary map.
+__attribute__((always_inline))
+static inline SecMap* get_secmap_writable_low(Addr a)
+{
+ UWord pm_off = a >> 16;
+# if VG_DEBUG_MEMORY >= 1
+ tl_assert(pm_off < N_PRIMARY_MAP);
+# endif
+ if (EXPECTED_NOT_TAKEN(is_distinguished_sm(primary_map[pm_off])))
+ primary_map[pm_off] = copy_for_writing(primary_map[pm_off]);
+ return primary_map[pm_off];
+}

-
/* Produce the secmap for 'a', either from the primary map or by
ensuring there is an entry for it in the aux primary map. The
secmap may not be a distinguished one, since the caller will want
@@ -424,10 +443,7 @@
static SecMap* get_secmap_writable ( Addr a )
{
if (a <= MAX_PRIMARY_ADDRESS) {
- UWord pm_off = a >> 16;
- if (is_distinguished_sm(primary_map[ pm_off ]))
- primary_map[pm_off] = copy_for_writing(primary_map[pm_off]);
- return primary_map[pm_off];
+ return get_secmap_writable_low(a);
} else {
AuxMapEnt* am = find_or_alloc_in_auxmap(a);
if (is_distinguished_sm(am->sm))
@@ -1083,7 +1099,7 @@
static __inline__
void make_aligned_word32_writable ( Addr a )
{
- UWord sec_no, sm_off;
+ UWord sm_off;
SecMap* sm;

PROF_EVENT(300, "make_aligned_word32_writable");
@@ -1099,16 +1115,7 @@
return;
}

- sec_no = (UWord)(a >> 16);
-# if VG_DEBUG_MEMORY >= 1
- tl_assert(sec_no < N_PRIMARY_MAP);
-# endif
-
- // XXX: This is basically what get_secmap_writable is doing.
- if (EXPECTED_NOT_TAKEN(is_distinguished_sm(primary_map[sec_no])))
- primary_map[sec_no] = copy_for_writing(primary_map[sec_no]);
-
- sm = primary_map[sec_no];
+ sm = get_secmap_writable_low(a);
sm_off = SM_OFF(a);
sm->vabits32[sm_off] = VA_BITS32_WRITABLE;
}
@@ -1118,7 +1125,7 @@
static __inline__
void make_aligned_word32_noaccess ( Addr a )
{
- UWord sec_no, sm_off;
+ UWord sm_off;
SecMap* sm;

PROF_EVENT(310, "make_aligned_word32_noaccess");
@@ -1134,16 +1141,7 @@
return;
}

- sec_no = (UWord)(a >> 16);
-# if VG_DEBUG_MEMORY >= 1
- tl_assert(sec_no < N_PRIMARY_MAP);
-# endif
-
- // XXX: This is basically what get_secmap_writable is doing.
- if (EXPECTED_NOT_TAKEN(is_distinguished_sm(primary_map[sec_no])))
- primary_map[sec_no] = copy_for_writing(primary_map[sec_no]);
-
- sm = primary_map[sec_no];
+ sm = get_secmap_writable_low(a);
sm_off = SM_OFF(a);
sm->vabits32[sm_off] = VA_BITS32_NOACCESS;
}
@@ -1153,7 +1151,7 @@
static __inline__
void make_aligned_word64_writable ( Addr a )
{
- UWord sec_no, sm_off64;
+ UWord sm_off64;
SecMap* sm;

PROF_EVENT(320, "make_aligned_word64_writable");
@@ -1169,15 +1167,7 @@
return;
}

- sec_no = (UWord)(a >> 16);
-# if VG_DEBUG_MEMORY >= 1
- tl_assert(sec_no < N_PRIMARY_MAP);
-# endif
-
- if (EXPECTED_NOT_TAKEN(is_distinguished_sm(primary_map[sec_no])))
- primary_map[sec_no] = copy_for_writing(primary_map[sec_no]);
-
- sm = primary_map[sec_no];
+ sm = get_secmap_writable_low(a);
sm_off64 = SM_OFF_64(a);
((UShort*)(sm->vabits32))[sm_off64] = VA_BITS64_WRITABLE;
}
@@ -1186,7 +1176,7 @@
static __inline__
void make_aligned_word64_noaccess ( Addr a )
{
- UWord sec_no, sm_off64;
+ UWord sm_off64;
SecMap* sm;

PROF_EVENT(330, "make_aligned_word64_noaccess");
@@ -1202,15 +1192,7 @@
return;
}

- sec_no = (UWord)(a >> 16);
-# if VG_DEBUG_MEMORY >= 1
- tl_assert(sec_no < N_PRIMARY_MAP);
-# endif
-
- if (EXPECTED_NOT_TAKEN(is_distinguished_sm(primary_map[sec_no])))
- primary_map[sec_no] = copy_for_writing(primary_map[sec_no]);
-
- sm = primary_map[sec_no];
+ sm = get_secmap_writable_low(a);
sm_off64 = SM_OFF_64(a);
((UShort*)(sm->vabits32))[sm_off64] = VA_BITS64_NOACCESS;
}
@@ -2655,7 +2637,7 @@
static inline __attribute__((always_inline))
ULong mc_LOADV8 ( Addr aA, Bool isBigEndian )
{
- UWord mask, a, sec_no, sm_off64, vabits64;
+ UWord mask, a, sm_off64, vabits64;
SecMap* sm;

PROF_EVENT(200, "mc_LOADV8");
@@ -2675,12 +2657,7 @@
return (UWord)mc_LOADVn_slow( aA, 8, isBigEndian );
}

- sec_no = (UWord)(a >> 16);
-
- if (VG_DEBUG_MEMORY >= 1)
- tl_assert(sec_no < N_PRIMARY_MAP);
-
- sm = primary_map[sec_no];
+ sm = get_secmap_readable_low(a);
sm_off64 = SM_OFF_64(a);
vabits64 = ((UShort*)(sm->vabits32))[sm_off64];

@@ -2711,7 +2688,7 @@
static inline __attribute__((always_inline))
void mc_STOREV8 ( Addr aA, ULong vbytes, Bool isBigEndian )
{
- UWord mask, a, sec_no, sm_off64, vabits64;
+ UWord mask, a, sm_off64, vabits64;
SecMap* sm;

PROF_EVENT(210, "mc_STOREV8");
@@ -2736,12 +2713,7 @@
return;
}

- sec_no = (UWord)(a >> 16);
-
- if (VG_DEBUG_MEMORY >= 1)
- tl_assert(sec_no < N_PRIMARY_MAP);
-
- sm = primary_map[sec_no];
+ sm = get_secmap_readable_low(a);
sm_off64 = SM_OFF_64(a);
vabits64 = ((UShort*)(sm->vabits32))[sm_off64];

@@ -2786,7 +2758,7 @@
static inline __attribute__((always_inline))
UWord mc_LOADV4 ( Addr a, Bool isBigEndian )
{
- UWord mask, sec_no, sm_off, vabits32;
+ UWord mask, sm_off, vabits32;
SecMap* sm;

PROF_EVENT(220, "mc_LOADV4");
@@ -2805,12 +2777,7 @@
return (UWord)mc_LOADVn_slow( a, 4, isBigEndian );
}

- sec_no = (UWord)(a >> 16);
-
- if (VG_DEBUG_MEMORY >= 1)
- tl_assert(sec_no < N_PRIMARY_MAP);
-
- sm = primary_map[sec_no];
+ sm = get_secmap_readable_low(a);
sm_off = SM_OFF(a);
vabits32 = sm->vabits32[sm_off];

@@ -2846,7 +2813,7 @@
static inline __attribute__((always_inline))
void mc_STOREV4 ( Addr aA, UWord vbytes, Bool isBigEndian )
{
- UWord mask, a, sec_no, sm_off, vabits32;
+ UWord mask, a, sm_off, vabits32;
SecMap* sm;

PROF_EVENT(230, "mc_STOREV4");
@@ -2869,12 +2836,7 @@
return;
}

- sec_no = (UWord)(a >> 16);
-
- if (VG_DEBUG_MEMORY >= 1)
- tl_assert(sec_no < N_PRIMARY_MAP);
-
- sm = primary_map[sec_no];
+ sm = get_secmap_readable_low(a);
sm_off = SM_OFF(a);
vabits32 = sm->vabits32[sm_off];

@@ -2953,7 +2915,7 @@
static inline __attribute__((always_inline))
UWord mc_LOADV2 ( Addr aA, Bool isBigEndian )
{
- UWord mask, a, sec_no, sm_off, vabits32;
+ UWord mask, a, sm_off, vabits32;
SecMap* sm;

PROF_EVENT(240, "mc_LOADV2");
@@ -2973,12 +2935,7 @@
return (UWord)mc_LOADVn_slow( aA, 2, isBigEndian );
}

- sec_no = (UWord)(a >> 16);
-
- if (VG_DEBUG_MEMORY >= 1)
- tl_assert(sec_no < N_PRIMARY_MAP);
-
- sm = primary_map[sec_no];
+ sm = get_secmap_readable_low(a);
sm_off = SM_OFF(a);
vabits32 = sm->vabits32[sm_off];
// Convert V bits from compact memory form to expanded register form
@@ -3011,7 +2968,7 @@
static inline __attribute__((always_inline))
void mc_STOREV2 ( Addr aA, UWord vbytes, Bool isBigEndian )
{
- UWord mask, a, sec_no, sm_off, vabits32;
+ UWord mask, a, sm_off, vabits32;
SecMap* sm;

PROF_EVENT(250, "mc_STOREV2");
@@ -3034,12 +2991,7 @@
return;
}

- sec_no = (UWord)(a >> 16);
-
- if (VG_DEBUG_MEMORY >= 1)
- tl_assert(sec_no < N_PRIMARY_MAP);
-
- sm = primary_map[sec_no];
+ sm = get_secmap_readable_low(a);
sm_off = SM_OFF(a);
vabits32 = sm->vabits32[sm_off];
if (EXPECTED_TAKEN( !is_distinguished_sm(sm) &&
@@ -3088,7 +3040,7 @@
VG_REGPARM(1)
UWord MC_(helperc_LOADV1) ( Addr aA )
{
- UWord mask, a, sec_no, sm_off, vabits32;
+ UWord mask, a, sm_off, vabits32;
SecMap* sm;

PROF_EVENT(260, "helperc_LOADV1");
@@ -3108,13 +3060,7 @@
return (UWord)mc_LOADVn_slow( aA, 1, False/*irrelevant*/ );
}

- sec_no = (UWord)(a >> 16);
-
-# if VG_DEBUG_MEMORY >= 1
- tl_assert(sec_no < N_PRIMARY_MAP);
-# endif
-
- sm = primary_map[sec_no];
+ sm = get_secmap_readable_low(a);
sm_off = SM_OFF(a);
vabits32 = sm->vabits32[sm_off];
// Convert V bits from compact memory form to expanded register form
@@ -3141,7 +3087,7 @@
VG_REGPARM(2)
void MC_(helperc_STOREV1) ( Addr aA, UWord vbyte )
{
- UWord mask, a, sec_no, sm_off, vabits32;
+ UWord mask, a, sm_off, vabits32;
SecMap* sm;

PROF_EVENT(270, "helperc_STOREV1");
@@ -3162,13 +3108,7 @@
return;
}

- sec_no = (UWord)(a >> 16);
-
-# if VG_DEBUG_MEMORY >= 1
- tl_assert(sec_no < N_PRIMARY_MAP);
-# endif
-
- sm = primary_map[sec_no];
+ sm = get_secmap_readable_low(a);
sm_off = SM_OFF(a);
vabits32 = sm->vabits32[sm_off];
if (EXPECTED_TAKEN( !is_distinguished_sm(sm) &&
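The helpers factored out in r5354 capture the two common primary-map access patterns: a plain read-side index, and a write-side access that first un-shares any distinguished secondary. A self-contained sketch of that copy-on-write shape (types, sizes and names below are simplified stand-ins, not Valgrind's own definitions):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define N_PRIMARY_MAP 65536u   /* 32-bit layout: indexed by the top 16 address bits */

typedef struct { uint8_t vabits32[16384]; } SecMap;

static SecMap  sm_distinguished;                /* shared, conceptually read-only */
static SecMap* primary_map[N_PRIMARY_MAP];

static int is_distinguished_sm(SecMap* sm) { return sm == &sm_distinguished; }

static SecMap* copy_for_writing(SecMap* sm)
{
   SecMap* copy = malloc(sizeof *copy);         /* the real code uses its own allocator */
   memcpy(copy, sm, sizeof *copy);
   return copy;
}

/* Read path: a plain primary-map index, like get_secmap_readable_low. */
static inline SecMap* secmap_readable_low(uint32_t a)
{
   return primary_map[a >> 16];
}

/* Write path: un-share a distinguished secondary before handing it out,
   like get_secmap_writable_low. */
static inline SecMap* secmap_writable_low(uint32_t a)
{
   uint32_t pm_off = a >> 16;
   if (is_distinguished_sm(primary_map[pm_off]))
      primary_map[pm_off] = copy_for_writing(primary_map[pm_off]);
   return primary_map[pm_off];
}

int main(void)
{
   for (uint32_t i = 0; i < N_PRIMARY_MAP; i++)
      primary_map[i] = &sm_distinguished;       /* everything starts shared */

   uint32_t a = 0x8048000;                      /* some address */
   printf("before write: shared secondary? %d\n",
          is_distinguished_sm(secmap_readable_low(a)));
   secmap_writable_low(a)->vabits32[0] = 0x55;  /* forces the private copy */
   printf("after write:  shared secondary? %d\n",
          is_distinguished_sm(secmap_readable_low(a)));
   return 0;
}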
From: <sv...@va...> - 2005-12-15 21:53:23
Author: njn
Date: 2005-12-15 21:53:17 +0000 (Thu, 15 Dec 2005)
New Revision: 5353
Log:
Fix up some comments, and other minor stuff.
Modified:
branches/COMPVBITS/memcheck/mc_include.h
branches/COMPVBITS/memcheck/mc_main.c
Modified: branches/COMPVBITS/memcheck/mc_include.h
===================================================================
--- branches/COMPVBITS/memcheck/mc_include.h 2005-12-15 21:40:34 UTC (rev 5352)
+++ branches/COMPVBITS/memcheck/mc_include.h 2005-12-15 21:53:17 UTC (rev 5353)
@@ -55,10 +55,10 @@
typedef
struct _MC_Chunk {
struct _MC_Chunk* next;
- Addr data; // ptr to actual block
- SizeT size : (sizeof(UWord)*8)-2; // size requested; 30 or 62 bits
+ Addr data; // ptr to actual block
+ SizeT size : (sizeof(UWord)*8)-2; // size requested; 30 or 62 bits
MC_AllocKind allockind : 2; // which wrapper did the allocation
- ExeContext* where; // where it was allocated
+ ExeContext* where; // where it was allocated
}
MC_Chunk;

Modified: branches/COMPVBITS/memcheck/mc_main.c
===================================================================
--- branches/COMPVBITS/memcheck/mc_main.c 2005-12-15 21:40:34 UTC (rev 5352)
+++ branches/COMPVBITS/memcheck/mc_main.c 2005-12-15 21:53:17 UTC (rev 5353)
@@ -70,36 +70,66 @@


/*------------------------------------------------------------*/
+/*--- V bits and A bits ---*/
+/*------------------------------------------------------------*/
+
+/* Conceptually, every byte value has 8 V bits, which track whether Memcheck
+ thinks the corresponding value bit is defined. And every memory byte
+ has an A bit, which tracks whether Memcheck thinks the program can access
+ it safely. So every N-bit register is shadowed with N V bits, and every
+ memory byte is shadowed with 8 V bits and one A bit.
+
+ In the implementation, we use two forms of compression (compressed V bits
+ and distinguished secondary maps) to avoid the 9-bit-per-byte overhead
+ for memory.
+
+ Memcheck also tracks extra information about each heap block that is
+ allocated, for detecting memory leaks and other purposes.
+*/
+
+/*------------------------------------------------------------*/
/*--- Basic A/V bitmap representation. ---*/
/*------------------------------------------------------------*/

-/* TODO: fix this comment */
-//zz /* All reads and writes are checked against a memory map, which
-//zz records the state of all memory in the process. The memory map is
-//zz organised like this:
-//zz
-//zz The top 16 bits of an address are used to index into a top-level
-//zz map table, containing 65536 entries. Each entry is a pointer to a
-//zz second-level map, which records the accesibililty and validity
-//zz permissions for the 65536 bytes indexed by the lower 16 bits of the
-//zz address. Each byte is represented by nine bits, one indicating
-//zz accessibility, the other eight validity. So each second-level map
-//zz contains 73728 bytes. This two-level arrangement conveniently
-//zz divides the 4G address space into 64k lumps, each size 64k bytes.
-//zz
-//zz All entries in the primary (top-level) map must point to a valid
-//zz secondary (second-level) map. Since most of the 4G of address
-//zz space will not be in use -- ie, not mapped at all -- there is a
-//zz distinguished secondary map, which indicates 'not addressible and
-//zz not valid' writeable for all bytes. Entries in the primary map for
-//zz which the entire 64k is not in use at all point at this
-//zz distinguished map.
-//zz
-//zz There are actually 4 distinguished secondaries. These are used to
-//zz represent a memory range which is either not addressable (validity
-//zz doesn't matter), addressable+not valid, addressable+valid.
-//zz */
+/* All reads and writes are checked against a memory map (a.k.a. shadow
+ memory), which records the state of all memory in the process.
+
+ On 32-bit machine the memory map is organised as follows.
+ The top 16 bits of an address are used to index into a top-level
+ map table, containing 65536 entries. Each entry is a pointer to a
+ second-level map, which records the accesibililty and validity
+ permissions for the 65536 bytes indexed by the lower 16 bits of the
+ address. Each byte is represented by two bits (details are below). So
+ each second-level map contains 16384 bytes. This two-level arrangement
+ conveniently divides the 4G address space into 64k lumps, each size 64k
+ bytes.

+ All entries in the primary (top-level) map must point to a valid
+ secondary (second-level) map. Since many of the 64kB chunks will
+ have the same status for every bit -- ie. not mapped at all (for unused
+ address space) or entirely readable (for code segments) -- there are
+ three distinguished secondary maps, which indicate 'noaccess', 'writable'
+ and 'readable'. For these uniform 64kB chunks, the primary map entry
+ points to the relevant distinguished map. In practice, typically around
+ half of the addressable memory is represented with the 'writable' or
+ 'readable' distinguished secondary map, so it gives a good saving. It
+ also lets us set the V+A bits of large address regions quickly in
+ set_address_range_perms().
+
+ On 64-bit machines it's more complicated. If we followed the same basic
+ scheme we'd have a four-level table which would require too many memory
+ accesses. So instead the top-level map table has 2^19 entries (indexed
+ using bits 16..34 of the address); this covers the bottom 32GB. Any
+ accesses above 32GB are handled with a slow, sparse auxiliary table.
+ Valgrind's address space manager tries very hard to keep things below
+ this 32GB barrier so that performance doesn't suffer too much.
+
+ Note that this file has a lot of different functions for reading and
+ writing shadow memory. Only a couple are strictly necessary (eg.
+ get_vabits8 and set_vabits8), most are just specialised for specific
+ common cases to improve performance.
+*/
+
/* --------------- Basic configuration --------------- */

/* Only change this. N_PRIMARY_MAP *must* be a power of 2. */
@@ -158,16 +188,27 @@
// easily (without having to do any shifting and/or masking), and that is a
// very common operation. (Note that although each vabits32 chunk
// represents 32 bits of memory, but is only 8 bits in size.)
+//
+// The representation is "inverse" little-endian... each 4 bytes of
+// memory is represented by a 1 byte value, where:
+//
+// - the status of byte (a+0) is held in bits [1..0]
+// - the status of byte (a+1) is held in bits [3..2]
+// - the status of byte (a+2) is held in bits [5..4]
+// - the status of byte (a+3) is held in bits [7..6]
+//
+// It's "inverse" because endianness normally describes a mapping from
+// value bits to memory addresses; in this case the mapping is inverted.
+// Ie. instead of particular value bits being held in certain addresses, in
+// this case certain addresses are represented by particular value bits.
+// See insert_vabits8_into_vabits32() for an example.
//
-// XXX: something about endianness. Storing 1st byte in bits 1..0, 2nd byte
-// in bits 3..2, 3rd in 5..4, 4th in 7..6. (Little endian?)
-//
// But note that we don't compress the V bits stored in registers; they
// need to be explicit to made the shadow operations possible. Therefore
-// when moving values between registers and memory we need to convert between
-// the expanded in-register format and the compressed in-memory format.
-// This isn't so difficult, it just requires careful attention in a few
-// places.
+// when moving values between registers and memory we need to convert
+// between the expanded in-register format and the compressed in-memory
+// format. This isn't so difficult, it just requires careful attention in a
+// few places.

#define VA_BITS8_NOACCESS 0x0 // 00b
#define VA_BITS8_WRITABLE 0x1 // 01b
@@ -574,7 +615,7 @@
the bytes in the word, from the most significant down to the
least. */
ULong vbits64 = V_BITS64_INVALID;
- SizeT i = szB-1;
+ SSizeT i = szB-1; // Must be signed
SizeT n_addrs_bad = 0;
Addr ai;
Bool partial_load_exemption_applies;

PROF_EVENT(30, "mc_LOADVn_slow");
tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);

- // XXX: change this to a for loop. The loop var i must be signed.
- while (True) {
+ for (i = szB-1; i >= 0; i--) {
PROF_EVENT(31, "mc_LOADVn_slow(loop)");
ai = a+byte_offset_w(szB,bigendian,i);
vabits8 = get_vabits8(ai);
// Convert the in-memory format to in-register format.
- // XXX: We check in order of most likely to least likely...
- // XXX: could maybe have a little lookup table instead of these
- // chained conditionals? and elsewhere?
if ( VA_BITS8_READABLE == vabits8 ) { vbits8 = V_BITS8_VALID; }
else if ( VA_BITS8_WRITABLE == vabits8 ) { vbits8 = V_BITS8_INVALID; }
else if ( VA_BITS8_NOACCESS == vabits8 ) {
@@ -603,8 +640,6 @@
}
vbits64 <<= 8;
vbits64 |= vbits8;
- if (i == 0) break;
- i--;
}

/* This is a hack which avoids producing errors for code which
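The rewritten comments in r5353 describe the compressed shadow encoding: one shadow byte covers four memory bytes, with the 2-bit status of byte (a+i) held in bits [2i+1..2i]. A standalone sketch of that insert/extract arithmetic (the names here are simplified stand-ins for the patch's VA_BITS*/insert_vabits* family, not the actual functions):

#include <stdint.h>
#include <stdio.h>

#define VA_BITS2_NOACCESS  0x0   /* 00b: not addressable            */
#define VA_BITS2_WRITABLE  0x1   /* 01b: addressable, undefined     */
#define VA_BITS2_READABLE  0x2   /* 10b: addressable, fully defined */

/* Status of byte address 'a' within its 4-byte shadow chunk. */
static unsigned get_vabits2(uint8_t chunk, uint32_t a)
{
   unsigned shift = (a & 3) * 2;            /* the "inverse little-endian" slot */
   return (chunk >> shift) & 0x3;
}

static uint8_t set_vabits2(uint8_t chunk, uint32_t a, unsigned vabits2)
{
   unsigned shift = (a & 3) * 2;
   chunk &= (uint8_t)~(0x3u << shift);      /* clear the 2-bit slot */
   chunk |= (uint8_t)(vabits2 << shift);    /* drop in the new status */
   return chunk;
}

int main(void)
{
   uint8_t chunk = 0;                       /* all four bytes start as noaccess */
   chunk = set_vabits2(chunk, 0x1002, VA_BITS2_READABLE);   /* mark byte a+2 */
   for (uint32_t a = 0x1000; a < 0x1004; a++)
      printf("byte %#x -> status %u\n", a, get_vabits2(chunk, a));
   return 0;
}

This is also why the r5353 loop rewrite insists the index be signed: a count-down loop "for (i = szB-1; i >= 0; i--)" never terminates with an unsigned index, since it wraps instead of going negative.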
From: <sv...@va...> - 2005-12-15 21:40:40
Author: sewardj
Date: 2005-12-15 21:40:34 +0000 (Thu, 15 Dec 2005)
New Revision: 5352
Log:
Rewrite ppc32 dispatch loop to avoid profiling overhead, as per
today's x86 and amd64 rewrites.
Modified:
trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S
trunk/docs/internals/performance.txt
Modified: trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S
===================================================================
--- trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S 2005-12-15 21:18:34 UTC (rev 5351)
+++ trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S 2005-12-15 21:40:34 UTC (rev 5352)
@@ -1,8 +1,8 @@

-##--------------------------------------------------------------------##
-##--- The core dispatch loop, for jumping to a code address. ---##
-##--- dispatch-ppc32.S ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- The core dispatch loop, for jumping to a code address. ---*/
+/*--- dispatch-ppc32.S ---*/
+/*--------------------------------------------------------------------*/

/*
This file is part of Valgrind, a dynamic binary instrumentation
@@ -38,12 +38,20 @@
/*--- The dispatch loop. ---*/
/*------------------------------------------------------------*/

-/* signature: UWord VG_(run_innerloop) ( void* guest_state ) */
+/*----------------------------------------------------*/
+/*--- Preamble (set everything up) ---*/
+/*----------------------------------------------------*/

- .globl VG_(run_innerloop)
+/* signature:
+UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+*/
+.text
+.globl VG_(run_innerloop)
VG_(run_innerloop):
+ /* r3 holds guest_state */
+ /* r4 holds do_profiling */
+
/* ----- entry point to VG_(run_innerloop) ----- */
-
/* For Linux/ppc32 we need the SysV ABI, which uses
LR->4(parent_sp), CR->anywhere.
(The AIX ABI, used on Darwin, and maybe Linux/ppc64?,
@@ -58,10 +66,10 @@
stwu 1,-496(1) /* sp should maintain 16-byte alignment */

/* Save callee-saved registers... */
- /* r3 is live here (guest state ptr), so use r4 */
- lis 4,VG_(machine_ppc32_has_FP)@ha
- lwz 4,VG_(machine_ppc32_has_FP)@l(4)
- cmplwi 4,0
+ /* r3, r4 are live here, so use r5 */
+ lis 5,VG_(machine_ppc32_has_FP)@ha
+ lwz 5,VG_(machine_ppc32_has_FP)@l(5)
+ cmplwi 5,0
beq LafterFP1

/* Floating-point reg save area : 144 bytes */
@@ -111,43 +119,43 @@
/* It's necessary to save/restore VRSAVE in the AIX / Darwin ABI.
The Linux kernel might not actually use VRSAVE for its intended
purpose, but it should be harmless to preserve anyway. */
- /* r3 is live here (guest state ptr), so use r4 */
- lis 4,VG_(machine_ppc32_has_VMX)@ha
- lwz 4,VG_(machine_ppc32_has_VMX)@l(4)
- cmplwi 4,0
+ /* r3, r4 are live here (guest state ptr), so use r5 */
+ lis 5,VG_(machine_ppc32_has_VMX)@ha
+ lwz 5,VG_(machine_ppc32_has_VMX)@l(5)
+ cmplwi 5,0
beq LafterVMX1

/* VRSAVE save word : 32 bytes */
- mfspr 4,256 /* vrsave reg is spr number 256 */
- stw 4,244(1)
+ mfspr 5,256 /* vrsave reg is spr number 256 */
+ stw 5,244(1)

/* Alignment padding : 4 bytes */

/* Vector reg save area (quadword aligned) : 192 bytes */
- li 4,224
- stvx 31,4,1
- li 4,208
- stvx 30,4,1
- li 4,192
- stvx 29,4,1
- li 4,176
- stvx 28,4,1
- li 4,160
- stvx 27,4,1
- li 4,144
- stvx 26,4,1
- li 4,128
- stvx 25,4,1
- li 4,112
- stvx 24,4,1
- li 4,96
- stvx 23,4,1
- li 4,80
- stvx 22,4,1
- li 4,64
- stvx 21,4,1
- li 4,48
- stvx 20,4,1
+ li 5,224
+ stvx 31,5,1
+ li 5,208
+ stvx 30,5,1
+ li 5,192
+ stvx 29,5,1
+ li 5,176
+ stvx 28,5,1
+ li 5,160
+ stvx 27,5,1
+ li 5,144
+ stvx 26,5,1
+ li 5,128
+ stvx 25,5,1
+ li 5,112
+ stvx 25,5,1
+ li 5,96
+ stvx 23,5,1
+ li 5,80
+ stvx 22,5,1
+ li 5,64
+ stvx 21,5,1
+ li 5,48
+ stvx 20,5,1
LafterVMX1:

/* Save cr */
@@ -159,8 +167,9 @@
/* 32(sp) used later to check FPSCR[RM] */

/* r3 holds guest_state */
- mr 31,3
- stw 3,28(1) /* spill orig guest_state ptr */
+ /* r4 holds do_profiling */
+ mr 31,3 /* r31 (generated code gsp) = r3 */
+ stw 3,28(1) /* spill orig guest_state ptr */

/* 24(sp) used later to stop ctr reg being clobbered */
/* 20(sp) used later to load fpscr with zero */
@@ -171,40 +180,37 @@
0(sp) : back-chain
*/

-// CAB TODO: Use a caller-saved reg for orig guest_state ptr
-// - rem to set non-allocateable in isel.c
+ /* CAB TODO: Use a caller-saved reg for orig guest_state ptr
+ - rem to set non-allocateable in isel.c */

/* hold dispatch_ctr in ctr reg */
- lis 17,VG_(dispatch_ctr)@ha
- lwz 17,VG_(dispatch_ctr)@l(17)
- mtctr 17
+ lis 5,VG_(dispatch_ctr)@ha
+ lwz 5,VG_(dispatch_ctr)@l(5)
+ mtctr 5

- /* fetch %CIA into r30 */
- lwz 30,OFFSET_ppc32_CIA(31)
-
/* set host FPU control word to the default mode expected
by VEX-generated code. See comments in libvex.h for
more info. */
- lis 3,VG_(machine_ppc32_has_FP)@ha
- lwz 3,VG_(machine_ppc32_has_FP)@l(3)
- cmplwi 3,0
+ lis 5,VG_(machine_ppc32_has_FP)@ha
+ lwz 5,VG_(machine_ppc32_has_FP)@l(5)
+ cmplwi 5,0
beq LafterFP2

- /* get zero into f3 (tedious) */
- /* note: fsub 3,3,3 is not a reliable way to do this,
- since if f3 holds a NaN or similar then we don't necessarily
- wind up with zero. */
- li 3,0
- stw 3,20(1)
+ /* get zero into f3 (tedious) */
+ /* note: fsub 3,3,3 is not a reliable way to do this,
+ since if f3 holds a NaN or similar then we don't necessarily
+ wind up with zero. */
+ li 5,0
+ stw 5,20(1)
lfs 3,20(1)
mtfsf 0xFF,3 /* fpscr = f3 */
LafterFP2:

/* set host AltiVec control word to the default mode expected
by VEX-generated code. */
- lis 3,VG_(machine_ppc32_has_VMX)@ha
- lwz 3,VG_(machine_ppc32_has_VMX)@l(3)
- cmplwi 3,0
+ lis 5,VG_(machine_ppc32_has_VMX)@ha
+ lwz 5,VG_(machine_ppc32_has_VMX)@l(5)
+ cmplwi 5,0
beq LafterVMX2

vspltisw 3,0x0 /* generate zero */
@@ -214,36 +220,108 @@
/* make a stack frame for the code we are calling */
stwu 1,-16(1)

- /* fall into main loop */
+ /* fetch %CIA into r3 */
+ lwz 3,OFFSET_ppc32_CIA(31)

-/* Live regs:
- r1 (=sp)
- r30 (=CIA = jump address)
- r31 (=guest_state)
- ctr (=dispatch_ctr)
- Stack state:
- 44(r1) (=orig guest_state)
-*/
+ /* fall into main loop (the right one) */
+ /* r4 = do_profiling. It's probably trashed after here,
+ but that's OK: we don't need it after here. */
+ cmplwi 4,0
+ beq VG_(run_innerloop__dispatch_unprofiled)
+ b VG_(run_innerloop__dispatch_profiled)
+ /*NOTREACHED*/

-dispatch_boring:
+/*----------------------------------------------------*/
+/*--- NO-PROFILING (standard) dispatcher ---*/
+/*----------------------------------------------------*/
+
+.global VG_(run_innerloop__dispatch_unprofiled)
+VG_(run_innerloop__dispatch_unprofiled):
+ /* At entry: Live regs:
+ r1 (=sp)
+ r3 (=CIA = next guest address)
+ r31 (=guest_state)
+ ctr (=dispatch_ctr)
+ Stack state:
+ 44(r1) (=orig guest_state)
+ */
+
+ /* Has the guest state pointer been messed with? If yes, exit. */
+ lwz 5,44(1) /* original guest_state ptr */
+ cmpw 5,31
+ bne gsp_changed
+
/* save the jump address in the guest state */
- stw 30,OFFSET_ppc32_CIA(31)
+ stw 3,OFFSET_ppc32_CIA(31)

/* Are we out of timeslice? If yes, defer to scheduler. */
bdz counter_is_zero /* decrements ctr reg */

/* try a fast lookup in the translation cache */
/* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
- rlwinm 4,30, 2, 32-2-VG_TT_FAST_BITS, 31-2
-// CAB: use a caller-saved reg for this ?
+ rlwinm 4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
addis 5,4,VG_(tt_fast)@ha
lwz 5,VG_(tt_fast)@l(5)
lwz 6,4(5) /* big-endian, so comparing 2nd 32bit word */
- cmpw 30,6
+ cmpw 3,6
bne fast_lookup_failed

+ /* Found a match. Call tce[1], which is 8 bytes along, since
+ each tce element is a 64-bit int. */
+ addi 8,5,8
+ mtlr 8
+
+ /* stop ctr being clobbered */
+ mfctr 5
+ stw 5,40(1) /* => 40-16 = 24(1) on our parent stack */
+
+ /* run the translation */
+ blrl
+
+ /* reinstate clobbered ctr */
+ lwz 5,40(1)
+ mtctr 5
+
+ /* start over */
+ b VG_(run_innerloop__dispatch_unprofiled)
+ /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- PROFILING dispatcher (can be much slower) ---*/
+/*----------------------------------------------------*/
+
+.global VG_(run_innerloop__dispatch_profiled)
+VG_(run_innerloop__dispatch_profiled):
+ /* At entry: Live regs:
+ r1 (=sp)
+ r3 (=CIA = next guest address)
+ r31 (=guest_state)
+ ctr (=dispatch_ctr)
+ Stack state:
+ 44(r1) (=orig guest_state)
+ */
+
+ /* Has the guest state pointer been messed with? If yes, exit. */
+ lwz 5,44(1) /* original guest_state ptr */
+ cmpw 5,31
+ bne gsp_changed
+
+ /* save the jump address in the guest state */
+ stw 3,OFFSET_ppc32_CIA(31)
+
+ /* Are we out of timeslice? If yes, defer to scheduler. */
+ bdz counter_is_zero /* decrements ctr reg */
+
+ /* try a fast lookup in the translation cache */
+ /* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
+ rlwinm 4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
+ addis 5,4,VG_(tt_fast)@ha
+ lwz 5,VG_(tt_fast)@l(5)
+ lwz 6,4(5) /* big-endian, so comparing 2nd 32bit word */
+ cmpw 3,6
+ bne fast_lookup_failed
+
/* increment bb profile counter */
-// CAB: use a caller-saved reg for this ?
addis 6,4,VG_(tt_fastN)@ha
lwz 7,VG_(tt_fastN)@l(6)
lwz 8,0(7)
@@ -256,37 +334,57 @@
mtlr 8

/* stop ctr being clobbered */
-// CAB: use a caller-saved reg for this ?
-// but then (bdz) => (decr, cmp, bc)... still better than a stw?
- mfctr 9
- stw 9,40(1) /* => 40-16 = 24(1) on our parent stack */
+ mfctr 5
+ stw 5,40(1) /* => 40-16 = 24(1) on our parent stack */

+ /* run the translation */
blrl

+ /* reinstate clobbered ctr */
+ lwz 5,40(1)
+ mtctr 5

- /* On return from guest code:
- r3 holds destination (original) address.
+ /* start over */
+ b VG_(run_innerloop__dispatch_profiled)
+ /*NOTREACHED*/

- r31 may be unchanged (guest_state), or may indicate further
- details of the control transfer requested to *r3.
+/*----------------------------------------------------*/
+/*--- exit points ---*/
+/*----------------------------------------------------*/

- If r31 is unchanged (== 44(r1)), just jump next to r3.
+gsp_changed:
+ /* Someone messed with the gsp (in r31). Have to
+ defer to scheduler to resolve this. dispatch ctr
+ is not yet decremented, so no need to increment. */
+ /* %CIA is NOT up to date here. First, need to write
+ %r3 back to %CIA, but without trashing %r31 since
+ that holds the value we want to return to the scheduler.
+ Hence use %r5 transiently for the guest state pointer. */
+ lwz 5,44(1) /* original guest_state ptr */
+ stw 3,OFFSET_ppc32_CIA(5)
+ mr 3,31 /* r3 = new gsp value */
+ b run_innerloop_exit
+ /*NOTREACHED*/

- Otherwise fall out, back to the scheduler, and let it
- figure out what to do next.
- */
+counter_is_zero:
+ /* %CIA is up to date */
+ /* back out decrement of the dispatch counter */
+ mfctr 5
+ addi 5,5,1
+ mtctr 5
+ li 3,VG_TRC_INNER_COUNTERZERO
+ b run_innerloop_exit

- /* reinstate clobbered ctr */
- lwz 9,40(1)
- mtctr 9
+fast_lookup_failed:
+ /* %CIA is up to date */
+ /* back out decrement of the dispatch counter */
+ mfctr 5
+ addi 5,5,1
+ mtctr 5
+ li 3,VG_TRC_INNER_FASTMISS
+ b run_innerloop_exit

- mr 30,3 /* put CIA (=r3) in r30 */
- lwz 16,44(1) /* original guest_state ptr */
- cmpw 16,31
- beq dispatch_boring /* r31 unchanged... */

- mr 3,31 /* put return val (=r31) in r3 */
- b dispatch_exceptional

/* All exits from the dispatcher go through here.
r3 holds the return value.
@@ -301,8 +399,9 @@
cmplwi 10,0
beq LafterFP8

-/* This check avoidance may be removable if stfiwx is implemented. */
-#if !defined(ENABLE_INNER)
+ /* This check avoidance may be removable if stfiwx is
+ implemented. */
+# if !defined(ENABLE_INNER)
/* Check FPSCR & 0xFF == 0 (lowest 8bits are controls) */
mffs 4 /* fpscr -> fpr */
li 5,48
stfiwx 4,5,1 /* fpscr -> 48(r1) */
andi. 6,6,0xFF /* mask wanted bits */
cmplwi 6,0x0 /* cmp with zero */
bne invariant_violation /* branch if not zero */
-#endif
+# endif
LafterFP8:

/* Using r11 - value used again further on, so don't trash! */
@@ -445,36 +544,9 @@
addi 1,1,496 /* stack_size */
blr

-
-/* Other ways of getting out of the inner loop. Placed out-of-line to
- make it look cleaner.
-*/
-dispatch_exceptional:
- /* this is jumped to only, not fallen-through from above */
- /* save r30 in %CIA and defer to sched */
- lwz 16,44(1)
- stw 30,OFFSET_ppc32_CIA(16)
- b run_innerloop_exit
-
-fast_lookup_failed:
- /* %CIA is up to date here since dispatch_boring dominates */
- mfctr 17
- addi 17,17,1
- mtctr 17
- li 3,VG_TRC_INNER_FASTMISS
- b run_innerloop_exit
-
-counter_is_zero:
- /* %CIA is up to date here since dispatch_boring dominates */
- mfctr 17
- addi 17,17,1
- mtctr 17
- li 3,VG_TRC_INNER_COUNTERZERO
- b run_innerloop_exit
-
/* Let the linker know we don't need an executable stack */
.section .note.GNU-stack,"",@progbits

-##--------------------------------------------------------------------##
-##--- end ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- end ---*/
+/*--------------------------------------------------------------------*/
Modified: trunk/docs/internals/performance.txt
===================================================================
--- trunk/docs/internals/performance.txt 2005-12-15 21:18:34 UTC (rev 5351)
+++ trunk/docs/internals/performance.txt 2005-12-15 21:40:34 UTC (rev 5352)
@@ -14,11 +14,12 @@
- Nick improved vg_SP_update_pass() to identify more small constant
increments/decrements of SP, so the fast cases can be used more often.
Saved 1--3% on a few programs.
-- r5345,r5346: Julian improved the dispatcher so that x86 and AMD64 use
- jumps instead of call/return for calling translations, and also removed
- the --profile-flags profiling from the dispatcher unless --profile-flags
- is being used. Improved Nulgrind performance typically by 10--20%,
- and Memcheck performance typically by 2--20%.
+- r5345,r5346,r5352: Julian improved the dispatcher so that x86 and
+ AMD64 use jumps instead of call/return for calling translations.
+ Also, on x86, amd64 and ppc32, --profile-flags style profiling was
+ removed from the despatch loop unless --profile-flags is being used.
+ Improved Nulgrind performance typically by 10--20%, and Memcheck
+ performance typically by 2--20%.

COMPVBITS branch:
- Nick converted to compress V bits, initial version saved 0--5% on most
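The fast path in both dispatch loops above is a direct-mapped translation-cache probe: hash the guest address, load one entry, compare, and fall back to the scheduler on a miss. A rough C model of that rlwinm/lwz/cmpw sequence (entry layout and table size are illustrative, not the real tt_fast definitions):

#include <stdint.h>
#include <stdio.h>

#define TT_FAST_BITS 15
#define TT_FAST_SIZE (1u << TT_FAST_BITS)
#define TT_FAST_MASK (TT_FAST_SIZE - 1)

typedef struct {
   uint32_t guest_addr;      /* key: guest code address               */
   void*    host_code;       /* value: entry point of the translation */
} FastCacheEntry;

static FastCacheEntry tt_fast[TT_FAST_SIZE];

/* One hash, one load, one compare; instructions are word-aligned,
   so the low address bits are dropped before indexing. */
static void* fast_lookup(uint32_t guest_addr)
{
   FastCacheEntry* e = &tt_fast[(guest_addr >> 2) & TT_FAST_MASK];
   return (e->guest_addr == guest_addr) ? e->host_code : NULL;
}

int main(void)
{
   int dummy_translation = 0;    /* stands in for generated host code */
   uint32_t cia = 0x10004a40;
   tt_fast[(cia >> 2) & TT_FAST_MASK] =
      (FastCacheEntry){ cia, &dummy_translation };

   printf("hit:  %p\n", fast_lookup(cia));
   printf("miss: %p\n", fast_lookup(cia + 4));   /* a different guest block */
   return 0;
}

A miss here corresponds to the fast_lookup_failed exit, which returns VG_TRC_INNER_FASTMISS so the scheduler can do a full lookup or translate the block.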
From: <sv...@va...> - 2005-12-15 21:33:52
Author: sewardj
Date: 2005-12-15 21:33:50 +0000 (Thu, 15 Dec 2005)
New Revision: 1496
Log:
Make suitable changes for ppc32/ppc64 following recent x86/amd64
dispatch changes. Note, this doesn't change the generated code at
all.
Modified:
trunk/priv/host-ppc32/hdefs.c
trunk/priv/host-ppc32/hdefs.h
Modified: trunk/priv/host-ppc32/hdefs.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- trunk/priv/host-ppc32/hdefs.c 2005-12-15 15:45:20 UTC (rev 1495)
+++ trunk/priv/host-ppc32/hdefs.c 2005-12-15 21:33:50 UTC (rev 1496)
@@ -227,6 +227,7 @@
(*arr)[i++] =3D hregPPC_GPR12(mode64);
}
// GPR13 =3D thread specific pointer
+ // GPR 14 and above are callee save. Yay.
(*arr)[i++] =3D hregPPC_GPR14(mode64);
(*arr)[i++] =3D hregPPC_GPR15(mode64);
(*arr)[i++] =3D hregPPC_GPR16(mode64);
@@ -243,9 +244,12 @@
(*arr)[i++] =3D hregPPC_GPR27(mode64);
(*arr)[i++] =3D hregPPC_GPR28(mode64);
(*arr)[i++] =3D hregPPC_GPR29(mode64);
- // GPR30 AltiVec spill reg temporary
- // GPR31 =3D GuestStatePtr
+ // GPR30 is reserved as AltiVec spill reg temporary
+ // GPR31 is reserved for the GuestStatePtr
=20
+ /* Don't waste the reg-allocs's time trawling through zillions of
+ FP registers - they mostly will never be used. We'll tolerate
+ the occasional extra spill instead. */
(*arr)[i++] =3D hregPPC32_FPR0();
(*arr)[i++] =3D hregPPC32_FPR1();
(*arr)[i++] =3D hregPPC32_FPR2();
@@ -254,32 +258,8 @@
   (*arr)[i++] = hregPPC32_FPR5();
   (*arr)[i++] = hregPPC32_FPR6();
   (*arr)[i++] = hregPPC32_FPR7();
-/*
-   (*arr)[i++] = hregPPC32_FPR8();
-   (*arr)[i++] = hregPPC32_FPR9();
-   (*arr)[i++] = hregPPC32_FPR10();
-   (*arr)[i++] = hregPPC32_FPR11();
-   (*arr)[i++] = hregPPC32_FPR12();
-   (*arr)[i++] = hregPPC32_FPR13();
-   (*arr)[i++] = hregPPC32_FPR14();
-   (*arr)[i++] = hregPPC32_FPR15();
-   (*arr)[i++] = hregPPC32_FPR16();
-   (*arr)[i++] = hregPPC32_FPR17();
-   (*arr)[i++] = hregPPC32_FPR18();
-   (*arr)[i++] = hregPPC32_FPR19();
-   (*arr)[i++] = hregPPC32_FPR20();
-   (*arr)[i++] = hregPPC32_FPR21();
-   (*arr)[i++] = hregPPC32_FPR22();
-   (*arr)[i++] = hregPPC32_FPR23();
-   (*arr)[i++] = hregPPC32_FPR24();
-   (*arr)[i++] = hregPPC32_FPR25();
-   (*arr)[i++] = hregPPC32_FPR26();
-   (*arr)[i++] = hregPPC32_FPR27();
-   (*arr)[i++] = hregPPC32_FPR28();
-   (*arr)[i++] = hregPPC32_FPR29();
-   (*arr)[i++] = hregPPC32_FPR30();
-   (*arr)[i++] = hregPPC32_FPR31();
-*/
+
+   /* Same deal re Altivec */
   (*arr)[i++] = hregPPC32_VR0();
   (*arr)[i++] = hregPPC32_VR1();
   (*arr)[i++] = hregPPC32_VR2();
@@ -288,32 +268,7 @@
   (*arr)[i++] = hregPPC32_VR5();
   (*arr)[i++] = hregPPC32_VR6();
   (*arr)[i++] = hregPPC32_VR7();
-/*
-   (*arr)[i++] = hregPPC32_VR8();
-   (*arr)[i++] = hregPPC32_VR9();
-   (*arr)[i++] = hregPPC32_VR10();
-   (*arr)[i++] = hregPPC32_VR11();
-   (*arr)[i++] = hregPPC32_VR12();
-   (*arr)[i++] = hregPPC32_VR13();
-   (*arr)[i++] = hregPPC32_VR14();
-   (*arr)[i++] = hregPPC32_VR15();
-   (*arr)[i++] = hregPPC32_VR16();
-   (*arr)[i++] = hregPPC32_VR17();
-   (*arr)[i++] = hregPPC32_VR18();
-   (*arr)[i++] = hregPPC32_VR19();
-   (*arr)[i++] = hregPPC32_VR20();
-   (*arr)[i++] = hregPPC32_VR21();
-   (*arr)[i++] = hregPPC32_VR22();
-   (*arr)[i++] = hregPPC32_VR23();
-   (*arr)[i++] = hregPPC32_VR24();
-   (*arr)[i++] = hregPPC32_VR25();
-   (*arr)[i++] = hregPPC32_VR26();
-   (*arr)[i++] = hregPPC32_VR27();
-   (*arr)[i++] = hregPPC32_VR28();
-   (*arr)[i++] = hregPPC32_VR29();
-   (*arr)[i++] = hregPPC32_VR30();
-   (*arr)[i++] = hregPPC32_VR31();
-*/
+
   vassert(i == *nregs);
}
@@ -1263,7 +1218,9 @@
vex_printf("if (%s) ", showPPC32CondCode(i->Pin.Goto.cond));
}
vex_printf("{ ");
-   if (i->Pin.Goto.jk != Ijk_Boring) {
+   if (i->Pin.Goto.jk != Ijk_Boring
+       && i->Pin.Goto.jk != Ijk_Call
+       && i->Pin.Goto.jk != Ijk_Ret) {
vex_printf("li %%r31,$");
ppIRJumpKind(i->Pin.Goto.jk);
vex_printf(" ; ");
@@ -1670,7 +1627,12 @@
addRegUsage_PPC32RI(u, i->Pin.Goto.dst);
/* GPR3 holds destination address from Pin_Goto */
addHRegUse(u, HRmWrite, hregPPC_GPR3(mode64));
-   if (i->Pin.Goto.jk != Ijk_Boring)
+   if (i->Pin.Goto.jk != Ijk_Boring
+       && i->Pin.Goto.jk != Ijk_Call
+       && i->Pin.Goto.jk != Ijk_Ret)
+ /* note, this is irrelevant since the guest state pointer
+ register is not actually available to the allocator.
+ But still .. */
addHRegUse(u, HRmWrite, GuestStatePtr(mode64));
return;
case Pin_CMov:
@@ -2437,9 +2399,14 @@
/* Emit an instruction into buf and return the number of bytes used.
   Note that buf is not the insn's final place, and therefore it is
-  imperative to emit position-independent code. */
+  imperative to emit position-independent code.
+
+  Note, dispatch should always be NULL since ppc32/ppc64 backends
+  use a call-return scheme to get from the dispatcher to generated
+  code and back.
+*/
-Int emit_PPC32Instr ( UChar* buf, Int nbuf, PPC32Instr* i, Bool mode64 )
+Int emit_PPC32Instr ( UChar* buf, Int nbuf, PPC32Instr* i,
+                      Bool mode64, void* dispatch )
{
   UChar* p    = &buf[0];
   UChar* ptmp = p;
@@ -2789,6 +2756,8 @@
      PPC32CondCode cond = i->Pin.Goto.cond;
      UInt r_dst;
      ULong imm_dst;
+
+     vassert(dispatch == NULL);
/* First off, if this is conditional, create a conditional
jump over the rest of it. */
Modified: trunk/priv/host-ppc32/hdefs.h
===================================================================
--- trunk/priv/host-ppc32/hdefs.h 2005-12-15 15:45:20 UTC (rev 1495)
+++ trunk/priv/host-ppc32/hdefs.h 2005-12-15 21:33:50 UTC (rev 1496)
@@ -88,9 +88,9 @@
extern HReg hregPPC_GPR26 ( Bool mode64 );
extern HReg hregPPC_GPR27 ( Bool mode64 );
extern HReg hregPPC_GPR28 ( Bool mode64 );
-extern HReg hregPPC_GPR29 ( Bool mode64 );
-extern HReg hregPPC_GPR30 ( Bool mode64 );
-extern HReg hregPPC_GPR31 ( Bool mode64 ); // GuestStatePtr
+extern HReg hregPPC_GPR29 ( Bool mode64 ); // reserved for dispatcher
+extern HReg hregPPC_GPR30 ( Bool mode64 ); // we use as VMX spill temporary
+extern HReg hregPPC_GPR31 ( Bool mode64 ); // GuestStatePtr (callee-saved)
extern HReg hregPPC32_FPR0 ( void );
extern HReg hregPPC32_FPR1 ( void );
@@ -783,7 +783,8 @@
extern void        getRegUsage_PPC32Instr ( HRegUsage*, PPC32Instr*, Bool mode64 );
extern void        mapRegs_PPC32Instr     ( HRegRemap*, PPC32Instr*, Bool mode64 );
extern Bool        isMove_PPC32Instr      ( PPC32Instr*, HReg*, HReg* );
-extern Int         emit_PPC32Instr        ( UChar* buf, Int nbuf, PPC32Instr*, Bool mode64 );
+extern Int         emit_PPC32Instr        ( UChar* buf, Int nbuf, PPC32Instr*,
+                                            Bool mode64, void* dispatch );
extern PPC32Instr* genSpill_PPC32         ( HReg rreg, UShort offsetB, Bool mode64 );
extern PPC32Instr* genReload_PPC32        ( HReg rreg, UShort offsetB, Bool mode64 );
extern void        getAllocableRegs_PPC32 ( Int*, HReg**, Bool mode64 );
|
|
From: <sv...@va...> - 2005-12-15 21:18:41
|
Author: njn
Date: 2005-12-15 21:18:34 +0000 (Thu, 15 Dec 2005)
New Revision: 5351
Log:
Minor secondary V bit table optimisations:
- Make each node cover 4 bytes instead of one.  This costs no extra space
  because of alignment, and reduces the number of nodes by 4x in the worst
  case.  On real programs it seems to make only a small difference, though
  (< 3%).
- STOREVn_slow() now has code to detect if the byte being overwritten has a
  sec V bit entry that can be removed.  This can reduce the number of stale
  nodes in the sec V bit table, although in practice not by very much,
  because most of the staleness is caused by set_address_range_perms() and
  make_aligned*().  Currently it's commented out because it slowed things
  down a little bit for little gain.
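As a quick illustration of the addressing arithmetic this node layout
implies (a sketch under the assumption that nodes are keyed by the
rounded-down address, as in the patch below; 'Addr' stands in for
Valgrind's address type and 'locate' is an invented helper):

   #include <assert.h>

   typedef unsigned long Addr;
   #define BYTES_PER_SEC_VBIT_NODE 4

   typedef struct {
      Addr          a;                               /* 4-aligned key */
      unsigned char vbits8[BYTES_PER_SEC_VBIT_NODE]; /* one V byte each */
   } SecVBitNode;

   /* The V byte for address 'a' lives in the node keyed by the aligned
      address, at offset a % 4 -- so 4 adjacent bytes share one node. */
   static void locate(Addr a, Addr* key, int* slot)
   {
      *key  = a - (a % BYTES_PER_SEC_VBIT_NODE);
      *slot = (int)(a % BYTES_PER_SEC_VBIT_NODE);
      assert(*slot < BYTES_PER_SEC_VBIT_NODE);
   }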
Modified:
branches/COMPVBITS/memcheck/mc_main.c
Modified: branches/COMPVBITS/memcheck/mc_main.c
===================================================================
--- branches/COMPVBITS/memcheck/mc_main.c	2005-12-15 19:53:50 UTC (rev 5350)
+++ branches/COMPVBITS/memcheck/mc_main.c	2005-12-15 21:18:34 UTC (rev 5351)
@@ -398,52 +398,106 @@
/* --------------- Secondary V bit table ------------ */

-// XXX: this table can hold out-of-date stuff.  Eg. write a partially
-// defined byte, then overwrite it with a fully defined byte.  The info for
-// the partially defined bytes will still be here.  But it shouldn't ever
-// get accessed, I think...
+// Note: the nodes in this table can become stale.  Eg. if you write a
+// partially defined byte (PDB), then overwrite the same address with a
+// fully defined byte, the sec-V-bit node will not necessarily be removed.
+// This is because checking for whether removal is necessary would slow down
+// the fast paths.  Hopefully this is not a problem.  If it becomes a
+// problem, we may have to consider doing a clean-up pass every so often.

-// XXX: profile, esp. with Julian's random-ORing stress test.  Could maybe
-// store in chunks up to a page size.
+static OSet* secVBitTable;

-OSet* secVBitTable;
+static ULong sec_vbits_bytes_allocd = 0;
+static ULong sec_vbits_bytes_freed  = 0;
+static ULong sec_vbits_bytes_curr   = 0;
+static ULong sec_vbits_bytes_peak   = 0;

+// 4 is the best value here.  We can go from 1 to 4 for free -- it doesn't
+// change the size of the SecVBitNode because of padding.  If we make it
+// larger, we have bigger nodes, but can possibly fit more partially defined
+// bytes in each node.  In practice it seems that partially defined bytes
+// are not clustered close to each other, so going bigger than 4 does not
+// save space.
+#define BYTES_PER_SEC_VBIT_NODE 4
+
typedef
   struct {
      Addr  a;
-     UWord vbits8;
+     UChar vbits8[BYTES_PER_SEC_VBIT_NODE];
   }
   SecVBitNode;
static UWord get_sec_vbits8(Addr a)
{
-   SecVBitNode* n;
-   n = VG_(OSet_Lookup)(secVBitTable, &a);
+   Addr         aAligned = VG_ROUNDDN(a, BYTES_PER_SEC_VBIT_NODE);
+   Int          amod     = a % BYTES_PER_SEC_VBIT_NODE;
+   SecVBitNode* n        = VG_(OSet_Lookup)(secVBitTable, &aAligned);
+   UChar        vbits8;
   tl_assert(n);
   // Shouldn't be fully defined or fully undefined -- those cases shouldn't
   // make it to the secondary V bits table.
-   tl_assert(V_BITS8_VALID != n->vbits8 && V_BITS8_INVALID != n->vbits8);
-   return n->vbits8;
+   vbits8 = n->vbits8[amod];
+   tl_assert(V_BITS8_VALID != vbits8 && V_BITS8_INVALID != vbits8);
+   return vbits8;
}
static void set_sec_vbits8(Addr a, UWord vbits8)
{
-   SecVBitNode* n;
-   n = VG_(OSet_Lookup)(secVBitTable, &a);
+   Addr         aAligned = VG_ROUNDDN(a, BYTES_PER_SEC_VBIT_NODE);
+   Int          i, amod  = a % BYTES_PER_SEC_VBIT_NODE;
+   SecVBitNode* n        = VG_(OSet_Lookup)(secVBitTable, &aAligned);
   // Shouldn't be fully defined or fully undefined -- those cases shouldn't
   // make it to the secondary V bits table.
-   tl_assert(V_BITS8_VALID != vbits8 && V_BITS8_INVALID != vbits8 );
+   tl_assert(V_BITS8_VALID != vbits8 && V_BITS8_INVALID != vbits8);
   if (n) {
-      n->vbits8 = vbits8;        // update
+      n->vbits8[amod] = vbits8;  // update
   } else {
+      // New node: assign the specific byte, make the rest invalid (they
+      // should never be read as-is, but be cautious).
+      sec_vbits_bytes_allocd += sizeof(SecVBitNode);
+      sec_vbits_bytes_curr   += sizeof(SecVBitNode);
+      if (sec_vbits_bytes_curr > sec_vbits_bytes_peak)
+         sec_vbits_bytes_peak = sec_vbits_bytes_curr;
      n = VG_(OSet_AllocNode)(secVBitTable, sizeof(SecVBitNode));
-      n->a      = a;
-      n->vbits8 = vbits8;
+      n->a = aAligned;
+      for (i = 0; i < BYTES_PER_SEC_VBIT_NODE; i++) {
+         n->vbits8[i] = V_BITS8_INVALID;
+      }
+      n->vbits8[amod] = vbits8;
      VG_(OSet_Insert)(secVBitTable, n);
   }
}
+// Remove the node if its V bytes (other than the one for 'a') are all fully
+// defined or fully undefined.  We ignore the V byte for 'a' because it's
+// about to be overwritten with a fully defined or fully undefined value.
+__attribute__((unused))
+static void maybe_remove_sec_vbits8(Addr a)
+{
+   Addr         aAligned = VG_ROUNDDN(a, BYTES_PER_SEC_VBIT_NODE);
+   Int          i, amod  = a % BYTES_PER_SEC_VBIT_NODE;
+   SecVBitNode* n        = VG_(OSet_Lookup)(secVBitTable, &aAligned);
+   tl_assert(n);
+   for (i = 0; i < BYTES_PER_SEC_VBIT_NODE; i++) {
+      UChar vbits8 = n->vbits8[i];
+
+      // Ignore the V byte for 'a'.
+      if (i == amod)
+         continue;
+
+      // One of the other V bytes is still partially defined -- don't remove
+      // this entry from the table.
+      if (V_BITS8_VALID != vbits8 && V_BITS8_INVALID != vbits8)
+         return;
+   }
+   n = VG_(OSet_Remove)(secVBitTable, &aAligned);
+   VG_(OSet_FreeNode)(secVBitTable, n);
+   sec_vbits_bytes_freed += sizeof(SecVBitNode);
+   sec_vbits_bytes_curr  -= sizeof(SecVBitNode);
+   tl_assert(n);
+}
+
+
/* --------------- Endianness helpers --------------- */
=20
/* Returns the offset in memory of the byteno-th most significant byte
@@ -457,7 +511,7 @@
/* --------------- Fundamental functions --------------- */
=20
static inline
-void insert_vabit8_into_vabits32 ( Addr a, UChar vabits8, UChar* vabits32 )
+void insert_vabits8_into_vabits32 ( Addr a, UChar vabits8, UChar* vabits32 )
{
   UInt shift = (a & 3) << 1;        // shift by 0, 2, 4, or 6
   *vabits32 &= ~(0x3 << shift);     // mask out the two old bits
@@ -489,10 +543,7 @@
{
   SecMap* sm     = get_secmap_writable(a);
   UWord   sm_off = SM_OFF(a);
-// VG_(printf)("se:%p, %d\n", a, sm_off);
-// VG_(printf)("s1:%p (0x%x)\n", &(sm->vabits32[sm_off]), vabits8);
- insert_vabit8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) );
-// VG_(printf)("s2: 0x%x\n", sm->vabits32[sm_off]);
+ insert_vabits8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) );
}
static inline
@@ -518,16 +569,16 @@
static
ULong mc_LOADVn_slow ( Addr a, SizeT szB, Bool bigendian )
{
- /* Make up a result V word, which contains the loaded data for
+ /* Make up a 64-bit result V word, which contains the loaded data for
valid addresses and Defined for invalid addresses. Iterate over
the bytes in the word, from the most significant down to the
least. */
-   ULong vw = V_BITS64_INVALID;
+   ULong vbits64 = V_BITS64_INVALID;
   SizeT i = szB-1;
   SizeT n_addrs_bad = 0;
   Addr ai;
   Bool partial_load_exemption_applies;
-   UWord vbyte, vabits8;
+   UWord vbits8, vabits8;

   PROF_EVENT(30, "mc_LOADVn_slow");
   tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
@@ -541,17 +592,17 @@
      // XXX: We check in order of most likely to least likely...
      // XXX: could maybe have a little lookup table instead of these
      // chained conditionals? and elsewhere?
-      if      ( VA_BITS8_READABLE == vabits8 ) { vbyte = V_BITS8_VALID;   }
-      else if ( VA_BITS8_WRITABLE == vabits8 ) { vbyte = V_BITS8_INVALID; }
+      if      ( VA_BITS8_READABLE == vabits8 ) { vbits8 = V_BITS8_VALID;   }
+      else if ( VA_BITS8_WRITABLE == vabits8 ) { vbits8 = V_BITS8_INVALID; }
      else if ( VA_BITS8_NOACCESS == vabits8 ) {
-         vbyte = V_BITS8_VALID;    // Make V bits defined!
+         vbits8 = V_BITS8_VALID;   // Make V bits defined!
         n_addrs_bad++;
      } else {
         tl_assert( VA_BITS8_OTHER == vabits8 );
-         vbyte = get_sec_vbits8(ai);
+         vbits8 = get_sec_vbits8(ai);
      }
-      vw <<= 8;
-      vw |= vbyte;
+      vbits64 <<= 8;
+      vbits64 |= vbits8;
      if (i == 0) break;
i--;
}
@@ -577,7 +628,7 @@
if (n_addrs_bad > 0 && !partial_load_exemption_applies)
mc_record_address_error( VG_(get_running_tid)(), a, szB, False );
- return vw;
+ return vbits64;
}
@@ -585,7 +636,7 @@
void mc_STOREVn_slow ( Addr a, SizeT szB, ULong vbytes, Bool bigendian )
{
   SizeT i, n_addrs_bad = 0;
-   UWord vbyte, vabits8;
+   UWord vbits8, vabits8;
   Addr  ai;

   PROF_EVENT(35, "mc_STOREVn_slow");
@@ -597,17 +648,33 @@
   for (i = 0; i < szB; i++) {
      PROF_EVENT(36, "mc_STOREVn_slow(loop)");
      ai = a+byte_offset_w(szB,bigendian,i);
-      vbyte = vbytes & 0xff;
+      vbits8 = vbytes & 0xff;
      vabits8 = get_vabits8(ai);
      if ( VA_BITS8_NOACCESS != vabits8 ) {
         // Addressable.  Convert in-register format to in-memory format.
-         if      ( V_BITS8_VALID   == vbyte ) { vabits8 = VA_BITS8_READABLE; }
-         else if ( V_BITS8_INVALID == vbyte ) { vabits8 = VA_BITS8_WRITABLE; }
-         else {
+         // Also remove any existing sec V bit entry for the byte if no
+         // longer necessary.
+         //
+         // XXX: the calls to maybe_remove_sec_vbits8() are commented out
+         // because they slow things down a bit (eg. 10% for perf/bz2)
+         // and the space saving is quite small (eg. 1--2% reduction in the
+         // size of the sec-V-bit-table?)
+         if ( V_BITS8_VALID == vbits8 ) {
+//            if (VA_BITS8_OTHER == vabits8)
+//               maybe_remove_sec_vbits8(ai);
+            vabits8 = VA_BITS8_READABLE;
+
+         } else if ( V_BITS8_INVALID == vbits8 ) {
+//            if (VA_BITS8_OTHER == vabits8)
+//               maybe_remove_sec_vbits8(ai);
+            vabits8 = VA_BITS8_WRITABLE;
+
+         } else {
            vabits8 = VA_BITS8_OTHER;
-           set_sec_vbits8(ai, vbyte);
+           set_sec_vbits8(ai, vbits8);
         }
         set_vabits8(ai, vabits8);
+
      } else {
         // Unaddressable!  Do nothing -- when writing to unaddressable
         // memory it acts as a black hole, and the V bits can never be seen
@@ -827,7 +894,7 @@
if (lenA < 1) break;
PROF_EVENT(156, "set_address_range_perms-loop1a");
      sm_off = SM_OFF(a);
-      insert_vabit8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) );
+      insert_vabits8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) );
      a    += 1;
      lenA -= 1;
}
@@ -845,7 +912,7 @@
if (lenA < 1) break;
PROF_EVENT(158, "set_address_range_perms-loop1b");
      sm_off = SM_OFF(a);
-      insert_vabit8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) );
+      insert_vabits8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) );
      a    += 1;
      lenA -= 1;
}
@@ -916,7 +983,7 @@
if (lenB < 1) return;
PROF_EVENT(164, "set_address_range_perms-loop1c");
      sm_off = SM_OFF(a);
-      insert_vabit8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) );
+      insert_vabits8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) );
      a    += 1;
      lenB -= 1;
}
@@ -3078,11 +3145,11 @@
// Convert full V-bits in register to compact 2-bit form.
// XXX: is it best to check for VALID before INVALID?
            if (V_BITS8_VALID == vbyte) {
- insert_vabit8_into_vabits32( a, VA_BITS8_READABLE,
- &(sm->vabits32[sm_off]) );
+ insert_vabits8_into_vabits32( a, VA_BITS8_READABLE,
+ &(sm->vabits32[sm_off]) );
            } else if (V_BITS8_INVALID == vbyte) {
- insert_vabit8_into_vabits32( a, VA_BITS8_WRITABLE,
- &(sm->vabits32[sm_off]) );
+ insert_vabits8_into_vabits32( a, VA_BITS8_WRITABLE,
+ &(sm->vabits32[sm_off]) );
} else {
/* Slow but general case -- writing partially defined bytes. */
PROF_EVENT(272, "helperc_STOREV1-slow2");
|
|
From: Nicholas N. <nj...@cs...> - 2005-12-15 20:20:30
|
On Thu, 15 Dec 2005, Thomas Lavergne wrote:

> I do not know what you call a "simple" raytracer, but I would say that
> "rayshade" is quite simple.  Written in C (not a very recent C style),
> the core tracer routines have around 6000 lines of instructions.  It has
> a history of configuration on Linux / AIX (does that prove the point on
> big- and little-endian?) and I read somewhere about a 64-bit build.  It
> was part (is it still part?) of some Linux distros.  Like all ray-tracers,
> it involves quite a few double precision floating point operations.
> Moreover, you can design your test-case "as heavy as you wish" by using
> more complicated scenes to be rendered.
>
> The coding style tries to emulate object-oriented encapsulation with
> structures and function pointers.
>
> I already ran valgrind on rayshade and it is clear that the memory
> management is very poor: what is allocated is rarely freed.  I do not
> know if you want error-free code for your test cases.
>
> Rayshade development stopped around 1994 and some hacks will certainly
> be needed to have it build smoothly on all platforms.  In particular,
> there is a Lex/Yacc grammar file that needs some updates in order to
> build under Linux.  There is one popen() command, but it is not
> mandatory.
>
> Let me know if I can be of any help.  I have a 32-bit Linux PC and have
> built rayshade from source (with modifications).

The good things:
- The size is good.
- The quasi-OO style is good, since much Valgrind use is on C++ programs.
- The popen() is not a problem.
- The poor memory management doesn't sound like a problem.
- The use of double precision is good.

The bad things:
- The dependence on Lex/Yacc is bad.  Perhaps the generated .c file(s)
  could be used; hopefully they're not too big.
- Portability is a concern; if it's not portable, that would be a problem.

Nick
|
From: Nicholas N. <nj...@cs...> - 2005-12-15 19:55:40
|
Hi,

Julian's commits r5345 and r5346 (avoiding the profiling in the dispatcher,
and using jumps instead of call/return) have the following effect on my
3.0 GHz P4 Prescott.

Before and after, trunk:

-- bigcode1 --
bigcode1 trunk1    : 0.2s  nl: 6.5s (32.7x, -----)  mc:12.1s ( 60.7x, -----)
bigcode1 trunk5    : 0.2s  nl: 5.5s (27.7x, 15.3%)  mc: 9.4s ( 47.0x, 22.6%)
-- bigcode2 --
bigcode2 trunk1    : 0.2s  nl:13.1s (65.4x, -----)  mc:23.6s (117.8x, -----)
bigcode2 trunk5    : 0.2s  nl:11.4s (57.0x, 12.9%)  mc:20.6s (103.0x, 12.6%)
-- bz2 --
bz2      trunk1    : 1.3s  nl: 9.4s ( 7.3x, -----)  mc:25.9s ( 20.1x, -----)
bz2      trunk5    : 1.3s  nl: 7.2s ( 5.5x, 23.6%)  mc:22.5s ( 17.4x, 13.4%)
-- fbench --
fbench   trunk1    : 1.1s  nl: 5.0s ( 4.5x, -----)  mc:12.8s ( 11.4x, -----)
fbench   trunk5    : 1.1s  nl: 4.2s ( 3.8x, 15.5%)  mc:12.2s ( 10.9x,  4.5%)
-- ffbench --
ffbench  trunk1    : 0.8s  nl: 3.8s ( 4.5x, -----)  mc:11.2s ( 13.1x, -----)
ffbench  trunk5    : 0.8s  nl: 4.1s ( 4.8x, -6.8%)  mc:10.9s ( 12.8x,  2.2%)
-- gcc --
gcc      trunk1    : 0.3s  nl:12.3s (38.4x, -----)  mc:31.1s ( 97.3x, -----)
gcc      trunk5    : 0.3s  nl:10.8s (33.7x, 12.3%)  mc:30.0s ( 93.8x,  3.6%)
-- sarp --
sarp     trunk1    : 0.1s  nl: 0.9s (12.4x, -----)  mc:11.1s (158.4x, -----)
sarp     trunk5    : 0.1s  nl: 0.5s ( 6.4x, 48.3%)  mc:10.8s (154.9x,  2.3%)
-- Finished tests in perf ----------------------------------------------

Before and after, COMPVBITS:

-- bigcode1 --
bigcode1 compvbits : 0.2s  nl: 7.0s (34.8x, -----)  mc:10.6s ( 53.1x, -----)
bigcode1 compvbits3: 0.2s  nl: 5.6s (27.8x, 20.1%)  mc: 8.9s ( 44.3x, 16.6%)
-- bigcode2 --
bigcode2 compvbits : 0.2s  nl:12.7s (63.2x, -----)  mc:21.8s (109.0x, -----)
bigcode2 compvbits3: 0.2s  nl:11.5s (57.5x,  9.0%)  mc:19.6s ( 98.0x, 10.0%)
-- bz2 --
bz2      compvbits : 1.3s  nl: 9.4s ( 7.2x, -----)  mc:27.1s ( 20.9x, -----)
bz2      compvbits3: 1.3s  nl: 7.3s ( 5.6x, 22.5%)  mc:22.3s ( 17.2x, 17.8%)
-- fbench --
fbench   compvbits : 1.1s  nl: 5.0s ( 4.4x, -----)  mc:11.6s ( 10.3x, -----)
fbench   compvbits3: 1.1s  nl: 4.2s ( 3.7x, 15.6%)  mc:11.2s (  9.9x,  3.4%)
-- ffbench --
ffbench  compvbits : 0.8s  nl: 3.8s ( 4.5x, -----)  mc: 9.1s ( 10.7x, -----)
ffbench  compvbits3: 0.8s  nl: 4.2s ( 5.0x,-10.4%)  mc: 8.8s ( 10.3x,  3.3%)
-- gcc --
gcc      compvbits : 0.3s  nl:12.1s (39.2x, -----)  mc:29.1s ( 94.0x, -----)
gcc      compvbits3: 0.3s  nl:10.8s (34.8x, 11.1%)  mc:28.3s ( 91.3x,  2.8%)
-- sarp --
sarp     compvbits : 0.1s  nl: 0.8s (12.0x, -----)  mc: 4.4s ( 62.4x, -----)
sarp     compvbits3: 0.1s  nl: 0.4s ( 6.3x, 47.6%)  mc: 4.1s ( 59.1x,  5.3%)
-- Finished tests in perf ----------------------------------------------

Before and after, trunk and COMPVBITS (the percentages here are all
relative to trunk1, which is the "before" version of the trunk).  This
lets you compare the trunk against COMPVBITS:

-- bigcode1 --
bigcode1 trunk1    : 0.2s  nl: 6.5s (32.6x, -----)  mc:12.2s ( 60.9x, -----)
bigcode1 trunk5    : 0.2s  nl: 5.5s (27.7x, 15.0%)  mc: 9.4s ( 46.8x, 23.2%)
bigcode1 compvbits : 0.2s  nl: 7.0s (35.0x, -7.4%)  mc:10.5s ( 52.6x, 13.6%)
bigcode1 compvbits3: 0.2s  nl: 5.5s (27.7x, 15.0%)  mc: 8.8s ( 44.0x, 27.8%)
-- bigcode2 --
bigcode2 trunk1    : 0.2s  nl:13.1s (65.5x, -----)  mc:23.6s (118.1x, -----)
bigcode2 trunk5    : 0.2s  nl:11.4s (57.0x, 13.0%)  mc:20.6s (103.1x, 12.7%)
bigcode2 compvbits : 0.2s  nl:12.7s (63.6x,  2.9%)  mc:21.9s (109.3x,  7.4%)
bigcode2 compvbits3: 0.2s  nl:11.5s (57.4x, 12.4%)  mc:20.0s ( 99.9x, 15.4%)
-- bz2 --
bz2      trunk1    : 1.3s  nl: 9.3s ( 7.3x, -----)  mc:25.9s ( 20.4x, -----)
bz2      trunk5    : 1.3s  nl: 7.2s ( 5.7x, 22.6%)  mc:22.6s ( 17.8x, 12.8%)
bz2      compvbits : 1.3s  nl: 9.4s ( 7.4x, -0.8%)  mc:27.0s ( 21.2x, -4.2%)
bz2      compvbits3: 1.3s  nl: 7.3s ( 5.7x, 21.7%)  mc:22.3s ( 17.6x, 13.9%)
-- fbench --
fbench   trunk1    : 1.1s  nl: 5.0s ( 4.5x, -----)  mc:12.7s ( 11.3x, -----)
fbench   trunk5    : 1.1s  nl: 4.2s ( 3.8x, 16.0%)  mc:12.1s ( 10.7x,  5.0%)
fbench   compvbits : 1.1s  nl: 5.0s ( 4.4x,  1.2%)  mc:11.6s ( 10.2x,  9.1%)
fbench   compvbits3: 1.1s  nl: 4.2s ( 3.7x, 16.2%)  mc:11.3s ( 10.0x, 11.5%)
-- ffbench --
ffbench  trunk1    : 0.9s  nl: 4.2s ( 4.5x, -----)  mc:11.1s ( 11.8x, -----)
ffbench  trunk5    : 0.9s  nl: 4.0s ( 4.3x,  3.3%)  mc:10.9s ( 11.6x,  1.7%)
ffbench  compvbits : 0.9s  nl: 4.2s ( 4.4x,  0.5%)  mc: 9.0s (  9.6x, 19.2%)
ffbench  compvbits3: 0.9s  nl: 4.0s ( 4.2x,  5.5%)  mc: 8.7s (  9.3x, 21.6%)
-- gcc --
gcc      trunk1    : 0.3s  nl:12.4s (39.9x, -----)  mc:31.1s (100.3x, -----)
gcc      trunk5    : 0.3s  nl:10.8s (34.9x, 12.4%)  mc:30.0s ( 96.8x,  3.5%)
gcc      compvbits : 0.3s  nl:12.2s (39.2x,  1.6%)  mc:29.3s ( 94.5x,  5.8%)
gcc      compvbits3: 0.3s  nl:10.8s (34.9x, 12.5%)  mc:28.3s ( 91.2x,  9.1%)
-- sarp --
sarp     trunk1    : 0.1s  nl: 0.9s (12.3x, -----)  mc:11.1s (158.7x, -----)
sarp     trunk5    : 0.1s  nl: 0.4s ( 6.3x, 48.8%)  mc:10.9s (155.4x,  2.1%)
sarp     compvbits : 0.1s  nl: 0.8s (12.0x,  2.3%)  mc: 4.4s ( 62.3x, 60.8%)
sarp     compvbits3: 0.1s  nl: 0.4s ( 6.3x, 48.8%)  mc: 4.1s ( 58.9x, 62.9%)
-- Finished tests in perf ----------------------------------------------

The 'gcc' test is not in the repository; it's GCC compiling (but not
assembling or linking) a 2234 line pre-processed C program at -O3.

So overall it gives up to 20% improvements on Memcheck.  ffbench under
Nulgrind is a little weird, no idea why it slows down, but it doesn't
seem important.  And COMPVBITS is generally faster than the trunk, which
is good.

Nice work, Julian.  Profiling is useful.

Nick
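(For reference, the percentage columns above are speedups relative to the
first Valgrind timed on each benchmark; a sketch of the arithmetic, assuming
the formula from r5348 below -- small differences from the quoted figures
come from rounding of the displayed times:)

   #include <stdio.h>

   /* speedup% = 100 - 100 * t_new / t_old; e.g. Memcheck on bz2,
      25.9s -> 22.5s gives about 13.1% here vs the mail's 13.4%,
      which was computed from unrounded timings. */
   int main(void)
   {
      double t_old = 25.9, t_new = 22.5;
      printf("%5.1f%%\n", 100.0 - 100.0 * t_new / t_old);
      return 0;
   }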
|
From: <sv...@va...> - 2005-12-15 19:54:01
|
Author: njn
Date: 2005-12-15 19:53:50 +0000 (Thu, 15 Dec 2005)
New Revision: 5350
Log:
Merge in r5345 and r5346 from trunk -- dispatcher improvements that speed
things up a lot.
Modified:
branches/COMPVBITS/coregrind/m_dispatch/dispatch-amd64-linux.S
branches/COMPVBITS/coregrind/m_dispatch/dispatch-x86-linux.S
branches/COMPVBITS/coregrind/m_scheduler/scheduler.c
branches/COMPVBITS/coregrind/m_translate.c
branches/COMPVBITS/coregrind/pub_core_dispatch.h
Modified: branches/COMPVBITS/coregrind/m_dispatch/dispatch-amd64-linux.S
===================================================================
--- branches/COMPVBITS/coregrind/m_dispatch/dispatch-amd64-linux.S	2005-12-15 19:41:14 UTC (rev 5349)
+++ branches/COMPVBITS/coregrind/m_dispatch/dispatch-amd64-linux.S	2005-12-15 19:53:50 UTC (rev 5350)
@@ -1,8 +1,8 @@
=20
-##--------------------------------------------------------------------##
-##--- The core dispatch loop, for jumping to a code address. ---##
-##--- dispatch-amd64.S ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- The core dispatch loop, for jumping to a code address. ---*/
+/*--- dispatch-amd64.S ---*/
+/*--------------------------------------------------------------------*/
=20
/*
This file is part of Valgrind, a dynamic binary instrumentation
@@ -39,11 +39,19 @@
/*--- The dispatch loop. ---*/
/*------------------------------------------------------------*/
=20
-/* signature: UWord VG_(run_innerloop) ( void* guest_state ) */
+/*----------------------------------------------------*/
+/*--- Preamble (set everything up) ---*/
+/*----------------------------------------------------*/
=20
+/* signature:
+UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+*/
+
+.text
.globl VG_(run_innerloop)
VG_(run_innerloop):
/* %rdi holds guest_state */
+ /* %rsi holds do_profiling */
/* ----- entry point to VG_(run_innerloop) ----- */
pushq %rbx
@@ -59,12 +67,13 @@
pushq %r13
pushq %r14
pushq %r15
- pushq %rdi
+ pushq %rdi /* guest_state */
=20
- movq VG_(dispatch_ctr)@GOTPCREL(%rip), %rsi
- pushq (%rsi)
+ movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15
+ movl (%r15), %r15d
+ pushq %r15
=20
- /* 8(%rsp) holds cached copy of guest_state */
+ /* 8(%rsp) holds cached copy of guest_state ptr */
/* 0(%rsp) holds cached copy of VG_(dispatch_ctr) */
=20
/* Set up the guest state pointer */
@@ -90,12 +99,26 @@
/* set dir flag to known value */
cld
=20
- /* fall into main loop */
+ /* fall into main loop (the right one) */
+ cmpq $0, %rsi
+ je VG_(run_innerloop__dispatch_unprofiled)
+ jmp VG_(run_innerloop__dispatch_profiled)
+	/*NOTREACHED*/
- /* Here, %rax is the only live (real) register. The entire
- simulated state is saved in the ThreadState. */
+/*----------------------------------------------------*/
+/*--- NO-PROFILING (standard) dispatcher ---*/
+/*----------------------------------------------------*/
=20
-dispatch_boring:
+.align 16
+.global VG_(run_innerloop__dispatch_unprofiled)
+VG_(run_innerloop__dispatch_unprofiled):
+ /* AT ENTRY: %rax is next guest addr, %rbp is possibly
+ modified guest state ptr */
+
+ /* Has the guest state pointer been messed with? If yes, exit. */
+ cmpq 8(%rsp), %rbp
+ jnz gsp_changed
+
/* save the jump address in the guest state */
movq %rax, OFFSET_amd64_RIP(%rbp)
=20
@@ -104,40 +127,99 @@
jz counter_is_zero
=20
/* try a fast lookup in the translation cache */
- movq %rax, %rbx
- andq $VG_TT_FAST_MASK, %rbx
- movq VG_(tt_fast)@GOTPCREL(%rip), %rcx
- movq (%rcx,%rbx,8), %rcx
- cmpq %rax, (%rcx)
- jnz fast_lookup_failed
- /* increment bb profile counter */
- movq VG_(tt_fastN)@GOTPCREL(%rip), %rdx
- movq (%rdx,%rbx,8), %rdx
- incl (%rdx)
+ movq VG_(tt_fast)@GOTPCREL(%rip), %rcx
+ movq %rax, %rbx
+ andq $VG_TT_FAST_MASK, %rbx
+ movq (%rcx,%rbx,8), %rcx
+ cmpq %rax, (%rcx)
+ jnz fast_lookup_failed
=20
/* Found a match. Call tce[1], which is 8 bytes along, since
each tce element is a 64-bit int. */
addq $8, %rcx
- call *%rcx
+ jmp *%rcx
+ ud2 /* persuade insn decoders not to speculate past here */
+ /* generated code should run, then jump back to
+ VG_(run_innerloop__dispatch_unprofiled). */
+ /*NOTREACHED*/
=20
-	/*
-	   %rax holds destination (original) address.
-	   %rbp indicates further details of the control transfer
-	   requested to the address in %rax.
-
-	   If rbp is unchanged (== * 8(%rsp)), just jump next to %rax.
+/*----------------------------------------------------*/
+/*--- PROFILING dispatcher (can be much slower) ---*/
+/*----------------------------------------------------*/
=20
- Otherwise fall out, back to the scheduler, and let it
- figure out what to do next.
- */
+.align 16
+.global VG_(run_innerloop__dispatch_profiled)
+VG_(run_innerloop__dispatch_profiled):
+ /* AT ENTRY: %rax is next guest addr, %rbp is possibly
+ modified guest state ptr */
=20
+ /* Has the guest state pointer been messed with? If yes, exit. */
cmpq 8(%rsp), %rbp
- jz dispatch_boring
+ jnz gsp_changed
=20
- jmp dispatch_exceptional
+ /* save the jump address in the guest state */
+ movq %rax, OFFSET_amd64_RIP(%rbp)
=20
+ /* Are we out of timeslice? If yes, defer to scheduler. */
+ subl $1, 0(%rsp)
+ jz counter_is_zero
=20
+ /* try a fast lookup in the translation cache */
+ movq VG_(tt_fast)@GOTPCREL(%rip), %rcx
+ movq %rax, %rbx
+ andq $VG_TT_FAST_MASK, %rbx
+ movq (%rcx,%rbx,8), %rcx
+ cmpq %rax, (%rcx)
+ jnz fast_lookup_failed
=20
+ /* increment bb profile counter */
+ movq VG_(tt_fastN)@GOTPCREL(%rip), %rdx
+ movq (%rdx,%rbx,8), %rdx
+ addl $1, (%rdx)
+
+ /* Found a match. Call tce[1], which is 8 bytes along, since
+ each tce element is a 64-bit int. */
+ addq $8, %rcx
+ jmp *%rcx
+ ud2 /* persuade insn decoders not to speculate past here */
+ /* generated code should run, then jump back to
+ VG_(run_innerloop__dispatch_profiled). */
+ /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- exit points ---*/
+/*----------------------------------------------------*/
+
+gsp_changed:
+ /* Someone messed with the gsp. Have to
+ defer to scheduler to resolve this. dispatch ctr
+ is not yet decremented, so no need to increment. */
+ /* %RIP is NOT up to date here. First, need to write
+ %rax back to %RIP, but without trashing %rbp since
+ that holds the value we want to return to the scheduler.
+ Hence use %r15 transiently for the guest state pointer. */
+ movq 8(%rsp), %r15
+ movq %rax, OFFSET_amd64_RIP(%r15)
+ movq %rbp, %rax
+ jmp run_innerloop_exit
+ /*NOTREACHED*/
+
+counter_is_zero:
+ /* %RIP is up to date here */
+ /* back out decrement of the dispatch counter */
+ addl $1, 0(%rsp)
+ movq $VG_TRC_INNER_COUNTERZERO, %rax
+ jmp run_innerloop_exit
+
+fast_lookup_failed:
+ /* %RIP is up to date here */
+ /* back out decrement of the dispatch counter */
+ addl $1, 0(%rsp)
+ movq $VG_TRC_INNER_FASTMISS, %rax
+ jmp run_innerloop_exit
+
+
+
/* All exits from the dispatcher go through here. %rax holds
   the return value.
*/
@@ -150,14 +232,14 @@
pushq $0
fstcw (%rsp)
cmpl $0x027F, (%rsp)
- popq %r11 /* get rid of the word without trashing %eflags */
+ popq %r15 /* get rid of the word without trashing %eflags */
jnz invariant_violation
#endif
pushq $0
stmxcsr (%rsp)
andl $0xFFFFFFC0, (%rsp) /* mask out status flags */
cmpl $0x1F80, (%rsp)
- popq %r11
+ popq %r15
jnz invariant_violation
/* otherwise we're OK */
jmp run_innerloop_exit_REALLY
@@ -167,8 +249,12 @@
jmp run_innerloop_exit_REALLY
=20
run_innerloop_exit_REALLY:
- movq VG_(dispatch_ctr)@GOTPCREL(%rip), %rsi
- popq (%rsi)
+
+	/* restore VG_(dispatch_ctr) */
+ popq %r14
+ movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15
+ movl %r14d, (%r15)
+
popq %rdi
popq %r15
popq %r14
@@ -190,31 +276,13 @@
/* Other ways of getting out of the inner loop. Placed out-of-line to
   make it look cleaner.
*/
-dispatch_exceptional:
- /* this is jumped to only, not fallen-through from above */
=20
- /* save %rax in %RIP and defer to sched */
- movq 8(%rsp), %rdi
- movq %rax, OFFSET_amd64_RIP(%rdi)
- movq %rbp, %rax
- jmp run_innerloop_exit
=20
-fast_lookup_failed:
- /* %RIP is up to date here since dispatch_boring dominates */
- addl $1, 0(%rsp)
- movq $VG_TRC_INNER_FASTMISS, %rax
- jmp run_innerloop_exit
=20
-counter_is_zero:
- /* %RIP is up to date here since dispatch_boring dominates */
- addl $1, 0(%rsp)
- movq $VG_TRC_INNER_COUNTERZERO, %rax
- jmp run_innerloop_exit
=20
-
/* Let the linker know we don't need an executable stack */
.section .note.GNU-stack,"",@progbits
=20
-##--------------------------------------------------------------------##
-##--- end ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- end ---*/
+/*--------------------------------------------------------------------*/
Modified: branches/COMPVBITS/coregrind/m_dispatch/dispatch-x86-linux.S
===================================================================
--- branches/COMPVBITS/coregrind/m_dispatch/dispatch-x86-linux.S	2005-12-15 19:41:14 UTC (rev 5349)
+++ branches/COMPVBITS/coregrind/m_dispatch/dispatch-x86-linux.S	2005-12-15 19:53:50 UTC (rev 5350)
@@ -1,8 +1,8 @@
=20
-##--------------------------------------------------------------------##
-##--- The core dispatch loop, for jumping to a code address. ---##
-##--- dispatch-x86.S ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- The core dispatch loop, for jumping to a code address. ---*/
+/*--- dispatch-x86.S ---*/
+/*--------------------------------------------------------------------*/
=20
/*
This file is part of Valgrind, a dynamic binary instrumentation
@@ -39,11 +39,18 @@
/*--- The dispatch loop. ---*/
/*------------------------------------------------------------*/
=20
-/* signature: UWord VG_(run_innerloop) ( void* guest_state ) */
+/*----------------------------------------------------*/
+/*--- Preamble (set everything up) ---*/
+/*----------------------------------------------------*/
=20
+/* signature:
+UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+*/
+.text
.globl VG_(run_innerloop)
VG_(run_innerloop):
/* 4(%esp) holds guest_state */
+ /* 8(%esp) holds do_profiling */
/* ----- entry point to VG_(run_innerloop) ----- */
pushl %ebx
@@ -54,6 +61,7 @@
pushl %ebp
/* 28(%esp) holds guest_state */
+ /* 32(%esp) holds do_profiling */
=20
/* Set up the guest state pointer */
movl 28(%esp), %ebp
@@ -80,52 +88,128 @@
/* set dir flag to known value */
cld
=09
- /* fall into main loop */
+ /* fall into main loop (the right one) */
+ cmpl $0, 32(%esp) /* do_profiling */
+ je VG_(run_innerloop__dispatch_unprofiled)
+ jmp VG_(run_innerloop__dispatch_profiled)
+ /*NOTREACHED*/
=20
- /* Here, %eax is the only live (real) register. The entire
- simulated state is saved in the ThreadState. */
+/*----------------------------------------------------*/
+/*--- NO-PROFILING (standard) dispatcher ---*/
+/*----------------------------------------------------*/
=20
-dispatch_boring:
+.align 16
+.global VG_(run_innerloop__dispatch_unprofiled)
+VG_(run_innerloop__dispatch_unprofiled):
+ /* AT ENTRY: %eax is next guest addr, %ebp is possibly
+ modified guest state ptr */
+
+ /* Has the guest state pointer been messed with? If yes, exit. */
+ cmpl 28(%esp), %ebp
+ jnz gsp_changed
+
/* save the jump address in the guest state */
movl %eax, OFFSET_x86_EIP(%ebp)
=20
/* Are we out of timeslice? If yes, defer to scheduler. */
- subl $1, VG_(dispatch_ctr)
+ subl $1, VG_(dispatch_ctr)
jz counter_is_zero
=20
/* try a fast lookup in the translation cache */
- movl %eax, %ebx
- andl $VG_TT_FAST_MASK, %ebx
- movl VG_(tt_fast)(,%ebx,4), %ecx
- cmpl %eax, (%ecx)
- jnz fast_lookup_failed
- /* increment bb profile counter */
- movl VG_(tt_fastN)(,%ebx,4), %edx
- incl (%edx)
+ movl %eax, %ebx
+ andl $VG_TT_FAST_MASK, %ebx
+ movl VG_(tt_fast)(,%ebx,4), %ecx
+ cmpl %eax, (%ecx)
+ jnz fast_lookup_failed
=20
- /* Found a match. Call tce[1], which is 8 bytes along, since
- each tce element is a 64-bit int. */
+ /* Found a match. Jump to tce[1], which is 8 bytes along,
+ since each tce element is a 64-bit int. */
addl $8, %ecx
- call *%ecx
-
-	/*
-	   %eax holds destination (original) address.
-	   %ebp indicates further details of the control transfer
-	   requested to the address in %eax.
-
-	   If ebp is unchanged (== * 28(%esp)), just jump next to %eax.
+ jmp *%ecx
+ ud2 /* persuade insn decoders not to speculate past here */
+ /* generated code should run, then jump back to
+ VG_(run_innerloop__dispatch_unprofiled). */
+ /*NOTREACHED*/
=20
- Otherwise fall out, back to the scheduler, and let it
- figure out what to do next.
- */
+/*----------------------------------------------------*/
+/*--- PROFILING dispatcher (can be much slower) ---*/
+/*----------------------------------------------------*/
=20
+.align 16
+.global VG_(run_innerloop__dispatch_profiled)
+VG_(run_innerloop__dispatch_profiled):
+ /* AT ENTRY: %eax is next guest addr, %ebp is possibly
+ modified guest state ptr */
+
+ /* Has the guest state pointer been messed with? If yes, exit. */
cmpl 28(%esp), %ebp
- jz dispatch_boring
+ jnz gsp_changed
=20
- jmp dispatch_exceptional
+ /* save the jump address in the guest state */
+ movl %eax, OFFSET_x86_EIP(%ebp)
=20
-
+ /* Are we out of timeslice? If yes, defer to scheduler. */
+ subl $1, VG_(dispatch_ctr)
+ jz counter_is_zero
=20
+ /* try a fast lookup in the translation cache */
+ movl %eax, %ebx
+ andl $VG_TT_FAST_MASK, %ebx
+ movl VG_(tt_fast)(,%ebx,4), %ecx
+ cmpl %eax, (%ecx)
+ jnz fast_lookup_failed
+ /* increment bb profile counter */
+ /* note: innocuous as this sounds, it causes a huge amount more
+ stress on D1 and significantly slows everything down. */
+ movl VG_(tt_fastN)(,%ebx,4), %edx
+ /* Use "addl $1", not "incl", to avoid partial-flags stall on P4 */
+ addl $1, (%edx)
+
+ /* Found a match. Jump to tce[1], which is 8 bytes along,
+ since each tce element is a 64-bit int. */
+ addl $8, %ecx
+ jmp *%ecx
+ ud2 /* persuade insn decoders not to speculate past here */
+ /* generated code should run, then jump back to
+ VG_(run_innerloop__dispatch_profiled). */
+ /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- exit points ---*/
+/*----------------------------------------------------*/
+
+gsp_changed:
+ /* Someone messed with the gsp. Have to
+ defer to scheduler to resolve this. dispatch ctr
+ is not yet decremented, so no need to increment. */
+ /* %EIP is NOT up to date here. First, need to write
+ %eax back to %EIP, but without trashing %ebp since
+ that holds the value we want to return to the scheduler.
+ Hence use %esi transiently for the guest state pointer. */
+ movl 28(%esp), %esi
+ movl %eax, OFFSET_x86_EIP(%esi)
+ movl %ebp, %eax
+ jmp run_innerloop_exit
+ /*NOTREACHED*/
+
+counter_is_zero:
+ /* %EIP is up to date here */
+ /* back out decrement of the dispatch counter */
+ addl $1, VG_(dispatch_ctr)
+ movl $VG_TRC_INNER_COUNTERZERO, %eax
+ jmp run_innerloop_exit
+ /*NOTREACHED*/
+
+fast_lookup_failed:
+ /* %EIP is up to date here */
+ /* back out decrement of the dispatch counter */
+ addl $1, VG_(dispatch_ctr)
+ movl $VG_TRC_INNER_FASTMISS, %eax
+ jmp run_innerloop_exit
+ /*NOTREACHED*/
+
+
+
/* All exits from the dispatcher go through here. %eax holds
the return value.=20
*/
@@ -165,36 +249,10 @@
popl %ebx
	ret
=20
-
-
-/* Other ways of getting out of the inner loop. Placed out-of-line to
- make it look cleaner.=20
-*/
-dispatch_exceptional:
- /* this is jumped to only, not fallen-through from above */
-
- /* save %eax in %EIP and defer to sched */
- movl 28(%esp), %edi
- movl %eax, OFFSET_x86_EIP(%edi)
- movl %ebp, %eax
- jmp run_innerloop_exit
-
-fast_lookup_failed:
- /* %EIP is up to date here since dispatch_boring dominates */
- addl $1, VG_(dispatch_ctr)
- movl $VG_TRC_INNER_FASTMISS, %eax
- jmp run_innerloop_exit
-
-counter_is_zero:
- /* %EIP is up to date here since dispatch_boring dominates */
- addl $1, VG_(dispatch_ctr)
- movl $VG_TRC_INNER_COUNTERZERO, %eax
- jmp run_innerloop_exit
-
/* Let the linker know we don't need an executable stack */
.section .note.GNU-stack,"",@progbits
=20
-##--------------------------------------------------------------------##
-##--- end ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- end ---*/
+/*--------------------------------------------------------------------*/
Modified: branches/COMPVBITS/coregrind/m_scheduler/scheduler.c
===================================================================
--- branches/COMPVBITS/coregrind/m_scheduler/scheduler.c	2005-12-15 19:41:14 UTC (rev 5349)
+++ branches/COMPVBITS/coregrind/m_scheduler/scheduler.c	2005-12-15 19:53:50 UTC (rev 5350)
@@ -428,8 +428,12 @@
      vg_assert(VG_(my_fault));
      VG_(my_fault) = False;

-      SCHEDSETJMP(tid, jumped,
-                  trc = (UInt)VG_(run_innerloop)( (void*)&tst->arch.vex ));
+      SCHEDSETJMP(
+         tid,
+         jumped,
+         trc = (UInt)VG_(run_innerloop)( (void*)&tst->arch.vex,
+                                         VG_(clo_profile_flags) > 0 ? 1 : 0 )
+      );

      //nextEIP = tst->arch.m_eip;
      //if (nextEIP >= VG_(client_end))
Modified: branches/COMPVBITS/coregrind/m_translate.c
===================================================================
--- branches/COMPVBITS/coregrind/m_translate.c	2005-12-15 19:41:14 UTC (rev 5349)
+++ branches/COMPVBITS/coregrind/m_translate.c	2005-12-15 19:53:50 UTC (rev 5350)
@@ -32,23 +32,25 @@
#include "pub_core_basics.h"
#include "pub_core_aspacemgr.h"
=20
-#include "pub_core_machine.h" // For VG_(machine_get_VexArchInfo)
- // and VG_(get_SP)
+#include "pub_core_machine.h" // For VG_(machine_get_VexArchInfo)
+ // and VG_(get_SP)
#include "pub_core_libcbase.h"
#include "pub_core_libcassert.h"
#include "pub_core_libcprint.h"
#include "pub_core_options.h"
#include "pub_core_profile.h"
=20
-#include "pub_core_debuginfo.h" // Needed for pub_core_redir :(
-#include "pub_core_redir.h" // For VG_(code_redirect)()
+#include "pub_core_debuginfo.h" // Needed for pub_core_redir :(
+#include "pub_core_redir.h" // For VG_(code_redirect)()
=20
-#include "pub_core_signals.h" // For VG_(synth_fault_{perms,mappin=
g})()
-#include "pub_core_stacks.h" // For VG_(unknown_SP_update)()
-#include "pub_core_tooliface.h" // For VG_(tdict)
+#include "pub_core_signals.h" // For VG_(synth_fault_{perms,mapping})(=
)
+#include "pub_core_stacks.h" // For VG_(unknown_SP_update)()
+#include "pub_core_tooliface.h" // For VG_(tdict)
#include "pub_core_translate.h"
#include "pub_core_transtab.h"
+#include "pub_core_dispatch.h" // VG_(run_innerloop__dispatch_{un}profi=
led)
=20
+
/*------------------------------------------------------------*/
/*--- Stats ---*/
/*------------------------------------------------------------*/
@@ -569,6 +571,7 @@
VexArch vex_arch;
VexArchInfo vex_archinfo;
VexGuestExtents vge;
+ VexTranslateArgs vta;
VexTranslateResult tres;
=20
/* Make sure Vex is initialised right. */
@@ -690,25 +693,41 @@
/* Set up closure arg for "chase_into_ok" */
chase_into_ok__CLOSURE_tid =3D tid;
=20
-   tres = LibVEX_Translate (
-             vex_arch, &vex_archinfo,
-             vex_arch, &vex_archinfo,
-             (UChar*)ULong_to_Ptr(orig_addr),
-             (Addr64)orig_addr,
-             (Addr64)orig_addr_noredir,
-             chase_into_ok,
-             &vge,
-             tmpbuf, N_TMPBUF, &tmpbuf_used,
-             VG_(tdict).tool_instrument,
-             need_to_handle_SP_assignment()
-                ? vg_SP_update_pass
-                : NULL,
-             True, /* cleanup after instrumentation */
-             do_self_check,
-             NULL,
-             verbosity
-          );
+   vta.arch_guest       = vex_arch;
+   vta.archinfo_guest   = vex_archinfo;
+   vta.arch_host        = vex_arch;
+   vta.archinfo_host    = vex_archinfo;
+   vta.guest_bytes      = (UChar*)ULong_to_Ptr(orig_addr);
+   vta.guest_bytes_addr = (Addr64)orig_addr;
+   vta.guest_bytes_addr_noredir = (Addr64)orig_addr_noredir;
+   vta.chase_into_ok    = chase_into_ok;
+   vta.guest_extents    = &vge;
+   vta.host_bytes       = tmpbuf;
+   vta.host_bytes_size  = N_TMPBUF;
+   vta.host_bytes_used  = &tmpbuf_used;
+   vta.instrument1      = VG_(tdict).tool_instrument;
+   vta.instrument2      = need_to_handle_SP_assignment()
+                             ? vg_SP_update_pass
+                             : NULL;
+   vta.do_self_check    = do_self_check;
+   vta.traceflags       = verbosity;

+   /* Set up the dispatch-return info.  For archs without a link
+      register, vex generates a jump back to the specified dispatch
+      address.  Else, it just generates a branch-to-LR. */
+#  if defined(VGA_x86) || defined(VGA_amd64)
+   vta.dispatch = VG_(clo_profile_flags) > 0
+                     ? (void*) &VG_(run_innerloop__dispatch_profiled)
+                     : (void*) &VG_(run_innerloop__dispatch_unprofiled);
+#  elif defined(VGA_ppc32) || defined(VGA_ppc64)
+   vta.dispatch = NULL;
+#  else
+#    error "Unknown arch"
+#  endif
+
+   /* Sheesh.  Finally, actually _do_ the translation! */
+   tres = LibVEX_Translate ( &vta );

   vg_assert(tres == VexTransOK);
   vg_assert(tmpbuf_used <= N_TMPBUF);
   vg_assert(tmpbuf_used > 0);
Modified: branches/COMPVBITS/coregrind/pub_core_dispatch.h
===================================================================
--- branches/COMPVBITS/coregrind/pub_core_dispatch.h	2005-12-15 19:41:14 UTC (rev 5349)
+++ branches/COMPVBITS/coregrind/pub_core_dispatch.h	2005-12-15 19:53:50 UTC (rev 5350)
@@ -50,12 +50,24 @@
   signal, for example SIGSEGV, in which case control longjmp()s back past
   here.

+  If do_profiling is nonzero, the profile counters arrays should be
+  updated for each translation run.
+
   This code simply handles the common case fast -- when the translation
   address is found in the translation cache.  For anything else, the
   scheduler does the work.
*/
-extern UWord VG_(run_innerloop) ( void* guest_state );
+extern
+UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
=20
+#if defined(VGA_x86) || defined(VGA_amd64)
+/* We need to locate a couple of labels inside VG_(run_innerloop), so
+ that Vex can add branches to them from generated code. Hence the
+ following somewhat bogus decls. At least on x86 and amd64. */
+extern void VG_(run_innerloop__dispatch_unprofiled);
+extern void VG_(run_innerloop__dispatch_profiled);
+#endif
+
#endif // __PUB_CORE_DISPATCH_H
=20
/*--------------------------------------------------------------------*/
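A side note on the decls above, sketched with generic names (not Valgrind
code): declaring an assembly label as a void object is a GCC-accepted trick
that lets C take the label's address without implying it is callable:

   /* In some .S file:   .global entry_label
                         entry_label:  ...          */
   extern void entry_label;   /* "bogus" decl: a label, not a function */

   static void* label_address(void)
   {
      return &entry_label;    /* taking the address is all we ever do */
   }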
|
|
From: <sv...@va...> - 2005-12-15 19:52:41
|
Author: njn
Date: 2005-12-15 19:52:37 +0000 (Thu, 15 Dec 2005)
New Revision: 258
Log:
update projects to account for recent work done on performance.
Modified:
trunk/help/projects.html
Modified: trunk/help/projects.html
===================================================================
--- trunk/help/projects.html	2005-12-14 14:14:26 UTC (rev 257)
+++ trunk/help/projects.html	2005-12-15 19:52:37 UTC (rev 258)
@@ -26,43 +26,11 @@

 <h2>Software Infrastructure</h2>

-<h3>Profiling Valgrind</h3>
-<p>Valgrind used to have a built-in tick-based profiler, but it stopped
-working about two years ago and we've had no way of profiling it since,
-because neither gprof nor gcov work with Valgrind.</p>
+<h3>Improving the Performance Testing Suite</h3>
+<p>We have a growing suite of performance benchmarks in the perf/ directory,
+which would benefit from more programs.  We have a mix of two kinds of
+programs:</p>

-<p>Recently self-hosting started working well enough to use Cachegrind on
-Valgrind (see the README_DEVELOPERS file for how to do this), but no
-thorough performance investigation has yet been done.  This would be very
-valuable because Valgrind's speed is the single biggest cause of user
-complaints.</p>
-
-<p>The only problem with using Cachegrind on Valgrind is that it's hard to
-profile big programs due to the double-slowdown.  If you know of another
-profiler that will work with Valgrind (OProfile perhaps?) that could be
-helpful.  (Added August 27, 2005; updated November 11, 2005)</p>
-
-
-<h3>Performance regression testing</h3>
-<p>We currently have some scripts to run the regression tests nightly on
-a range of machines.  This is very useful for spotting correctness
-regressions.  Equally useful would be a system for spotting performance
-regressions (or improvements).</p>
-
-<p>This would involve running the Valgrind tools on a given suite of
-programs, and recording how long they take to run.  Or, perhaps better
-would be recording how much slower than normal the programs run under
-Valgrind; that metric would be more robust if the compiler or system
-libraries on the test machine changed.</p>
-
-<p>The nightly measurements should be kept and ideally there would be a
-system for producing graphs that show the performance changes over time.
-You'd have to specify somehow where the previous measurements would be
-stored, perhaps that would be a command line argument to the script.</p>
-
-<p>Choosing the programs for the test suite would be challenging.
-Ideally we'd have a mix of two kinds of programs:</p>
-
 <ol>
 <li><p>Real programs.  Ones like the SPEC2000 benchmarks would be ideal,
 but they are not free.</p></li>
@@ -74,10 +42,31 @@
 performance bug in Valgrind's heap allocator.</p></li>
 </ol>

+The programs should be fairly small (preferably a few thousand lines at
+most).  Any new programs added should add something to the suite that the
+current programs don't provide.  Programs showing performance bugs (ie.
+cases where Valgrind does very badly) are particularly welcome.
+(Added August 27, 2005; updated December 15, 2005)</p>
+
+
+<h3>Performance regression testing</h3>
+<p>We currently have some scripts to run the regression tests nightly on
+a range of machines.  This is very useful for spotting correctness
+regressions.  Equally useful would be an automated system for spotting
+performance regressions (or improvements).</p>
+
+<p>It would be nice to have scripts to run the performance benchmarks every
+night.  The nightly measurements should be kept and ideally there would be a
+system for producing graphs that show the performance changes over time.
+You'd have to specify somehow where the previous measurements would be
+stored, perhaps that would be a command line argument to the script.</p>
+
 <p>The scripts in nightly/ for doing the nightly regression tests would be the
-right place to start on this.  (Added August 27, 2005)</p>
+right place to start on this.  (Added August 27, 2005; updated December 15,
+2005)</p>


+
 <h3>Regression test brittleness</h3>
 <p>Valgrind's regression test suite (run with "make regtest") is extremely
 useful.  The scripts in nightly/ are used on various test machines to
|
From: <sv...@va...> - 2005-12-15 19:41:28
|
Author: njn
Date: 2005-12-15 19:41:14 +0000 (Thu, 15 Dec 2005)
New Revision: 5349
Log:
add note about recent performance improvement
Modified:
trunk/docs/internals/performance.txt
Modified: trunk/docs/internals/performance.txt
===================================================================
--- trunk/docs/internals/performance.txt	2005-12-15 17:22:37 UTC (rev 5348)
+++ trunk/docs/internals/performance.txt	2005-12-15 19:41:14 UTC (rev 5349)
@@ -14,6 +14,11 @@
 - Nick improved vg_SP_update_pass() to identify more small constant
   increments/decrements of SP, so the fast cases can be used more often.
   Saved 1--3% on a few programs.
+- r5345,r5346: Julian improved the dispatcher so that x86 and AMD64 use
+  jumps instead of call/return for calling translations, and also removed
+  the --profile-flags profiling from the dispatcher unless --profile-flags
+  is being used.  Improved Nulgrind performance typically by 10--20%,
+  and Memcheck performance typically by 2--20%.

 COMPVBITS branch:
 - Nick converted to compress V bits, initial version saved 0--5% on most
|
From: <sv...@va...> - 2005-12-15 17:22:40
|
Author: njn
Date: 2005-12-15 17:22:37 +0000 (Thu, 15 Dec 2005)
New Revision: 5348
Log:
Improvements to vg_perf:
- show percentage speedup over the first Valgrind when comparing multiple
  Valgrinds
- don't accept --reps < 0
- avoid div-by-zero if the runtime is measured as zero
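A compact C rendering of the two guards described here (illustrative names;
the committed code is the Perl below):

   #include <stdio.h>

   /* Clamp a measured time of zero to 0.01s so later divisions are safe. */
   static double clamp_time(double tmin)
   {
      return tmin == 0.0 ? 0.01 : tmin;
   }

   /* Print "-----" for the baseline run, else the percentage speedup. */
   static void print_speedup(double t_first, double t_this, int is_first)
   {
      if (is_first)
         printf(" -----) ");
      else
         printf("%5.1f%%) ", 100.0 - 100.0 * t_this / t_first);
   }

   int main(void)
   {
      double t1 = clamp_time(9.4), t2 = clamp_time(7.2);
      print_speedup(t1, t1, 1);   /* baseline */
      print_speedup(t1, t2, 0);   /* about 23.4% */
      return 0;
   }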
Modified:
trunk/perf/vg_perf.in
Modified: trunk/perf/vg_perf.in
===================================================================
--- trunk/perf/vg_perf.in 2005-12-15 16:11:25 UTC (rev 5347)
+++ trunk/perf/vg_perf.in 2005-12-15 17:22:37 UTC (rev 5348)
@@ -153,6 +153,7 @@
      $alldirs = 1;
   } elsif ($arg =~ /^--reps=(\d+)$/) {
      $n_reps = $1;
+     if ($n_reps < 1) { die "bad --reps value: $n_reps\n"; }
   } elsif ($arg =~ /^--vg=(.+)$/) {
# Make dir absolute if not already
add_vgdir($1);
@@ -255,7 +256,8 @@
die "\n*** missing usertime in perf.stderr\n";
      $tmin = $1 if ($1 < $tmin);
   }
-   return $tmin;
+   # Avoid divisions by zero!
+   return (0 == $tmin ? 0.01 : $tmin);
}

sub do_one_test($$)
@@ -263,6 +265,8 @@
   my ($dir, $vgperf) = @_;
   $vgperf =~ /^(.*)\.vgperf/;
   my $name = $1;
+ my %first_tTool; # For doing percentage speedups when comparing
+ # multiple Valgrinds
=20
read_vgperf_file($vgperf);
=20
@@ -307,8 +311,20 @@
. "$vgopts ";
            my $cmd   = "$vgsetup $timecmd $vgcmd $prog $args";
            my $tTool = time_prog($cmd, $n_reps);
- printf("%4.1fs (%4.1fx) ", $tTool, $tTool/$tNative);
+ printf("%4.1fs (%4.1fx,", $tTool, $tTool/$tNative);
=20
+         # If it's the first timing for this tool on this benchmark,
+         # record the time so we can get the percentage speedup of the
+         # subsequent Valgrinds.  Otherwise, compute and print
+         # the speedup.
+         if (not defined $first_tTool{$tool}) {
+            $first_tTool{$tool} = $tTool;
+            print(" -----) ");
+         } else {
+            my $speedup = 100 - (100 * $tTool / $first_tTool{$tool});
+            printf("%5.1f%%) ", $speedup);
+         }
+ }
+
$num_timings_done++;
=20
if (defined $cleanup) {
|
|
From: <sv...@va...> - 2005-12-15 16:11:27
|
Author: sewardj
Date: 2005-12-15 16:11:25 +0000 (Thu, 15 Dec 2005)
New Revision: 5347
Log:
Make this work on platforms where r != x.
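A sketch of the idea, on the assumed reading that "r != x" means readable
memory is not necessarily executable (NX): a static char array may not be
jumpable-to, so the code buffer must come from mmap with PROT_EXEC.
Illustrative only; the actual change is the diff below:

   #include <assert.h>
   #include <stddef.h>
   #include <sys/mman.h>

   static void* alloc_code_buffer(size_t nbytes)
   {
      /* Request write+exec pages; note the patch below passes (0,0) for
         the fd/offset pair where -1,0 is the more portable choice. */
      void* p = mmap(0, nbytes, PROT_READ|PROT_WRITE|PROT_EXEC,
                     MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
      assert(p != MAP_FAILED);
      return p;
   }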
Modified:
trunk/perf/bigcode.c
Modified: trunk/perf/bigcode.c
===================================================================
--- trunk/perf/bigcode.c 2005-12-15 15:46:43 UTC (rev 5346)
+++ trunk/perf/bigcode.c 2005-12-15 16:11:25 UTC (rev 5347)
@@ -9,6 +9,8 @@
=20
#include <stdio.h>
#include <string.h>
+#include <assert.h>
+#include <sys/mman.h>
=20
#define FN_SIZE   996    // Must be big enough to hold the compiled f()
#define N_LOOPS 20000 // Should be divisible by four
@@ -28,13 +30,16 @@
return y;
}
=20
-static char a[FN_SIZE * N_LOOPS];
-
int main(int argc, char* argv[])
{
   int h, i, sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0;
int n_fns, n_reps;
=20
+   char* a = mmap(0, FN_SIZE * N_LOOPS,
+                  PROT_EXEC|PROT_WRITE,
+                  MAP_PRIVATE|MAP_ANONYMOUS, 0,0);
+   assert(a != (char*)MAP_FAILED);
+
if (argc <=3D 1) {
// Mode 1: not so much code
      n_fns = N_LOOPS / RATIO;
|
|
From: <sv...@va...> - 2005-12-15 15:46:54
|
Author: sewardj
Date: 2005-12-15 15:46:43 +0000 (Thu, 15 Dec 2005)
New Revision: 5346
Log:
Rewrite amd64 dispatch loop to add performance enhancements as per x86
reorganisation of r5345.
Modified:
trunk/coregrind/m_dispatch/dispatch-amd64-linux.S
Modified: trunk/coregrind/m_dispatch/dispatch-amd64-linux.S
===================================================================
--- trunk/coregrind/m_dispatch/dispatch-amd64-linux.S	2005-12-15 14:07:07 UTC (rev 5345)
+++ trunk/coregrind/m_dispatch/dispatch-amd64-linux.S	2005-12-15 15:46:43 UTC (rev 5346)
@@ -1,8 +1,8 @@

-##--------------------------------------------------------------------##
-##--- The core dispatch loop, for jumping to a code address. ---##
-##--- dispatch-amd64.S ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- The core dispatch loop, for jumping to a code address. ---*/
+/*--- dispatch-amd64.S ---*/
+/*--------------------------------------------------------------------*/

/*
This file is part of Valgrind, a dynamic binary instrumentation
@@ -39,11 +39,19 @@
/*--- The dispatch loop. ---*/
/*------------------------------------------------------------*/

-/* signature: UWord VG_(run_innerloop) ( void* guest_state ) */
+/*----------------------------------------------------*/
+/*--- Preamble (set everything up) ---*/
+/*----------------------------------------------------*/

+/* signature:
+UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+*/
+
+.text
.globl VG_(run_innerloop)
VG_(run_innerloop):
/* %rdi holds guest_state */
+ /* %rsi holds do_profiling */

/* ----- entry point to VG_(run_innerloop) ----- */
pushq %rbx
@@ -59,12 +67,13 @@
pushq %r13
pushq %r14
pushq %r15
- pushq %rdi
+ pushq %rdi /* guest_state */

- movq VG_(dispatch_ctr)@GOTPCREL(%rip), %rsi
- pushq (%rsi)
+ movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15
+ movl (%r15), %r15d
+ pushq %r15

- /* 8(%rsp) holds cached copy of guest_state */
+ /* 8(%rsp) holds cached copy of guest_state ptr */
/* 0(%rsp) holds cached copy of VG_(dispatch_ctr) */

/* Set up the guest state pointer */
@@ -90,12 +99,26 @@
/* set dir flag to known value */
cld

- /* fall into main loop */
+ /* fall into main loop (the right one) */
+ cmpq $0, %rsi
+ je VG_(run_innerloop__dispatch_unprofiled)
+ jmp VG_(run_innerloop__dispatch_profiled)
+ /*NOTREACHED*/

- /* Here, %rax is the only live (real) register. The entire
- simulated state is saved in the ThreadState. */
+/*----------------------------------------------------*/
+/*--- NO-PROFILING (standard) dispatcher ---*/
+/*----------------------------------------------------*/

-dispatch_boring:
+.align 16
+.global VG_(run_innerloop__dispatch_unprofiled)
+VG_(run_innerloop__dispatch_unprofiled):
+ /* AT ENTRY: %rax is next guest addr, %rbp is possibly
+ modified guest state ptr */
+
+ /* Has the guest state pointer been messed with? If yes, exit. */
+ cmpq 8(%rsp), %rbp
+ jnz gsp_changed
+
/* save the jump address in the guest state */
movq %rax, OFFSET_amd64_RIP(%rbp)

@@ -104,40 +127,99 @@
jz counter_is_zero

/* try a fast lookup in the translation cache */
- movq %rax, %rbx
- andq $VG_TT_FAST_MASK, %rbx
- movq VG_(tt_fast)@GOTPCREL(%rip), %rcx
- movq (%rcx,%rbx,8), %rcx
- cmpq %rax, (%rcx)
- jnz fast_lookup_failed
- /* increment bb profile counter */
- movq VG_(tt_fastN)@GOTPCREL(%rip), %rdx
- movq (%rdx,%rbx,8), %rdx
- incl (%rdx)
+ movq VG_(tt_fast)@GOTPCREL(%rip), %rcx
+ movq %rax, %rbx
+ andq $VG_TT_FAST_MASK, %rbx
+ movq (%rcx,%rbx,8), %rcx
+ cmpq %rax, (%rcx)
+ jnz fast_lookup_failed

/* Found a match. Call tce[1], which is 8 bytes along, since
each tce element is a 64-bit int. */
addq $8, %rcx
- call *%rcx
+ jmp *%rcx
+ ud2 /* persuade insn decoders not to speculate past here */
+ /* generated code should run, then jump back to
+ VG_(run_innerloop__dispatch_unprofiled). */
+ /*NOTREACHED*/

- /*
- %rax holds destination (original) address.
- %rbp indicates further details of the control transfer
- requested to the address in %rax.
-
- If rbp is unchanged (== * 8(%rsp)), just jump next to %rax.
+/*----------------------------------------------------*/
+/*--- PROFILING dispatcher (can be much slower) ---*/
+/*----------------------------------------------------*/

- Otherwise fall out, back to the scheduler, and let it
- figure out what to do next.
- */
+.align 16
+.global VG_(run_innerloop__dispatch_profiled)
+VG_(run_innerloop__dispatch_profiled):
+ /* AT ENTRY: %rax is next guest addr, %rbp is possibly
+ modified guest state ptr */

+ /* Has the guest state pointer been messed with? If yes, exit. */
cmpq 8(%rsp), %rbp
- jz dispatch_boring
+ jnz gsp_changed

- jmp dispatch_exceptional
+ /* save the jump address in the guest state */
+ movq %rax, OFFSET_amd64_RIP(%rbp)

+ /* Are we out of timeslice? If yes, defer to scheduler. */
+ subl $1, 0(%rsp)
+ jz counter_is_zero

+ /* try a fast lookup in the translation cache */
+ movq VG_(tt_fast)@GOTPCREL(%rip), %rcx
+ movq %rax, %rbx
+ andq $VG_TT_FAST_MASK, %rbx
+ movq (%rcx,%rbx,8), %rcx
+ cmpq %rax, (%rcx)
+ jnz fast_lookup_failed

+ /* increment bb profile counter */
+ movq VG_(tt_fastN)@GOTPCREL(%rip), %rdx
+ movq (%rdx,%rbx,8), %rdx
+ addl $1, (%rdx)
+
+ /* Found a match. Call tce[1], which is 8 bytes along, since
+ each tce element is a 64-bit int. */
+ addq $8, %rcx
+ jmp *%rcx
+ ud2 /* persuade insn decoders not to speculate past here */
+ /* generated code should run, then jump back to
+ VG_(run_innerloop__dispatch_profiled). */
+ /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- exit points ---*/
+/*----------------------------------------------------*/
+
+gsp_changed:
+ /* Someone messed with the gsp. Have to
+ defer to scheduler to resolve this. dispatch ctr
+ is not yet decremented, so no need to increment. */
+ /* %RIP is NOT up to date here. First, need to write
+ %rax back to %RIP, but without trashing %rbp since
+ that holds the value we want to return to the scheduler.
+ Hence use %r15 transiently for the guest state pointer. */
+ movq 8(%rsp), %r15
+ movq %rax, OFFSET_amd64_RIP(%r15)
+ movq %rbp, %rax
+ jmp run_innerloop_exit
+ /*NOTREACHED*/
+
+counter_is_zero:
+ /* %RIP is up to date here */
+ /* back out decrement of the dispatch counter */
+ addl $1, 0(%rsp)
+ movq $VG_TRC_INNER_COUNTERZERO, %rax
+ jmp run_innerloop_exit
+
+fast_lookup_failed:
+ /* %RIP is up to date here */
+ /* back out decrement of the dispatch counter */
+ addl $1, 0(%rsp)
+ movq $VG_TRC_INNER_FASTMISS, %rax
+ jmp run_innerloop_exit
+
+
+
/* All exits from the dispatcher go through here. %rax holds
the return value.
*/
@@ -150,14 +232,14 @@
pushq $0
fstcw (%rsp)
cmpl $0x027F, (%rsp)
- popq %r11 /* get rid of the word without trashing %eflags */
+ popq %r15 /* get rid of the word without trashing %eflags */
jnz invariant_violation
#endif
pushq $0
stmxcsr (%rsp)
andl $0xFFFFFFC0, (%rsp) /* mask out status flags */
cmpl $0x1F80, (%rsp)
- popq %r11
+ popq %r15
jnz invariant_violation
/* otherwise we're OK */
jmp run_innerloop_exit_REALLY
@@ -167,8 +249,12 @@
jmp run_innerloop_exit_REALLY

run_innerloop_exit_REALLY:
- movq VG_(dispatch_ctr)@GOTPCREL(%rip), %rsi
- popq (%rsi)
+
+ /* restore VG_(dispatch_ctr) */
+ popq %r14
+ movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15
+ movl %r14d, (%r15)
+
popq %rdi
popq %r15
popq %r14
@@ -190,31 +276,13 @@
/* Other ways of getting out of the inner loop. Placed out-of-line to
make it look cleaner.
*/
-dispatch_exceptional:
- /* this is jumped to only, not fallen-through from above */

- /* save %rax in %RIP and defer to sched */
- movq 8(%rsp), %rdi
- movq %rax, OFFSET_amd64_RIP(%rdi)
- movq %rbp, %rax
- jmp run_innerloop_exit

-fast_lookup_failed:
- /* %RIP is up to date here since dispatch_boring dominates */
- addl $1, 0(%rsp)
- movq $VG_TRC_INNER_FASTMISS, %rax
- jmp run_innerloop_exit

-counter_is_zero:
- /* %RIP is up to date here since dispatch_boring dominates */
- addl $1, 0(%rsp)
- movq $VG_TRC_INNER_COUNTERZERO, %rax
- jmp run_innerloop_exit

-
/* Let the linker know we don't need an executable stack */
.section .note.GNU-stack,"",@progbits

-##--------------------------------------------------------------------##
-##--- end ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- end ---*/
+/*--------------------------------------------------------------------*/
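
Both dispatcher variants above perform the same fast-path lookup, which is easier to read in C than in assembly. A hedged sketch (the typedef, array declaration, and mask value are illustrative stand-ins, not the real transtab declarations):

   typedef unsigned long long ULong;

   #define TT_FAST_MASK 0xFFFFULL            /* stand-in for VG_TT_FAST_MASK */
   extern ULong* tt_fast[TT_FAST_MASK + 1];  /* stand-in for VG_(tt_fast) */

   /* Each cache entry holds the guest address in word 0 and the
      translated host code from word 1 onwards, which is why the
      assembly adds 8 to %rcx before jumping. */
   static void* fast_lookup(ULong guest_addr)
   {
      ULong* tce = tt_fast[guest_addr & TT_FAST_MASK];
      if (tce[0] != guest_addr)
         return 0;               /* miss: exit with VG_TRC_INNER_FASTMISS */
      return (void*)&tce[1];     /* hit: jump here */
   }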
|
|
From: <sv...@va...> - 2005-12-15 15:45:27
|
Author: sewardj
Date: 2005-12-15 15:45:20 +0000 (Thu, 15 Dec 2005)
New Revision: 1495
Log:
Modify amd64 backend to use jump-jump scheme rather than call-return
scheme.
Modified:
trunk/priv/host-amd64/hdefs.c
trunk/priv/host-amd64/hdefs.h
trunk/priv/host-x86/hdefs.c
trunk/priv/main/vex_main.c
Modified: trunk/priv/host-amd64/hdefs.c
===================================================================
--- trunk/priv/host-amd64/hdefs.c 2005-12-15 14:02:34 UTC (rev 1494)
+++ trunk/priv/host-amd64/hdefs.c 2005-12-15 15:45:20 UTC (rev 1495)
@@ -1091,14 +1091,16 @@
vex_printf("if (%%rflags.%s) { ",
showAMD64CondCode(i->Ain.Goto.cond));
}
- if (i->Ain.Goto.jk != Ijk_Boring) {
+ if (i->Ain.Goto.jk != Ijk_Boring
+ && i->Ain.Goto.jk != Ijk_Call
+ && i->Ain.Goto.jk != Ijk_Ret) {
vex_printf("movl $");
ppIRJumpKind(i->Ain.Goto.jk);
vex_printf(",%%ebp ; ");
}
vex_printf("movq ");
ppAMD64RI(i->Ain.Goto.dst);
- vex_printf(",%%rax ; ret");
+ vex_printf(",%%rax ; movabsq $dispatcher_addr,%%rdx ; jmp *%%rdx");
if (i->Ain.Goto.cond != Acc_ALWAYS) {
vex_printf(" }");
}
@@ -1447,8 +1449,13 @@
return;
case Ain_Goto:
addRegUsage_AMD64RI(u, i->Ain.Goto.dst);
- addHRegUse(u, HRmWrite, hregAMD64_RAX());
- if (i->Ain.Goto.jk != Ijk_Boring)
+ addHRegUse(u, HRmWrite, hregAMD64_RAX()); /* used for next guest addr */
+ addHRegUse(u, HRmWrite, hregAMD64_RDX()); /* used for dispatcher addr */
+ if (i->Ain.Goto.jk != Ijk_Boring
+ && i->Ain.Goto.jk != Ijk_Call
+ && i->Ain.Goto.jk != Ijk_Ret)
+ /* note, this is irrelevant since rbp is not actually
+ available to the allocator. But still .. */
addHRegUse(u, HRmWrite, hregAMD64_RBP());
return;
case Ain_CMov64:
@@ -2200,7 +2207,8 @@
Note that buf is not the insn's final place, and therefore it is
imperative to emit position-independent code. */

-Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr* i, Bool mode64 )
+Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr* i,
+ Bool mode64, void* dispatch )
{
UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
UInt xtra;
@@ -2638,13 +2646,24 @@
}
}

- /* ret */
- *p++ = 0xC3;
+ /* Get the dispatcher address into %rdx. This has to happen
+ after the load of %rax since %rdx might be carrying the value
+ destined for %rax immediately prior to this Ain_Goto. */
+ vassert(sizeof(ULong) == sizeof(void*));
+ vassert(dispatch != NULL);
+ /* movabsq $imm64, %rdx */
+ *p++ = 0x48;
+ *p++ = 0xBA;
+ p = emit64(p, Ptr_to_ULong(dispatch));

+ /* jmp *%rdx */
+ *p++ = 0xFF;
+ *p++ = 0xE2;
+
/* Fix up the conditional jump, if there was one. */
if (i->Ain.Goto.cond != Acc_ALWAYS) {
Int delta = p - ptmp;
- vassert(delta > 0 && delta < 20);
+ vassert(delta > 0 && delta < 30);
*ptmp = toUChar(delta-1);
}
goto done;
Modified: trunk/priv/host-amd64/hdefs.h
===================================================================
--- trunk/priv/host-amd64/hdefs.h 2005-12-15 14:02:34 UTC (rev 1494)
+++ trunk/priv/host-amd64/hdefs.h 2005-12-15 15:45:20 UTC (rev 1495)
@@ -715,7 +715,8 @@
extern void getRegUsage_AMD64Instr ( HRegUsage*, AMD64Instr*, Bool );
extern void mapRegs_AMD64Instr ( HRegRemap*, AMD64Instr*, Bool );
extern Bool isMove_AMD64Instr ( AMD64Instr*, HReg*, HReg* );
-extern Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr*, Bool );
+extern Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr*,
+ Bool, void* dispatch );
extern AMD64Instr* genSpill_AMD64 ( HReg rreg, Int offset, Bool);
extern AMD64Instr* genReload_AMD64 ( HReg rreg, Int offset, Bool);
extern void getAllocableRegs_AMD64 ( Int*, HReg** );
Modified: trunk/priv/host-x86/hdefs.c
===================================================================
--- trunk/priv/host-x86/hdefs.c 2005-12-15 14:02:34 UTC (rev 1494)
+++ trunk/priv/host-x86/hdefs.c 2005-12-15 15:45:20 UTC (rev 1495)
@@ -2212,7 +2212,7 @@
vassert(dispatch != NULL);
/* movl $imm32, %edx */
*p++ = 0xBA;
- p = emit32(p, (UInt)dispatch);
+ p = emit32(p, (UInt)Ptr_to_ULong(dispatch));

/* jmp *%edx */
*p++ = 0xFF;
Modified: trunk/priv/main/vex_main.c
===================================================================
--- trunk/priv/main/vex_main.c 2005-12-15 14:02:34 UTC (rev 1494)
+++ trunk/priv/main/vex_main.c 2005-12-15 15:45:20 UTC (rev 1495)
@@ -250,7 +250,7 @@
ppInstr = (void(*)(HInstr*, Bool)) ppX86Instr;
ppReg = (void(*)(HReg)) ppHRegX86;
iselBB = iselBB_X86;
- emit = emit_X86Instr;
+ emit = (Int(*)(UChar*,Int,HInstr*,Bool,void*)) emit_X86Instr;
host_is_bigendian = False;
host_word_type = Ity_I32;
vassert(vta->archinfo_host.subarch == VexSubArchX86_sse0
@@ -271,7 +271,7 @@
ppInstr = (void(*)(HInstr*, Bool)) ppAMD64Instr;
ppReg = (void(*)(HReg)) ppHRegAMD64;
iselBB = iselBB_AMD64;
- emit = (Int(*)(UChar*,Int,HInstr*, Bool)) emit_AMD64Instr;
+ emit = (Int(*)(UChar*,Int,HInstr*,Bool,void*)) emit_AMD64Instr;
host_is_bigendian = False;
host_word_type = Ity_I64;
vassert(vta->archinfo_host.subarch == VexSubArch_NONE);
@@ -290,7 +290,7 @@
ppInstr = (void(*)(HInstr*,Bool)) ppPPC32Instr;
ppReg = (void(*)(HReg)) ppHRegPPC32;
iselBB = iselBB_PPC32;
- emit = (Int(*)(UChar*,Int,HInstr*,Bool)) emit_PPC32Instr;
+ emit = (Int(*)(UChar*,Int,HInstr*,Bool,void*)) emit_PPC32Instr;
host_is_bigendian = True;
host_word_type = Ity_I32;
vassert(vta->archinfo_guest.subarch == VexSubArchPPC32_I
@@ -311,7 +311,7 @@
ppInstr = (void(*)(HInstr*, Bool)) ppPPC32Instr;
ppReg = (void(*)(HReg)) ppHRegPPC32;
iselBB = iselBB_PPC32;
- emit = (Int(*)(UChar*,Int,HInstr*, Bool)) emit_PPC32Instr;
+ emit = (Int(*)(UChar*,Int,HInstr*,Bool,void*)) emit_PPC32Instr;
host_is_bigendian = True;
host_word_type = Ity_I64;
vassert(vta->archinfo_guest.subarch == VexSubArchPPC64_FI
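
At the byte level, the change above swaps a one-byte ret (0xC3) for a ten-byte movabsq plus a two-byte indirect jump, which is why the conditional-jump fixup bound grows from 20 to 30. A self-contained sketch of the emitted sequence (buffer management simplified; emit64 in the patch does what the memcpy does here):

   #include <string.h>

   typedef unsigned char UChar;
   typedef unsigned long long ULong;

   /* Emit "movabsq $dispatch, %rdx ; jmp *%rdx" at p and return the
      next free byte. 0x48 0xBA is REX.W + MOV rdx, imm64; 0xFF 0xE2
      is JMP *%rdx. The immediate is stored in host byte order, which
      on an amd64 host is the little-endian order the insn expects. */
   static UChar* emit_jmp_to_dispatcher(UChar* p, void* dispatch)
   {
      ULong imm = (ULong)dispatch;
      *p++ = 0x48;
      *p++ = 0xBA;
      memcpy(p, &imm, sizeof imm);
      p += sizeof imm;
      *p++ = 0xFF;
      *p++ = 0xE2;
      return p;
   }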
|
|
From: <sv...@va...> - 2005-12-15 14:07:18
|
Author: sewardj
Date: 2005-12-15 14:07:07 +0000 (Thu, 15 Dec 2005)
New Revision: 5345
Log:
- Track vex r1494 (x86/amd64 change of conventions for getting
to translations and back to dispatcher, and also different arg
passing conventions to LibVEX_Translate).
- Rewrite x86 dispatcher to not increment the profiling counters
unless requested by the user. This dramatically reduces the
D1 miss rate and gives considerable performance improvement
on x86. Also, restructure and add comments to dispatch-x86-linux.S
to make it much easier to follow (imo).
amd64/ppc32/ppc64 fixes to follow.
Modified:
trunk/coregrind/m_dispatch/dispatch-x86-linux.S
trunk/coregrind/m_scheduler/scheduler.c
trunk/coregrind/m_translate.c
trunk/coregrind/pub_core_dispatch.h
Modified: trunk/coregrind/m_dispatch/dispatch-x86-linux.S
===================================================================
--- trunk/coregrind/m_dispatch/dispatch-x86-linux.S 2005-12-14 22:24:45 UTC (rev 5344)
+++ trunk/coregrind/m_dispatch/dispatch-x86-linux.S 2005-12-15 14:07:07 UTC (rev 5345)
@@ -1,8 +1,8 @@

-##--------------------------------------------------------------------##
-##--- The core dispatch loop, for jumping to a code address. ---##
-##--- dispatch-x86.S ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- The core dispatch loop, for jumping to a code address. ---*/
+/*--- dispatch-x86.S ---*/
+/*--------------------------------------------------------------------*/

/*
This file is part of Valgrind, a dynamic binary instrumentation
@@ -39,11 +39,18 @@
/*--- The dispatch loop. ---*/
/*------------------------------------------------------------*/

-/* signature: UWord VG_(run_innerloop) ( void* guest_state ) */
+/*----------------------------------------------------*/
+/*--- Preamble (set everything up) ---*/
+/*----------------------------------------------------*/

+/* signature:
+UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+*/
+.text
.globl VG_(run_innerloop)
VG_(run_innerloop):
/* 4(%esp) holds guest_state */
+ /* 8(%esp) holds do_profiling */

/* ----- entry point to VG_(run_innerloop) ----- */
pushl %ebx
@@ -54,6 +61,7 @@
pushl %ebp

/* 28(%esp) holds guest_state */
+ /* 32(%esp) holds do_profiling */

/* Set up the guest state pointer */
movl 28(%esp), %ebp
@@ -80,53 +88,128 @@
/* set dir flag to known value */
cld

- /* fall into main loop */
+ /* fall into main loop (the right one) */
+ cmpl $0, 32(%esp) /* do_profiling */
+ je VG_(run_innerloop__dispatch_unprofiled)
+ jmp VG_(run_innerloop__dispatch_profiled)
+ /*NOTREACHED*/

- /* Here, %eax is the only live (real) register. The entire
- simulated state is saved in the ThreadState. */
+/*----------------------------------------------------*/
+/*--- NO-PROFILING (standard) dispatcher ---*/
+/*----------------------------------------------------*/

-dispatch_boring:
+.align 16
+.global VG_(run_innerloop__dispatch_unprofiled)
+VG_(run_innerloop__dispatch_unprofiled):
+ /* AT ENTRY: %eax is next guest addr, %ebp is possibly
+ modified guest state ptr */
+
+ /* Has the guest state pointer been messed with? If yes, exit. */
+ cmpl 28(%esp), %ebp
+ jnz gsp_changed
+
/* save the jump address in the guest state */
movl %eax, OFFSET_x86_EIP(%ebp)

/* Are we out of timeslice? If yes, defer to scheduler. */
- subl $1, VG_(dispatch_ctr)
+ subl $1, VG_(dispatch_ctr)
jz counter_is_zero

/* try a fast lookup in the translation cache */
- movl %eax, %ebx
- andl $VG_TT_FAST_MASK, %ebx
- movl VG_(tt_fast)(,%ebx,4), %ecx
- cmpl %eax, (%ecx)
- jnz fast_lookup_failed
+ movl %eax, %ebx
+ andl $VG_TT_FAST_MASK, %ebx
+ movl VG_(tt_fast)(,%ebx,4), %ecx
+ cmpl %eax, (%ecx)
+ jnz fast_lookup_failed
+
+ /* Found a match. Jump to tce[1], which is 8 bytes along,
+ since each tce element is a 64-bit int. */
+ addl $8, %ecx
+ jmp *%ecx
+ ud2 /* persuade insn decoders not to speculate past here */
+ /* generated code should run, then jump back to
+ VG_(run_innerloop__dispatch_unprofiled). */
+ /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- PROFILING dispatcher (can be much slower) ---*/
+/*----------------------------------------------------*/
+
+.align 16
+.global VG_(run_innerloop__dispatch_profiled)
+VG_(run_innerloop__dispatch_profiled):
+ /* AT ENTRY: %eax is next guest addr, %ebp is possibly
+ modified guest state ptr */
+
+ /* Has the guest state pointer been messed with? If yes, exit. */
+ cmpl 28(%esp), %ebp
+ jnz gsp_changed
+
+ /* save the jump address in the guest state */
+ movl %eax, OFFSET_x86_EIP(%ebp)
+
+ /* Are we out of timeslice? If yes, defer to scheduler. */
+ subl $1, VG_(dispatch_ctr)
+ jz counter_is_zero
+
+ /* try a fast lookup in the translation cache */
+ movl %eax, %ebx
+ andl $VG_TT_FAST_MASK, %ebx
+ movl VG_(tt_fast)(,%ebx,4), %ecx
+ cmpl %eax, (%ecx)
+ jnz fast_lookup_failed
/* increment bb profile counter */
- movl VG_(tt_fastN)(,%ebx,4), %edx
+ /* note: innocuous as this sounds, it causes a huge amount more
+ stress on D1 and significantly slows everything down. */
+ movl VG_(tt_fastN)(,%ebx,4), %edx
/* Use "addl $1", not "incl", to avoid partial-flags stall on P4 */
- addl $1, (%edx)
+ addl $1, (%edx)

- /* Found a match. Call tce[1], which is 8 bytes along, since
- each tce element is a 64-bit int. */
+ /* Found a match. Jump to tce[1], which is 8 bytes along,
+ since each tce element is a 64-bit int. */
addl $8, %ecx
- call *%ecx
-
- /*
- %eax holds destination (original) address.
- %ebp indicates further details of the control transfer
- requested to the address in %eax.
-
- If ebp is unchanged (== * 28(%esp)), just jump next to %eax.
+ jmp *%ecx
+ ud2 /* persuade insn decoders not to speculate past here */
+ /* generated code should run, then jump back to
+ VG_(run_innerloop__dispatch_profiled). */
+ /*NOTREACHED*/

- Otherwise fall out, back to the scheduler, and let it
- figure out what to do next.
- */
+/*----------------------------------------------------*/
+/*--- exit points ---*/
+/*----------------------------------------------------*/

- cmpl 28(%esp), %ebp
- jz dispatch_boring
+gsp_changed:
+ /* Someone messed with the gsp. Have to
+ defer to scheduler to resolve this. dispatch ctr
+ is not yet decremented, so no need to increment. */
+ /* %EIP is NOT up to date here. First, need to write
+ %eax back to %EIP, but without trashing %ebp since
+ that holds the value we want to return to the scheduler.
+ Hence use %esi transiently for the guest state pointer. */
+ movl 28(%esp), %esi
+ movl %eax, OFFSET_x86_EIP(%esi)
+ movl %ebp, %eax
+ jmp run_innerloop_exit
+ /*NOTREACHED*/

- jmp dispatch_exceptional
+counter_is_zero:
+ /* %EIP is up to date here */
+ /* back out decrement of the dispatch counter */
+ addl $1, VG_(dispatch_ctr)
+ movl $VG_TRC_INNER_COUNTERZERO, %eax
+ jmp run_innerloop_exit
+ /*NOTREACHED*/

-
+fast_lookup_failed:
+ /* %EIP is up to date here */
+ /* back out decrement of the dispatch counter */
+ addl $1, VG_(dispatch_ctr)
+ movl $VG_TRC_INNER_FASTMISS, %eax
+ jmp run_innerloop_exit
+ /*NOTREACHED*/

+
+
/* All exits from the dispatcher go through here. %eax holds
the return value.
*/
@@ -166,36 +249,10 @@
popl %ebx
ret

-
-
-/* Other ways of getting out of the inner loop. Placed out-of-line to
- make it look cleaner.
-*/
-dispatch_exceptional:
- /* this is jumped to only, not fallen-through from above */
-
- /* save %eax in %EIP and defer to sched */
- movl 28(%esp), %edi
- movl %eax, OFFSET_x86_EIP(%edi)
- movl %ebp, %eax
- jmp run_innerloop_exit
-
-fast_lookup_failed:
- /* %EIP is up to date here since dispatch_boring dominates */
- addl $1, VG_(dispatch_ctr)
- movl $VG_TRC_INNER_FASTMISS, %eax
- jmp run_innerloop_exit
-
-counter_is_zero:
- /* %EIP is up to date here since dispatch_boring dominates */
- addl $1, VG_(dispatch_ctr)
- movl $VG_TRC_INNER_COUNTERZERO, %eax
- jmp run_innerloop_exit
-

/* Let the linker know we don't need an executable stack */
.section .note.GNU-stack,"",@progbits

-##--------------------------------------------------------------------##
-##--- end ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- end ---*/
+/*--------------------------------------------------------------------*/
Modified: trunk/coregrind/m_scheduler/scheduler.c
===================================================================
--- trunk/coregrind/m_scheduler/scheduler.c 2005-12-14 22:24:45 UTC (rev 5344)
+++ trunk/coregrind/m_scheduler/scheduler.c 2005-12-15 14:07:07 UTC (rev 5345)
@@ -428,8 +428,12 @@
vg_assert(VG_(my_fault));
VG_(my_fault) = False;

- SCHEDSETJMP(tid, jumped,
- trc = (UInt)VG_(run_innerloop)( (void*)&tst->arch.vex ));
+ SCHEDSETJMP(
+ tid,
+ jumped,
+ trc = (UInt)VG_(run_innerloop)( (void*)&tst->arch.vex,
+ VG_(clo_profile_flags) > 0 ? 1 : 0)
+ );

//nextEIP = tst->arch.m_eip;
//if (nextEIP >= VG_(client_end))
Modified: trunk/coregrind/m_translate.c
===================================================================
--- trunk/coregrind/m_translate.c 2005-12-14 22:24:45 UTC (rev 5344)
+++ trunk/coregrind/m_translate.c 2005-12-15 14:07:07 UTC (rev 5345)
@@ -32,23 +32,25 @@
#include "pub_core_basics.h"
#include "pub_core_aspacemgr.h"

-#include "pub_core_machine.h" // For VG_(machine_get_VexArchInfo)
- // and VG_(get_SP)
+#include "pub_core_machine.h" // For VG_(machine_get_VexArchInfo)
+ // and VG_(get_SP)
#include "pub_core_libcbase.h"
#include "pub_core_libcassert.h"
#include "pub_core_libcprint.h"
#include "pub_core_options.h"
#include "pub_core_profile.h"

-#include "pub_core_debuginfo.h" // Needed for pub_core_redir :(
-#include "pub_core_redir.h" // For VG_(code_redirect)()
+#include "pub_core_debuginfo.h" // Needed for pub_core_redir :(
+#include "pub_core_redir.h" // For VG_(code_redirect)()

-#include "pub_core_signals.h" // For VG_(synth_fault_{perms,mapping})()
-#include "pub_core_stacks.h" // For VG_(unknown_SP_update)()
-#include "pub_core_tooliface.h" // For VG_(tdict)
+#include "pub_core_signals.h" // For VG_(synth_fault_{perms,mapping})()
+#include "pub_core_stacks.h" // For VG_(unknown_SP_update)()
+#include "pub_core_tooliface.h" // For VG_(tdict)
#include "pub_core_translate.h"
#include "pub_core_transtab.h"
+#include "pub_core_dispatch.h" // VG_(run_innerloop__dispatch_{un}profiled)

+
/*------------------------------------------------------------*/
/*--- Stats ---*/
/*------------------------------------------------------------*/
@@ -569,6 +571,7 @@
VexArch vex_arch;
VexArchInfo vex_archinfo;
VexGuestExtents vge;
+ VexTranslateArgs vta;
VexTranslateResult tres;

/* Make sure Vex is initialised right. */
@@ -690,25 +693,41 @@
/* Set up closure arg for "chase_into_ok" */
chase_into_ok__CLOSURE_tid = tid;

- tres = LibVEX_Translate (
- vex_arch, &vex_archinfo,
- vex_arch, &vex_archinfo,
- (UChar*)ULong_to_Ptr(orig_addr),
- (Addr64)orig_addr,
- (Addr64)orig_addr_noredir,
- chase_into_ok,
- &vge,
- tmpbuf, N_TMPBUF, &tmpbuf_used,
- VG_(tdict).tool_instrument,
- need_to_handle_SP_assignment()
- ? vg_SP_update_pass
- : NULL,
- True, /* cleanup after instrumentation */
- do_self_check,
- NULL,
- verbosity
- );
+ vta.arch_guest = vex_arch;
+ vta.archinfo_guest = vex_archinfo;
+ vta.arch_host = vex_arch;
+ vta.archinfo_host = vex_archinfo;
+ vta.guest_bytes = (UChar*)ULong_to_Ptr(orig_addr);
+ vta.guest_bytes_addr = (Addr64)orig_addr;
+ vta.guest_bytes_addr_noredir = (Addr64)orig_addr_noredir;
+ vta.chase_into_ok = chase_into_ok;
+ vta.guest_extents = &vge;
+ vta.host_bytes = tmpbuf;
+ vta.host_bytes_size = N_TMPBUF;
+ vta.host_bytes_used = &tmpbuf_used;
+ vta.instrument1 = VG_(tdict).tool_instrument;
+ vta.instrument2 = need_to_handle_SP_assignment()
+ ? vg_SP_update_pass
+ : NULL;
+ vta.do_self_check = do_self_check;
+ vta.traceflags = verbosity;

+ /* Set up the dispatch-return info. For archs without a link
+ register, vex generates a jump back to the specified dispatch
+ address. Else, it just generates a branch-to-LR. */
+# if defined(VGA_x86) || defined(VGA_amd64)
+ vta.dispatch = VG_(clo_profile_flags) > 0
+ ? (void*) &VG_(run_innerloop__dispatch_profiled)
+ : (void*) &VG_(run_innerloop__dispatch_unprofiled);
+# elif defined(VGA_ppc32) || defined(VGA_ppc64)
+ vta.dispatch = NULL;
+# else
+# error "Unknown arch"
+# endif
+
+ /* Sheesh. Finally, actually _do_ the translation! */
+ tres = LibVEX_Translate ( &vta );
+
vg_assert(tres == VexTransOK);
vg_assert(tmpbuf_used <= N_TMPBUF);
vg_assert(tmpbuf_used > 0);
Modified: trunk/coregrind/pub_core_dispatch.h
===================================================================
--- trunk/coregrind/pub_core_dispatch.h 2005-12-14 22:24:45 UTC (rev 5344)
+++ trunk/coregrind/pub_core_dispatch.h 2005-12-15 14:07:07 UTC (rev 5345)
@@ -50,12 +50,24 @@
signal, for example SIGSEGV, in which case control longjmp()s back past
here.

+ If do_profiling is nonzero, the profile counters arrays should be
+ updated for each translation run.
+
This code simply handles the common case fast -- when the translation
address is found in the translation cache. For anything else, the
scheduler does the work.
*/
-extern UWord VG_(run_innerloop) ( void* guest_state );
+extern
+UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );

+#if defined(VGA_x86) || defined(VGA_amd64)
+/* We need to locate a couple of labels inside VG_(run_innerloop), so
+ that Vex can add branches to them from generated code. Hence the
+ following somewhat bogus decls. At least on x86 and amd64. */
+extern void VG_(run_innerloop__dispatch_unprofiled);
+extern void VG_(run_innerloop__dispatch_profiled);
+#endif
+
#endif // __PUB_CORE_DISPATCH_H

/*--------------------------------------------------------------------*/
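
The "somewhat bogus decls" in pub_core_dispatch.h above rely on a common trick: an assembly label has no sensible C type, so it is declared as an extern void object purely so that taking its address yields the code address for Vex to jump to (a GNU C extension). A hedged sketch of the pattern, with shortened stand-in names:

   /* Labels defined in the dispatcher's .S file. Objects of type
      'void' cannot exist, but '&label' still evaluates to the
      label's address under GNU C. */
   extern void dispatch_unprofiled;   /* stand-ins for the VG_(...) labels */
   extern void dispatch_profiled;

   static void* choose_dispatch(int profiling)
   {
      return profiling ? (void*)&dispatch_profiled
                       : (void*)&dispatch_unprofiled;
   }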
|
|
From: <sv...@va...> - 2005-12-15 14:02:41
|
Author: sewardj
Date: 2005-12-15 14:02:34 +0000 (Thu, 15 Dec 2005)
New Revision: 1494
Log:
- x86 back end: change code generation convention, so that instead of
dispatchers CALLing generated code which later RETs, dispatchers
jump to generated code and it jumps back to the dispatcher. This
removes two memory references per translation run and by itself
gives a measurable performance improvement on P4. As a result,
there is new plumbing so that the caller of LibVEX_Translate can
supply the address of the dispatcher to jump back to.
This probably breaks all other targets. Do not update.
- Administrative cleanup: LibVEX_Translate has an excessive
number of arguments. Remove them all and instead add a struct
by which the arguments are supplied. Add further comments
about the meaning of some fields.
Modified:
trunk/priv/host-x86/hdefs.c
trunk/priv/host-x86/hdefs.h
trunk/priv/main/vex_main.c
trunk/pub/libvex.h
trunk/test_main.c
Modified: trunk/priv/host-x86/hdefs.c
===================================================================
--- trunk/priv/host-x86/hdefs.c 2005-12-15 13:58:07 UTC (rev 1493)
+++ trunk/priv/host-x86/hdefs.c 2005-12-15 14:02:34 UTC (rev 1494)
@@ -942,14 +942,16 @@
vex_printf("if (%%eflags.%s) { ",
showX86CondCode(i->Xin.Goto.cond));
}
- if (i->Xin.Goto.jk != Ijk_Boring) {
+ if (i->Xin.Goto.jk != Ijk_Boring
+ && i->Xin.Goto.jk != Ijk_Call
+ && i->Xin.Goto.jk != Ijk_Ret) {
vex_printf("movl $");
ppIRJumpKind(i->Xin.Goto.jk);
vex_printf(",%%ebp ; ");
}
vex_printf("movl ");
ppX86RI(i->Xin.Goto.dst);
- vex_printf(",%%eax ; ret");
+ vex_printf(",%%eax ; movl $dispatcher_addr,%%edx ; jmp *%%edx");
if (i->Xin.Goto.cond != Xcc_ALWAYS) {
vex_printf(" }");
}
@@ -1216,8 +1218,13 @@
return;
case Xin_Goto:
addRegUsage_X86RI(u, i->Xin.Goto.dst);
- addHRegUse(u, HRmWrite, hregX86_EAX());
- if (i->Xin.Goto.jk != Ijk_Boring)
+ addHRegUse(u, HRmWrite, hregX86_EAX()); /* used for next guest addr */
+ addHRegUse(u, HRmWrite, hregX86_EDX()); /* used for dispatcher addr */
+ if (i->Xin.Goto.jk != Ijk_Boring
+ && i->Xin.Goto.jk != Ijk_Call
+ && i->Xin.Goto.jk != Ijk_Ret)
+ /* note, this is irrelevant since ebp is not actually
+ available to the allocator. But still .. */
addHRegUse(u, HRmWrite, hregX86_EBP());
return;
case Xin_CMov32:
@@ -1832,7 +1839,8 @@
Note that buf is not the insn's final place, and therefore it is
imperative to emit position-independent code. */

-Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr* i, Bool mode64 )
+Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr* i,
+ Bool mode64, void* dispatch )
{
UInt irno, opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;

@@ -2185,21 +2193,31 @@

/* Get the destination address into %eax */
if (i->Xin.Goto.dst->tag == Xri_Imm) {
- /* movl $immediate, %eax ; ret */
+ /* movl $immediate, %eax */
*p++ = 0xB8;
p = emit32(p, i->Xin.Goto.dst->Xri.Imm.imm32);
} else {
vassert(i->Xin.Goto.dst->tag == Xri_Reg);
- /* movl %reg, %eax ; ret */
+ /* movl %reg, %eax */
if (i->Xin.Goto.dst->Xri.Reg.reg != hregX86_EAX()) {
*p++ = 0x89;
p = doAMode_R(p, i->Xin.Goto.dst->Xri.Reg.reg, hregX86_EAX());
}
}

- /* ret */
- *p++ = 0xC3;
+ /* Get the dispatcher address into %edx. This has to happen
+ after the load of %eax since %edx might be carrying the value
+ destined for %eax immediately prior to this Xin_Goto. */
+ vassert(sizeof(UInt) == sizeof(void*));
+ vassert(dispatch != NULL);
+ /* movl $imm32, %edx */
+ *p++ = 0xBA;
+ p = emit32(p, (UInt)dispatch);

+ /* jmp *%edx */
+ *p++ = 0xFF;
+ *p++ = 0xE2;
+
/* Fix up the conditional jump, if there was one. */
if (i->Xin.Goto.cond != Xcc_ALWAYS) {
Int delta = p - ptmp;
Modified: trunk/priv/host-x86/hdefs.h
===================================================================
--- trunk/priv/host-x86/hdefs.h 2005-12-15 13:58:07 UTC (rev 1493)
+++ trunk/priv/host-x86/hdefs.h 2005-12-15 14:02:34 UTC (rev 1494)
@@ -660,7 +660,8 @@
extern void getRegUsage_X86Instr ( HRegUsage*, X86Instr*, Bool );
extern void mapRegs_X86Instr ( HRegRemap*, X86Instr*, Bool );
extern Bool isMove_X86Instr ( X86Instr*, HReg*, HReg* );
-extern Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr*, Bool );
+extern Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr*,
+ Bool, void* dispatch );
extern X86Instr* genSpill_X86 ( HReg rreg, Int offset, Bool );
extern X86Instr* genReload_X86 ( HReg rreg, Int offset, Bool );
extern void getAllocableRegs_X86 ( Int*, HReg** );
Modified: trunk/priv/main/vex_main.c
===================================================================
--- trunk/priv/main/vex_main.c 2005-12-15 13:58:07 UTC (rev 1493)
+++ trunk/priv/main/vex_main.c 2005-12-15 14:02:34 UTC (rev 1494)
@@ -173,43 +173,7 @@

/* Exported to library client. */

-VexTranslateResult LibVEX_Translate (
- /* The instruction sets we are translating from and to. */
- VexArch arch_guest,
- VexArchInfo* archinfo_guest,
- VexArch arch_host,
- VexArchInfo* archinfo_host,
- /* IN: the block to translate, and its guest address. */
- /* where are the actual bytes in the host's address space? */
- UChar* guest_bytes,
- /* where do the bytes came from in the guest's aspace? */
- Addr64 guest_bytes_addr,
- /* what guest entry point address do they correspond to? */
- Addr64 guest_bytes_addr_noredir,
- /* Is it OK to chase into this guest address? */
- Bool (*chase_into_ok) ( Addr64 ),
- /* OUT: which bits of guest code actually got translated */
- VexGuestExtents* guest_extents,
- /* IN: a place to put the resulting code, and its size */
- UChar* host_bytes,
- Int host_bytes_size,
- /* OUT: how much of the output area is used. */
- Int* host_bytes_used,
- /* IN: optionally, two instrumentation functions. */
- IRBB* (*instrument1) ( IRBB*, VexGuestLayout*,
- Addr64, VexGuestExtents*,
- IRType gWordTy, IRType hWordTy ),
- IRBB* (*instrument2) ( IRBB*, VexGuestLayout*,
- Addr64, VexGuestExtents*,
- IRType gWordTy, IRType hWordTy ),
- Bool cleanup_after_instrumentation,
- /* IN: should this translation be self-checking? */
- Bool do_self_check,
- /* IN: optionally, an access check function for guest code. */
- Bool (*byte_accessible) ( Addr64 ),
- /* IN: debug: trace vex activity at various points */
- Int traceflags
-)
+VexTranslateResult LibVEX_Translate ( VexTranslateArgs* vta )
{
/* This the bundle of functions we need to do the back-end stuff
(insn selection, reg-alloc, assembly) whilst being insulated
@@ -224,7 +188,7 @@
void (*ppInstr) ( HInstr*, Bool );
void (*ppReg) ( HReg );
HInstrArray* (*iselBB) ( IRBB*, VexArchInfo* );
- Int (*emit) ( UChar*, Int, HInstr*, Bool );
+ Int (*emit) ( UChar*, Int, HInstr*, Bool, void* );
IRExpr* (*specHelper) ( HChar*, IRExpr** );
Bool (*preciseMemExnsFn) ( Int, Int );

@@ -263,7 +227,7 @@
offB_TILEN = 0;
mode64 = False;

- vex_traceflags = traceflags;
+ vex_traceflags = vta->traceflags;

vassert(vex_initdone);
vexSetAllocModeTEMP_and_clear();
@@ -272,7 +236,7 @@
/* First off, check that the guest and host insn sets
are supported. */

- switch (arch_host) {
+ switch (vta->arch_host) {

case VexArchX86:
mode64 = False;
@@ -286,12 +250,13 @@
ppInstr = (void(*)(HInstr*, Bool)) ppX86Instr;
ppReg = (void(*)(HReg)) ppHRegX86;
iselBB = iselBB_X86;
- emit = (Int(*)(UChar*,Int,HInstr*, Bool)) emit_X86Instr;
+ emit = emit_X86Instr;
host_is_bigendian = False;
host_word_type = Ity_I32;
- vassert(archinfo_host->subarch == VexSubArchX86_sse0
- || archinfo_host->subarch == VexSubArchX86_sse1
- || archinfo_host->subarch == VexSubArchX86_sse2);
+ vassert(vta->archinfo_host.subarch == VexSubArchX86_sse0
+ || vta->archinfo_host.subarch == VexSubArchX86_sse1
+ || vta->archinfo_host.subarch == VexSubArchX86_sse2);
+ vassert(vta->dispatch != NULL); /* jump-to-dispatcher scheme */
break;

case VexArchAMD64:
@@ -309,7 +274,8 @@
emit = (Int(*)(UChar*,Int,HInstr*, Bool)) emit_AMD64Instr;
host_is_bigendian = False;
host_word_type = Ity_I64;
- vassert(archinfo_host->subarch == VexSubArch_NONE);
+ vassert(vta->archinfo_host.subarch == VexSubArch_NONE);
+ vassert(vta->dispatch != NULL); /* jump-to-dispatcher scheme */
break;

case VexArchPPC32:
@@ -327,9 +293,10 @@
emit = (Int(*)(UChar*,Int,HInstr*,Bool)) emit_PPC32Instr;
host_is_bigendian = True;
host_word_type = Ity_I32;
- vassert(archinfo_guest->subarch == VexSubArchPPC32_I
- || archinfo_guest->subarch == VexSubArchPPC32_FI
- || archinfo_guest->subarch == VexSubArchPPC32_VFI);
+ vassert(vta->archinfo_guest.subarch == VexSubArchPPC32_I
+ || vta->archinfo_guest.subarch == VexSubArchPPC32_FI
+ || vta->archinfo_guest.subarch == VexSubArchPPC32_VFI);
+ vassert(vta->dispatch == NULL); /* return-to-dispatcher scheme */
break;

case VexArchPPC64:
@@ -347,8 +314,9 @@
emit = (Int(*)(UChar*,Int,HInstr*, Bool)) emit_PPC32Instr;
host_is_bigendian = True;
host_word_type = Ity_I64;
- vassert(archinfo_guest->subarch == VexSubArchPPC64_FI
- || archinfo_guest->subarch == VexSubArchPPC64_VFI);
+ vassert(vta->archinfo_guest.subarch == VexSubArchPPC64_FI
+ || vta->archinfo_guest.subarch == VexSubArchPPC64_VFI);
+ vassert(vta->dispatch == NULL); /* return-to-dispatcher scheme */
break;

default:
@@ -356,7 +324,7 @@
}


- switch (arch_guest) {
+ switch (vta->arch_guest) {

case VexArchX86:
preciseMemExnsFn = guest_x86_state_requires_precise_mem_exns;
@@ -367,9 +335,9 @@
guest_layout = &x86guest_layout;
offB_TISTART = offsetof(VexGuestX86State,guest_TISTART);
offB_TILEN = offsetof(VexGuestX86State,guest_TILEN);
- vassert(archinfo_guest->subarch == VexSubArchX86_sse0
- || archinfo_guest->subarch == VexSubArchX86_sse1
- || archinfo_guest->subarch == VexSubArchX86_sse2);
+ vassert(vta->archinfo_guest.subarch == VexSubArchX86_sse0
+ || vta->archinfo_guest.subarch == VexSubArchX86_sse1
+ || vta->archinfo_guest.subarch == VexSubArchX86_sse2);
vassert(0 == sizeof(VexGuestX86State) % 8);
vassert(sizeof( ((VexGuestX86State*)0)->guest_TISTART ) == 4);
vassert(sizeof( ((VexGuestX86State*)0)->guest_TILEN ) == 4);
@@ -384,7 +352,7 @@
guest_layout = &amd64guest_layout;
offB_TISTART = offsetof(VexGuestAMD64State,guest_TISTART);
offB_TILEN = offsetof(VexGuestAMD64State,guest_TILEN);
- vassert(archinfo_guest->subarch == VexSubArch_NONE);
+ vassert(vta->archinfo_guest.subarch == VexSubArch_NONE);
vassert(0 == sizeof(VexGuestAMD64State) % 8);
vassert(sizeof( ((VexGuestAMD64State*)0)->guest_TISTART ) == 8);
vassert(sizeof( ((VexGuestAMD64State*)0)->guest_TILEN ) == 8);
@@ -399,7 +367,7 @@
guest_layout = &armGuest_layout;
offB_TISTART = 0; /* hack ... arm has bitrot */
offB_TILEN = 0; /* hack ... arm has bitrot */
- vassert(archinfo_guest->subarch == VexSubArchARM_v4);
+ vassert(vta->archinfo_guest.subarch == VexSubArchARM_v4);
break;

case VexArchPPC32:
@@ -411,9 +379,9 @@
guest_layout = &ppc32Guest_layout;
offB_TISTART = offsetof(VexGuestPPC32State,guest_TISTART);
offB_TILEN = offsetof(VexGuestPPC32State,guest_TILEN);
- vassert(archinfo_guest->subarch == VexSubArchPPC32_I
- || archinfo_guest->subarch == VexSubArchPPC32_FI
- || archinfo_guest->subarch == VexSubArchPPC32_VFI);
+ vassert(vta->archinfo_guest.subarch == VexSubArchPPC32_I
+ || vta->archinfo_guest.subarch == VexSubArchPPC32_FI
+ || vta->archinfo_guest.subarch == VexSubArchPPC32_VFI);
vassert(0 == sizeof(VexGuestPPC32State) % 8);
vassert(sizeof( ((VexGuestPPC32State*)0)->guest_TISTART ) == 4);
vassert(sizeof( ((VexGuestPPC32State*)0)->guest_TILEN ) == 4);
@@ -428,8 +396,8 @@
guest_layout = &ppc64Guest_layout;
offB_TISTART = offsetof(VexGuestPPC64State,guest_TISTART);
offB_TILEN = offsetof(VexGuestPPC64State,guest_TILEN);
- vassert(archinfo_guest->subarch == VexSubArchPPC64_FI
- || archinfo_guest->subarch == VexSubArchPPC64_VFI);
+ vassert(vta->archinfo_guest.subarch == VexSubArchPPC64_FI
+ || vta->archinfo_guest.subarch == VexSubArchPPC64_VFI);
vassert(0 == sizeof(VexGuestPPC64State) % 16);
vassert(sizeof( ((VexGuestPPC64State*)0)->guest_TISTART ) == 8);
vassert(sizeof( ((VexGuestPPC64State*)0)->guest_TILEN ) == 8);
@@ -440,11 +408,11 @@
}

/* yet more sanity checks ... */
- if (arch_guest == arch_host) {
+ if (vta->arch_guest == vta->arch_host) {
/* doesn't necessarily have to be true, but if it isn't it means
we are simulating one flavour of an architecture a different
flavour of the same architecture, which is pretty strange. */
- vassert(archinfo_guest->subarch == archinfo_host->subarch);
+ vassert(vta->archinfo_guest.subarch == vta->archinfo_host.subarch);
}

vexAllocSanityCheck();
@@ -454,15 +422,15 @@
" Front end "
"------------------------\n\n");

- irbb = bb_to_IR ( guest_extents,
+ irbb = bb_to_IR ( vta->guest_extents,
disInstrFn,
- guest_bytes,
- guest_bytes_addr,
- chase_into_ok,
+ vta->guest_bytes,
+ vta->guest_bytes_addr,
+ vta->chase_into_ok,
host_is_bigendian,
- archinfo_guest,
+ &vta->archinfo_guest,
guest_word_type,
- do_self_check,
+ vta->do_self_check,
offB_TISTART,
offB_TILEN );

@@ -475,21 +443,21 @@
return VexTransAccessFail;
}

- vassert(guest_extents->n_used >= 1 && guest_extents->n_used <= 3);
- vassert(guest_extents->base[0] == guest_bytes_addr);
- for (i = 0; i < guest_extents->n_used; i++) {
- vassert(guest_extents->len[i] < 10000); /* sanity */
+ vassert(vta->guest_extents->n_used >= 1 && vta->guest_extents->n_used <= 3);
+ vassert(vta->guest_extents->base[0] == vta->guest_bytes_addr);
+ for (i = 0; i < vta->guest_extents->n_used; i++) {
+ vassert(vta->guest_extents->len[i] < 10000); /* sanity */
}

/* If debugging, show the raw guest bytes for this bb. */
if (0 || (vex_traceflags & VEX_TRACE_FE)) {
- if (guest_extents->n_used > 1) {
+ if (vta->guest_extents->n_used > 1) {
vex_printf("can't show code due to extents > 1\n");
} else {
/* HACK */
- UChar* p = (UChar*)guest_bytes;
- UInt guest_bytes_read = (UInt)guest_extents->len[0];
- vex_printf(". 0 %llx %u\n.", guest_bytes_addr, guest_bytes_read);
+ UChar* p = (UChar*)vta->guest_bytes;
+ UInt guest_bytes_read = (UInt)vta->guest_extents->len[0];
+ vex_printf(". 0 %llx %u\n.", vta->guest_bytes_addr, guest_bytes_read );
for (i = 0; i < guest_bytes_read; i++)
vex_printf(" %02x", (Int)p[i] );
vex_printf("\n\n");
@@ -504,7 +472,7 @@

/* Clean it up, hopefully a lot. */
irbb = do_iropt_BB ( irbb, specHelper, preciseMemExnsFn,
- guest_bytes_addr );
+ vta->guest_bytes_addr );
sanityCheckIRBB( irbb, "after initial iropt",
True/*must be flat*/, guest_word_type );

@@ -519,16 +487,18 @@
vexAllocSanityCheck();

/* Get the thing instrumented. */
- if (instrument1)
- irbb = (*instrument1)(irbb, guest_layout,
- guest_bytes_addr_noredir, guest_extents,
- guest_word_type, host_word_type);
+ if (vta->instrument1)
+ irbb = vta->instrument1(irbb, guest_layout,
+ vta->guest_bytes_addr_noredir,
+ vta->guest_extents,
+ guest_word_type, host_word_type);
vexAllocSanityCheck();

- if (instrument2)
- irbb = (*instrument2)(irbb, guest_layout,
- guest_bytes_addr_noredir, guest_extents,
- guest_word_type, host_word_type);
+ if (vta->instrument2)
+ irbb = vta->instrument2(irbb, guest_layout,
+ vta->guest_bytes_addr_noredir,
+ vta->guest_extents,
+ guest_word_type, host_word_type);

if (vex_traceflags & VEX_TRACE_INST) {
vex_printf("\n------------------------"
@@ -538,12 +508,12 @@
vex_printf("\n");
}

- if (instrument1 || instrument2)
+ if (vta->instrument1 || vta->instrument2)
sanityCheckIRBB( irbb, "after instrumentation",
True/*must be flat*/, guest_word_type );

/* Do a post-instrumentation cleanup pass. */
- if (cleanup_after_instrumentation) {
+ if (vta->instrument1 || vta->instrument2) {
do_deadcode_BB( irbb );
irbb = cprop_BB( irbb );
do_deadcode_BB( irbb );
@@ -576,7 +546,7 @@
}

/* HACK */
- if (0) { *host_bytes_used = 0; return VexTransOK; }
+ if (0) { *(vta->host_bytes_used) = 0; return VexTransOK; }
/* end HACK */

if (vex_traceflags & VEX_TRACE_VCODE)
@@ -584,7 +554,7 @@
" Instruction selection "
"------------------------\n");

- vcode = iselBB ( irbb, archinfo_host );
+ vcode = iselBB ( irbb, &vta->archinfo_host );

vexAllocSanityCheck();

@@ -622,7 +592,7 @@
}

/* HACK */
- if (0) { *host_bytes_used = 0; return VexTransOK; }
+ if (0) { *(vta->host_bytes_used) = 0; return VexTransOK; }
/* end HACK */

/* Assemble */
@@ -638,7 +608,7 @@
ppInstr(rcode->arr[i], mode64);
vex_printf("\n");
}
- j = (*emit)( insn_bytes, 32, rcode->arr[i], mode64 );
+ j = (*emit)( insn_bytes, 32, rcode->arr[i], mode64, vta->dispatch );
if (vex_traceflags & VEX_TRACE_ASM) {
for (k = 0; k < j; k++)
if (insn_bytes[k] < 16)
@@ -647,18 +617,18 @@
vex_printf("%x ", (UInt)insn_bytes[k]);
vex_printf("\n\n");
}
- if (out_used + j > host_bytes_size) {
+ if (out_used + j > vta->host_bytes_size) {
vexSetAllocModeTEMP_and_clear();
vex_traceflags = 0;
return VexTransOutputFull;
}
for (k = 0; k < j; k++) {
- host_bytes[out_used] = insn_bytes[k];
+ vta->host_bytes[out_used] = insn_bytes[k];
out_used++;
}
- vassert(out_used <= host_bytes_size);
+ vassert(out_used <= vta->host_bytes_size);
}
- *host_bytes_used = out_used;
+ *(vta->host_bytes_used) = out_used;

vexAllocSanityCheck();

Modified: trunk/pub/libvex.h
===================================================================
--- trunk/pub/libvex.h 2005-12-15 13:58:07 UTC (rev 1493)
+++ trunk/pub/libvex.h 2005-12-15 14:02:34 UTC (rev 1494)
@@ -300,45 +300,89 @@
VexGuestExtents;


+/* A structure to carry arguments for LibVEX_Translate. There are so
+ many of them, it seems better to have a structure. */
+typedef
+ struct {
+ /* IN: The instruction sets we are translating from and to. */
+ VexArch arch_guest;
+ VexArchInfo archinfo_guest;
+ VexArch arch_host;
+ VexArchInfo archinfo_host;
+
+ /* IN: the block to translate, and its guest address. */
+ /* where are the actual bytes in the host's address space? */
+ UChar* guest_bytes;
+ /* where do the bytes really come from in the guest's aspace?
+ This is the post-redirection guest address. */
+ Addr64 guest_bytes_addr;
+ /* where do the bytes claim to come from in the guest address
+ space? (what guest entry point address do they correspond
+ to?) This is the pre-redirection guest address. */
+ Addr64 guest_bytes_addr_noredir;
+
+ /* Is it OK to chase into this guest address? May not be
+ NULL. */
+ Bool (*chase_into_ok) ( Addr64 );
+
+ /* OUT: which bits of guest code actually got translated */
+ VexGuestExtents* guest_extents;
+
+ /* IN: a place to put the resulting code, and its size */
+ UChar* host_bytes;
+ Int host_bytes_size;
+ /* OUT: how much of the output area is used. */
+ Int* host_bytes_used;
+
+ /* IN: optionally, two instrumentation functions. May be
+ NULL. */
+ IRBB* (*instrument1) ( IRBB*, VexGuestLayout*,
+ Addr64, VexGuestExtents*,
+ IRType gWordTy, IRType hWordTy );
+ IRBB* (*instrument2) ( IRBB*, VexGuestLayout*,
+ Addr64, VexGuestExtents*,
+ IRType gWordTy, IRType hWordTy );
+
+ /* IN: should this translation be self-checking? default: False */
+ Bool do_self_check;
+ /* IN: debug: trace vex activity at various points */
+ Int traceflags;
+
+ /* IN: address of the dispatcher entry point. Describes the
+ place where generated code should jump to at the end of each
+ bb.
+
+ At the end of each translation, the next guest address is
+ placed in the host's standard return register (x86: %eax,
+ amd64: %rax, ppc32: %r3, ppc64: %r3). Optionally, the guest
+ state pointer register (on host x86: %ebp; amd64: %rbp;
+ ppc32/64: r31) may be set to a VEX_TRC_ value to indicate any
+ special action required before the next block is run.
+
+ Control is then passed back to the dispatcher (beyond Vex's
+ control; caller supplies this) in the following way:
+
+ - On host archs which lack a link register (x86, amd64), by a
+ jump to the host address specified in 'dispatcher', which
+ must be non-NULL.
+
+ - On host archs which have a link register (ppc32, ppc64), by
+ a branch to the link register (which is guaranteed to be
+ unchanged from whatever it was at entry to the
+ translation). 'dispatch' must be NULL.
+
+ The aim is to get back and forth between translations and the
+ dispatcher without creating memory traffic to store return
+ addresses.
+ */
+ void* dispatch;
+ }
+ VexTranslateArgs;
+
+
extern
-VexTranslateResult LibVEX_Translate (
+VexTranslateResult LibVEX_Translate ( VexTranslateArgs* );

- /* The instruction sets we are translating from and to. */
- VexArch arch_guest,
- VexArchInfo* archinfo_guest,
- VexArch arch_host,
- VexArchInfo* archinfo_host,
- /* IN: the block to translate, and its guest address. */
- /* where are the actual bytes in the host's address space? */
- UChar* guest_bytes,
- /* where do the bytes came from in the guest's aspace? */
- Addr64 guest_bytes_addr,
- /* what guest entry point address do they correspond to? */
- Addr64 guest_bytes_addr_noredir,
- /* Is it OK to chase into this guest address? */
- Bool (*chase_into_ok) ( Addr64 ),
- /* OUT: which bits of guest code actually got translated */
- VexGuestExtents* guest_extents,
- /* IN: a place to put the resulting code, and its size */
- UChar* host_bytes,
- Int host_bytes_size,
- /* OUT: how much of the output area is used. */
- Int* host_bytes_used,
- /* IN: optionally, two instrumentation functions. */
- IRBB* (*instrument1) ( IRBB*, VexGuestLayout*,
- Addr64, VexGuestExtents*,
- IRType gWordTy, IRType hWordTy ),
- IRBB* (*instrument2) ( IRBB*, VexGuestLayout*,
- Addr64, VexGuestExtents*,
- IRType gWordTy, IRType hWordTy ),
- Bool cleanup_after_instrumentation,
- /* IN: should this translation be self-checking? */
- Bool do_self_check,
- /* IN: optionally, an access check function for guest code. */
- Bool (*byte_accessible) ( Addr64 ),
- /* IN: debug: trace vex activity at various points */
- Int traceflags
-);

/* A subtlety re interaction between self-checking translations and
bb-chasing. The supplied chase_into_ok function should say NO
@@ -369,7 +413,7 @@

x86
~~~
- Generated code should be entered using a CALL instruction. On
+ Generated code should be entered using a JMP instruction. On
entry, %ebp should point to the guest state, and %esp should be a
valid stack pointer. The generated code may change %eax, %ebx,
%ecx, %edx, %esi, %edi, all the FP registers and control state, and
@@ -380,9 +424,11 @@
should still have those values (after masking off the lowest 6 bits
of %mxcsr). If they don't, there is a bug in VEX-generated code.
=20
- Generated code returns to the scheduler using a RET instruction.
+ Generated code returns to the scheduler using a JMP instruction, to
+ the address specified in the .dispatch field of VexTranslateArgs.
%eax (or %eax:%edx, if simulating a 64-bit target) will contain the
- guest address of the next block to execute.
+ guest address of the next block to execute. %ebp may be changed
+ to a VEX_TRC_ value, otherwise it should be as it was at entry.
CRITICAL ISSUES in x86 code generation. The only known critical
issue is that the host FPU and SSE state is not properly saved
@@ -392,6 +438,22 @@
generated code, the generated code is likely to go wrong. This
really should be fixed.
=20
+ amd64
+ ~~~~~
+ Analogous to x86.
+
+ ppc32
+ ~~~~~
+ On entry, guest state pointer is r31. .dispatch must be NULL.
+ Control is returned with a branch to the link register. Generated
+ code will not change lr. At return, r3 holds the next guest addr
+ (or r3:r4 ?). r31 may be changed to a VEX_TRC_ value,
+ otherwise it should be as it was at entry.
+
+ ppc64
+ ~~~~~
+ Probably the same as ppc32.
+
ALL GUEST ARCHITECTURES
~~~~~~~~~~~~~~~~~~~~~~~
The architecture must contain two pseudo-registers, guest_TISTART
Modified: trunk/test_main.c
===================================================================
--- trunk/test_main.c 2005-12-15 13:58:07 UTC (rev 1493)
+++ trunk/test_main.c 2005-12-15 14:02:34 UTC (rev 1494)
@@ -67,6 +67,7 @@
VexControl vcon;
VexGuestExtents vge;
VexArchInfo vai_x86, vai_amd64, vai_ppc32;
+ VexTranslateArgs vta;

if (argc != 2) {
fprintf(stderr, "usage: vex file.org\n");
@@ -133,45 +134,56 @@
vai_ppc32.subarch = VexSubArchPPC32_VFI;
vai_ppc32.ppc32_cache_line_szB = 128;

- for (i = 0; i < TEST_N_ITERS; i++)
- tres
- = LibVEX_Translate (
+ /* ----- Set up args for LibVEX_Translate ----- */
#if 1 /* ppc32 -> ppc32 */
- VexArchPPC32, &vai_ppc32,
- VexArchPPC32, &vai_ppc32,
+ vta.arch_guest = VexArchPPC32;
+ vta.archinfo_guest = vai_ppc32;
+ vta.arch_host = VexArchPPC32;
+ vta.archinfo_host = vai_ppc32;
#endif
#if 0 /* amd64 -> amd64 */
- VexArchAMD64, &vai_amd64,
- VexArchAMD64, &vai_amd64,
+ vta.arch_guest = VexArchAMD64;
+ vta.archinfo_guest = vai_amd64;
+ vta.arch_host = VexArchAMD64;
+ vta.archinfo_host = vai_amd64;
#endif
#if 0 /* x86 -> x86 */
- VexArchX86, &vai_x86,
- VexArchX86, &vai_x86,
+ vta.arch_guest = VexArchX86;
+ vta.archinfo_guest = vai_x86;
+ vta.arch_host = VexArchX86;
+ vta.archinfo_host = vai_x86;
#endif
- origbuf, (Addr64)orig_addr, (Addr64)orig_addr,
- chase_into_not_ok,
- &vge,
- transbuf, N_TRANSBUF, &trans_used,
+ vta.guest_bytes = origbuf;
+ vta.guest_bytes_addr = (Addr64)orig_addr;
+ vta.guest_bytes_addr_noredir = (Addr64)orig_addr;
+ vta.chase_into_ok = chase_into_not_ok;
+ vta.guest_extents = &vge;
+ vta.host_bytes = transbuf;
+ vta.host_bytes_size = N_TRANSBUF;
+ vta.host_bytes_used = &trans_used;
#if 1 /* no instrumentation */
- NULL, /* instrument1 */
- NULL, /* instrument2 */
- False, /* cleanup after instrument */
+ vta.instrument1 = NULL;
+ vta.instrument2 = NULL;
#endif
#if 0 /* addrcheck */
- ac_instrument, /* instrument1 */
- NULL, /* instrument2 */
- False, /* cleanup after instrument */
+ vta.instrument1 = ac_instrument;
+ vta.instrument2 = NULL;
#endif
#if 0 /* memcheck */
- mc_instrument, /* instrument1 */
- NULL, /* instrument2 */
- True, /* cleanup after instrument */
+ vta.instrument1 = mc_instrument;
+ vta.instrument2 = NULL;
#endif
- False, /* do_self_check ? */
- NULL, /* access checker */
- TEST_FLAGS
- );
+ vta.do_self_check = False;
+ vta.traceflags = TEST_FLAGS;
+#if 1 /* x86, amd64 hosts */
+ vta.dispatch = (void*)0x12345678;
+#else /* ppc32, ppc64 hosts */
+ vta.dispatch = NULL;
+#endif

+ for (i = 0; i < TEST_N_ITERS; i++)
+ tres = LibVEX_Translate ( &vta );
+
if (tres != VexTransOK)
printf("\ntres = %d\n", (Int)tres);
assert(tres == VexTransOK);
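
The second half of the log message is a routine refactor worth noting: a long positional argument list becomes a single args struct, so call sites (as in test_main.c above) name what they set, and new fields such as .dispatch can be added without touching every caller. A generic hedged sketch of the shape, deliberately not the VEX types:

   typedef struct {
      int                  arch;       /* IN */
      const unsigned char* in;         /* IN: code to translate */
      int*                 out_used;   /* OUT */
      void*                dispatch;   /* IN: NULL on link-register hosts */
   } Args;

   extern int translate(Args* a);      /* was: translate(arch, in, ...) */

   static int run_once(const unsigned char* code)
   {
      int used = 0;
      Args a;
      a.arch     = 0;                  /* placeholder arch id */
      a.in       = code;
      a.out_used = &used;
      a.dispatch = 0;
      return translate(&a);
   }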
|
|
From: <sv...@va...> - 2005-12-15 13:58:20
|
Author: sewardj
Date: 2005-12-15 13:58:07 +0000 (Thu, 15 Dec 2005)
New Revision: 1493
Log:
Stop gcc complaining.
Modified:
trunk/priv/host-ppc32/hdefs.c
Modified: trunk/priv/host-ppc32/hdefs.c
===================================================================
--- trunk/priv/host-ppc32/hdefs.c 2005-12-14 22:00:53 UTC (rev 1492)
+++ trunk/priv/host-ppc32/hdefs.c 2005-12-15 13:58:07 UTC (rev 1493)
@@ -1296,7 +1296,7 @@
Bool idxd =3D toBool(i->Pin.Load.src->tag =3D=3D Pam_RR);
UChar sz =3D i->Pin.Load.sz;
UChar c_sz =3D sz=3D=3D1 ? 'b' : sz=3D=3D2 ? 'h' : sz=3D=3D4 ? 'w'=
: 'd';
- UChar* s_syned =3D i->Pin.Load.syned ? "a" : sz=3D=3D8 ? "" : "z";
+ HChar* s_syned =3D i->Pin.Load.syned ? "a" : sz=3D=3D8 ? "" : "z";
vex_printf("l%c%s%s ", c_sz, s_syned, idxd ? "x" : "" );
ppHRegPPC32(i->Pin.Load.dst);
vex_printf(",");
|
|
From: <js...@ac...> - 2005-12-15 06:50:47
|
Nightly build on phoenix ( SuSE 10.0 ) started at 2005-12-15 03:30:01 GMT
Checking out vex source tree ... done
Building vex ... done
Checking out valgrind source tree ... done
Configuring valgrind ... done
Building valgrind ... done
Running regression tests ... failed
Regression test results follow
== 208 tests, 6 stderr failures, 1 stdout failure =================
memcheck/tests/leak-tree (stderr)
memcheck/tests/mempool (stderr)
memcheck/tests/stack_switch (stderr)
memcheck/tests/x86/scalar (stderr)
none/tests/mremap2 (stdout)
none/tests/x86/faultstatus (stderr)
none/tests/x86/int (stderr)
|
From: <js...@ac...> - 2005-12-15 05:59:00
|
Nightly build on g5 ( YDL 4.0, ppc970 ) started at 2005-12-15 04:40:00 CET
Checking out vex source tree ... done
Building vex ... done
Checking out valgrind source tree ... done
Configuring valgrind ... done
Building valgrind ... done
Running regression tests ... failed
Regression test results follow
== 176 tests, 15 stderr failures, 0 stdout failures =================
memcheck/tests/badjump (stderr)
memcheck/tests/badjump2 (stderr)
memcheck/tests/leak-cycle (stderr)
memcheck/tests/leak-tree (stderr)
memcheck/tests/mempool (stderr)
memcheck/tests/partiallydefinedeq (stderr)
memcheck/tests/pointer-trace (stderr)
memcheck/tests/supp1 (stderr)
memcheck/tests/supp_unknown (stderr)
memcheck/tests/toobig-allocs (stderr)
memcheck/tests/xml1 (stderr)
massif/tests/toobig-allocs (stderr)
none/tests/faultstatus (stderr)
none/tests/fdleak_cmsg (stderr)
none/tests/mremap (stderr)
|
From: Tom H. <to...@co...> - 2005-12-15 03:43:03
|
Nightly build on dunsmere ( athlon, Fedora Core 4 ) started at 2005-12-15 03:30:06 GMT
Results unchanged from 24 hours ago
Checking out valgrind source tree ... done
Configuring valgrind ... done
Building valgrind ... done
Running regression tests ... failed
Regression test results follow
== 210 tests, 7 stderr failures, 1 stdout failure =================
memcheck/tests/leak-tree (stderr)
memcheck/tests/mempool (stderr)
memcheck/tests/pointer-trace (stderr)
memcheck/tests/stack_switch (stderr)
memcheck/tests/x86/scalar (stderr)
none/tests/mremap2 (stdout)
none/tests/x86/faultstatus (stderr)
none/tests/x86/int (stderr)
|
From: Tom H. <th...@cy...> - 2005-12-15 03:29:35
|
Nightly build on alvis ( i686, Red Hat 7.3 ) started at 2005-12-15 03:15:05 GMT
Results unchanged from 24 hours ago
Checking out valgrind source tree ... done
Configuring valgrind ... done
Building valgrind ... done
Running regression tests ... failed
Regression test results follow
== 209 tests, 17 stderr failures, 1 stdout failure =================
memcheck/tests/addressable (stderr)
memcheck/tests/describe-block (stderr)
memcheck/tests/erringfds (stderr)
memcheck/tests/leak-0 (stderr)
memcheck/tests/leak-cycle (stderr)
memcheck/tests/leak-regroot (stderr)
memcheck/tests/leak-tree (stderr)
memcheck/tests/leakotron (stdout)
memcheck/tests/match-overrun (stderr)
memcheck/tests/mempool (stderr)
memcheck/tests/partial_load_dflt (stderr)
memcheck/tests/partial_load_ok (stderr)
memcheck/tests/partiallydefinedeq (stderr)
memcheck/tests/pointer-trace (stderr)
memcheck/tests/sigkill (stderr)
memcheck/tests/stack_changes (stderr)
none/tests/x86/faultstatus (stderr)
none/tests/x86/int (stderr)
|
From: Tom H. <th...@cy...> - 2005-12-15 03:27:20
|
Nightly build on dellow ( x86_64, Fedora Core 4 ) started at 2005-12-15 03:10:11 GMT
Results unchanged from 24 hours ago
Checking out valgrind source tree ... done
Configuring valgrind ... done
Building valgrind ... done
Running regression tests ... failed
Regression test results follow
== 227 tests, 5 stderr failures, 1 stdout failure =================
memcheck/tests/mempool (stderr)
memcheck/tests/x86/scalar (stderr)
none/tests/amd64/faultstatus (stderr)
none/tests/mremap2 (stdout)
none/tests/x86/faultstatus (stderr)
none/tests/x86/int (stderr)
|
From: Tom H. <th...@cy...> - 2005-12-15 03:26:41
|
Nightly build on aston ( x86_64, Fedora Core 3 ) started at 2005-12-15 03:05:16 GMT
Results differ from 24 hours ago
Checking out valgrind source tree ... done
Configuring valgrind ... done
Building valgrind ... done
Running regression tests ... failed
Regression test results follow
== 227 tests, 6 stderr failures, 1 stdout failure =================
memcheck/tests/mempool (stderr)
memcheck/tests/x86/scalar (stderr)
memcheck/tests/x86/scalar_supp (stderr)
none/tests/amd64/faultstatus (stderr)
none/tests/mremap2 (stdout)
none/tests/x86/faultstatus (stderr)
none/tests/x86/int (stderr)
=================================================
== Results from 24 hours ago ==
=================================================
Checking out valgrind source tree ... done
Configuring valgrind ... done
Building valgrind ... done
Running regression tests ... failed
Regression test results follow
== 227 tests, 7 stderr failures, 1 stdout failure =================
memcheck/tests/mempool (stderr)
memcheck/tests/pointer-trace (stderr)
memcheck/tests/x86/scalar (stderr)
memcheck/tests/x86/scalar_supp (stderr)
none/tests/amd64/faultstatus (stderr)
none/tests/mremap2 (stdout)
none/tests/x86/faultstatus (stderr)
none/tests/x86/int (stderr)
=================================================
== Difference between 24 hours ago and now ==
=================================================
*** old.short Thu Dec 15 03:19:10 2005
--- new.short Thu Dec 15 03:26:36 2005
***************
*** 8,12 ****
! == 227 tests, 7 stderr failures, 1 stdout failure =================
  memcheck/tests/mempool (stderr)
- memcheck/tests/pointer-trace (stderr)
  memcheck/tests/x86/scalar (stderr)
--- 8,11 ----
! == 227 tests, 6 stderr failures, 1 stdout failure =================
  memcheck/tests/mempool (stderr)
  memcheck/tests/x86/scalar (stderr)