|
From: <sv...@va...> - 2006-05-03 22:14:13
|
Author: sewardj
Date: 2006-05-03 23:13:57 +0100 (Wed, 03 May 2006)
New Revision: 5880
Log:
Vectorise copy_address_range_perms for common cases. This gives about
40% speedup on artificial programs which just do realloc() and nothing
else, and about a 3-4% speedup on starting kpresenter-1.5.0 and
loading a 16-slide presentation.
Modified:
trunk/docs/internals/performance.txt
trunk/memcheck/mc_main.c
Modified: trunk/docs/internals/performance.txt
===================================================================
--- trunk/docs/internals/performance.txt 2006-05-03 18:09:41 UTC (rev 5879)
+++ trunk/docs/internals/performance.txt 2006-05-03 22:13:57 UTC (rev 5880)
@@ -29,6 +29,10 @@
- Nick changed ExeContext gathering to not record/save extra zeroes at the
   end. Saved 7% on perf/heap with --num-callers=50, and about 1% on
   perf/tinycc.
+- Julian vectorised copy_address_range_perms for common cases, which
+ gives about 40% speedup on artificial programs which just do
+ realloc() and nothing else, and about a 3-4% speedup on starting
+ kpresenter-1.5.0 and loading a 16-slide presentation.
=20
COMPVBITS branch:
- Nick converted to compress V bits, initial version saved 0--5% on most
Modified: trunk/memcheck/mc_main.c
===================================================================
--- trunk/memcheck/mc_main.c 2006-05-03 18:09:41 UTC (rev 5879)
+++ trunk/memcheck/mc_main.c 2006-05-03 22:13:57 UTC (rev 5880)
@@ -589,6 +589,28 @@
return extract_vabits2_from_vabits8(a, vabits8);
}
=20
+// *** WARNING! ***
+// Any time this function is called, if it is possible that any of the
+// 4 2-bit fields in vabits8 are equal to VA_BITS2_PARTDEFINED, then the
+// corresponding entry(s) in the sec-V-bits table must also be set!
+static INLINE
+UChar get_vabits8_for_aligned_word32 ( Addr a )
+{
+ SecMap* sm = get_secmap_for_reading(a);
+ UWord sm_off = SM_OFF(a);
+ UChar vabits8 = sm->vabits8[sm_off];
+ return vabits8;
+}
+
+static INLINE
+void set_vabits8_for_aligned_word32 ( Addr a, UChar vabits8 )
+{
+ SecMap* sm = get_secmap_for_writing(a);
+ UWord sm_off = SM_OFF(a);
+ sm->vabits8[sm_off] = vabits8;
+}
+
+
// Forward declarations
static UWord get_sec_vbits8(Addr a);
static void set_sec_vbits8(Addr a, UWord vbits8);
@@ -1227,35 +1249,81 @@
void MC_(copy_address_range_state) ( Addr src, Addr dst, SizeT len )
{
SizeT i, j;
- UChar vabits2;
+ UChar vabits2, vabits8;
+ Bool aligned, nooverlap;
 
DEBUG("MC_(copy_address_range_state)\n");
PROF_EVENT(50, "MC_(copy_address_range_state)");
 
- if (len == 0)
+ if (len == 0 || src == dst)
return;
 
- if (src < dst) {
- for (i = 0, j = len-1; i < len; i++, j--) {
- PROF_EVENT(51, "MC_(copy_address_range_state)(loop)");
- vabits2 = get_vabits2( src+j );
- set_vabits2( dst+j, vabits2 );
- if (VA_BITS2_PARTDEFINED == vabits2) {
- set_sec_vbits8( dst+j, get_sec_vbits8( src+j ) );
+ aligned = VG_IS_4_ALIGNED(src) && VG_IS_4_ALIGNED(dst);
+ nooverlap = src+len <= dst || dst+len <= src;
+
+ if (nooverlap && aligned) {
+
+ /* Vectorised fast case, when no overlap and suitably aligned */
+ /* vector loop */
+ i = 0;
+ while (len >= 4) {
+ vabits8 = get_vabits8_for_aligned_word32( src+i );
+ set_vabits8_for_aligned_word32( dst+i, vabits8 );
+ if (EXPECTED_TAKEN(VA_BITS8_DEFINED == vabits8
+ || VA_BITS8_UNDEFINED == vabits8
+ || VA_BITS8_NOACCESS == vabits8)) {
+ /* do nothing */
+ } else {
+ /* have to copy secondary map info */
+ if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+0 ))
+ set_sec_vbits8( dst+i+0, get_sec_vbits8( src+i+0 ) );
+ if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+1 ))
+ set_sec_vbits8( dst+i+1, get_sec_vbits8( src+i+1 ) );
+ if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+2 ))
+ set_sec_vbits8( dst+i+2, get_sec_vbits8( src+i+2 ) );
+ if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+3 ))
+ set_sec_vbits8( dst+i+3, get_sec_vbits8( src+i+3 ) );
}
+ i += 4;
+ len -= 4;
}
- }
-
- if (src > dst) {
- for (i = 0; i < len; i++) {
- PROF_EVENT(52, "MC_(copy_address_range_state)(loop)");
+ /* fixup loop */
+ while (len >= 1) {
vabits2 = get_vabits2( src+i );
set_vabits2( dst+i, vabits2 );
if (VA_BITS2_PARTDEFINED == vabits2) {
set_sec_vbits8( dst+i, get_sec_vbits8( src+i ) );
}
+ i++;
+ len--;
}
+
+ } else {
+
+ /* We have to do things the slow way */
+ if (src < dst) {
+ for (i = 0, j = len-1; i < len; i++, j--) {
+ PROF_EVENT(51, "MC_(copy_address_range_state)(loop)");
+ vabits2 = get_vabits2( src+j );
+ set_vabits2( dst+j, vabits2 );
+ if (VA_BITS2_PARTDEFINED == vabits2) {
+ set_sec_vbits8( dst+j, get_sec_vbits8( src+j ) );
+ }
+ }
+ }
+
+ if (src > dst) {
+ for (i = 0; i < len; i++) {
+ PROF_EVENT(52, "MC_(copy_address_range_state)(loop)");
+ vabits2 = get_vabits2( src+i );
+ set_vabits2( dst+i, vabits2 );
+ if (VA_BITS2_PARTDEFINED == vabits2) {
+ set_sec_vbits8( dst+i, get_sec_vbits8( src+i ) );
+ }
+ }
+ }
}
+
}
=20
=20
@@ -4422,6 +4490,3 @@
/*--------------------------------------------------------------------*/
/*--- end ---*/
/*--------------------------------------------------------------------*/
-
-
-
|