|
From: <sv...@va...> - 2015-01-18 16:35:47
|
Author: sewardj
Date: Sun Jan 18 16:35:40 2015
New Revision: 3068
Log:
Move LibVEX_Alloc into the header file so it can be inlined.
Modified:
branches/NCODE/priv/main_util.c
branches/NCODE/pub/libvex.h
Modified: branches/NCODE/priv/main_util.c
==============================================================================
--- branches/NCODE/priv/main_util.c (original)
+++ branches/NCODE/priv/main_util.c Sun Jan 18 16:35:40 2015
@@ -68,8 +68,8 @@
static HChar* permanent_last = &permanent[N_PERMANENT_BYTES-1];
static HChar* private_LibVEX_alloc_first = &temporary[0];
-static HChar* private_LibVEX_alloc_curr = &temporary[0];
-static HChar* private_LibVEX_alloc_last = &temporary[N_TEMPORARY_BYTES-1];
+ HChar* private_LibVEX_alloc_curr = &temporary[0];
+ HChar* private_LibVEX_alloc_last = &temporary[N_TEMPORARY_BYTES-1];
static VexAllocMode mode = VexAllocModeTEMP;
@@ -155,7 +155,7 @@
}
__attribute__((noreturn))
-static void private_LibVEX_alloc_OOM(void)
+void private_LibVEX_alloc_OOM(void)
{
const HChar* pool = "???";
if (private_LibVEX_alloc_first == &temporary[0]) pool = "TEMP";
@@ -203,43 +203,10 @@
translation of the current basic block is complete.
*/
-void* LibVEX_Alloc ( SizeT nbytes )
-{
- struct align {
- char c;
- union {
- char c;
- short s;
- int i;
- long l;
- long long ll;
- float f;
- double d;
- /* long double is currently not used and would increase alignment
- unnecessarily. */
- /* long double ld; */
- void *pto;
- void (*ptf)(void);
- } x;
- };
-
-#if 0
- /* Nasty debugging hack, do not use. */
- return malloc(nbytes);
-#else
- HChar* curr;
- HChar* next;
- SizeT ALIGN;
- ALIGN = offsetof(struct align,x) - 1;
- nbytes = (nbytes + ALIGN) & ~ALIGN;
- curr = private_LibVEX_alloc_curr;
- next = curr + nbytes;
- if (next >= private_LibVEX_alloc_last)
- private_LibVEX_alloc_OOM();
- private_LibVEX_alloc_curr = next;
- return curr;
-#endif
-}
+/* void* LibVEX_Alloc ( SizeT nbytes )
+ is now in libvex.h, so it can be inlined.
+*/
+
void LibVEX_ShowAllocStats ( void )
{
Modified: branches/NCODE/pub/libvex.h
==============================================================================
--- branches/NCODE/pub/libvex.h (original)
+++ branches/NCODE/pub/libvex.h Sun Jan 18 16:35:40 2015
@@ -453,7 +453,51 @@
callback that you have previously specified in a call to
LibVEX_Translate. The storage allocated will only stay alive until
translation of the current basic block is complete. */
-extern void* LibVEX_Alloc ( SizeT nbytes );
+
+extern HChar* private_LibVEX_alloc_curr;
+extern HChar* private_LibVEX_alloc_last;
+__attribute__((noreturn))
+extern void private_LibVEX_alloc_OOM(void);
+
+static inline void* LibVEX_Alloc ( SizeT nbytes )
+{
+# define my_offsetof(type,memb) ((SizeT)(HWord)&((type*)0)->memb)
+# define my_UNLIKELY(x) __builtin_expect(!!(x), 0)
+
+ struct align {
+ char c;
+ union {
+ char c;
+ short s;
+ int i;
+ long l;
+ long long ll;
+ float f;
+ double d;
+ /* long double is currently not used and would increase alignment
+ unnecessarily. */
+ /* long double ld; */
+ void *pto;
+ void (*ptf)(void);
+ } x;
+ };
+
+ HChar* curr;
+ HChar* next;
+ SizeT ALIGN;
+ ALIGN = my_offsetof(struct align,x) - 1;
+ nbytes = (nbytes + ALIGN) & ~ALIGN;
+ curr = private_LibVEX_alloc_curr;
+ next = curr + nbytes;
+ if (my_UNLIKELY(next >= private_LibVEX_alloc_last))
+ private_LibVEX_alloc_OOM();
+ private_LibVEX_alloc_curr = next;
+ return curr;
+
+# undef my_UNLIKELY
+# undef my_offsetof
+}
+
/* Show Vex allocation statistics. */
extern void LibVEX_ShowAllocStats ( void );
|
|
From: Florian K. <fl...@ei...> - 2015-01-18 18:39:15
|
On 18.01.2015 17:35, sv...@va... wrote: > Author: sewardj > Date: Sun Jan 18 16:35:40 2015 > New Revision: 3068 > > Log: > Move LibVEX_Alloc into the header file so it can be inlined. > This change effectively reverts r2974 which was added for a reason, namely to avoid linkage problems with the icc compiler. Does this give a measurable performance gain? Florian |
|
From: Julian S. <js...@ac...> - 2015-01-18 18:54:21
|
On 18/01/15 19:39, Florian Krohm wrote: > This change effectively reverts r2974 which was added for a reason, > namely to avoid linkage problems with the icc compiler. Ah. Oh. > Does this give a measurable performance gain? Yes. For perf/bigcode2 it reduces the instruction count (on the NCODE branch) from 41,701,692,614 to 40,578,319,334 and in terms of real time, it improves perf by about 1% on this machine (Intel Haswell). I made the change as a result of looking at a Callgrind profile of Memcheck running that test, which shows about 4.5 million calls to LibVEX_Alloc each using just 8 insns. Hence it's a prime candidate for inlining. I don't want to regress functionality/correctness though. Do you know if there is some way to have LibVEX_Alloc be inlined and not break building with icc? That is, to have our cake and eat it, so to speak. J |
|
From: Florian K. <fl...@ei...> - 2015-01-18 20:05:41
|
On 18.01.2015 19:54, Julian Seward wrote: > > I don't want to regress functionality/correctness though. Do you know > if there is some way to have LibVEX_Alloc be inlined and not break building > with icc? That is, to have our cake and eat it, so to speak. > Hmm.. perhaps the header file entanglement in r14600 fixes this. I recall that linking the linux launcher did not work because it included libvex.h which defined LibVEX_Alloc. ICC chose to keep the definition of the function (even though it wasn't used) and as LibVEX_Alloc is referring to some global variables defined in main_util.c this caused linker errors (linux launcher does not pull in libvex.a). Not sure if there was another problem... Florian |
|
From: John R. <jr...@bi...> - 2015-01-19 04:01:04
|
The code in LibVEX_Alloc works "by accident". The result happens to be correct for current architectures (x86, x86_64, PowerPC, PowerPC-64, MIPS, MIPS-64, ARM, ARM-64) and compilers (gcc, icc, clang, xlc, ...) However, an adversarial compiler could force LibVEX_Alloc to return a value which is not properly aligned. In particular, PowerPC-64 is on the cusp. Also, LibVEX_Alloc wastes space. The wasted space is easy to see. Suppose these are two consecutive calls: p1 = LibVEX_Alloc(1); p2 = LibVEX_Alloc(1); If p2 and p1 reside in the same allocation block, then (8 == ((char *)p2 - (char *)p1)) and there are 7 wasted bytes beginning at (1+ (char *)p1). The alignment of the result is "correct by accident" because the code enforces alignment only to (0 modulo 8). This happens to be enough because (8 == my_offsetof(struct align,x)) in all current cases. However, it is easy to imagine a case where (16 == my_offsetof(struct align,x)). In particular, a compiler for PowerPC-64 is allowed to choose (16 == alignof(void (*)(void))) (perhaps motivated by (16 == sizeof(void (*)(void))) ) and that would be enough to reveal the bug in LibVEX_Alloc. The existing code aligns the lowest pointer in an allocation block to a multiple of 8 using code analogous to static HChar permanent[N_PERMANENT_BYTES] __attribute__((aligned(8))); and the code in LibVEX_Alloc does not check for (8 < my_offsetof(struct align,x)). Both the wasted space and the possible incorrect alignment happen because LibVEX_Alloc aligns the size. Instead, LibVEX_Alloc should align the pointer: static unsigned const alignment_table[16] = {0,0,1,1, 3,3,3,3, 7,7,7,7, 7,7,7,7}; nbytes += (0 == nbytes); /* 0 ==> 1; ISO C requires that malloc(0) be unique */ ALIGN = ((nbytes >= 0x10) ? 
0xf : alignment_table[nbytes]); curr = (char *)(~ALIGN & (ALIGN + (ptrdiff_t)private_LibVEX_alloc_next)); private_LibVEX_alloc_next = nbytes + curr; return curr; If it is known that nbytes is sizeof an actual C struct, then the table may be static unsigned const alignment_table[16] = {0,0,1,0, 3,0,1,0, 7,0,1,0, 3,0,1,0}; because alignof(struct X) must divide sizeof(struct X); think of "struct X[N];" In practice this may be violated by a trailing variable-length character array, but that is not legal C. |
|
From: Florian K. <fl...@ei...> - 2015-01-19 09:45:41
|
On 19.01.2015 05:00, John Reiser wrote: > The code in LibVEX_Alloc works "by accident". The result happens to be correct > for current architectures (x86, x86_64, PowerPC, PowerPC-64, MIPS, MIPS-64, ARM, > ARM-64) and compilers (gcc, icc, clang, xlc, ...) However, an adversarial compiler > could force LibVEX_Alloc to return a value which is not properly aligned. > In particular, PowerPC-64 is on the cusp. Also, LibVEX_Alloc wastes space. > > The wasted space is easy to see. Suppose these are two consecutive calls: > p1 = LibVEX_Alloc(1); > p2 = LibVEX_Alloc(1); > If p2 and p1 reside in the same allocation block, then > (8 == ((char *)p2 - (char *)p1)) > and there are 7 wasted bytes beginning at (1+ (char *)p1). > > The alignment of the result is "correct by accident" because the code > enforces alignment only to (0 modulo 8). This happens to be enough because > (8 == my_offsetof(struct align,x)) in all current cases. However, it is > easy to imagine a case where (16 == my_offsetof(struct align,x)). > In particular, a compiler for PowerPC-64 is allowed to choose > (16 == alignof(void (*)(void))) > (perhaps motivated by (16 == sizeof(void (*)(void))) ) > and that would be enough to reveal the bug in LibVEX_Alloc. > > The existing code aligns the lowest pointer in an allocation block to a > multiple of 8 using code analogous to > static HChar permanent[N_PERMANENT_BYTES] __attribute__((aligned(8))); > and the code in LibVEX_Alloc does not check for (8 < my_offsetof(struct align,x)). Yes, we should fix the attribute spec. > Both the wasted space and the possible incorrect alignment happen because > LibVEX_Alloc aligns the size. Instead, LibVEX_Alloc should align the pointer: > > static unsigned const alignment_table[16] = [0,0,1,1, 3,3,3,3, 7,7,7,7, 7,7,7,7]; > nbytes += (0 == nbytes); /* 0 ==> 1; ISO C requires that malloc(0) be (GENSYM) */ > ALIGN = ((nbytes >= 0x10) ? 0xf : alignment_table[nbytes]); 0xf seems wasteful to me if only 8 byte alignment is needed. 
But that can be salvaged and auto-adjusted to whatever the current compiler uses. > curr = (char *)(~ALIGN & (ALIGN + (ptrdiff_t)private_LibVEX_alloc_next); > private_LibVEX_alloc_next = nbytes + curr; > return curr; I would be interested in some numbers. Would you mind running an experiment with, say, the tests from /perf ? Florian |
|
From: Julian S. <js...@ac...> - 2015-01-19 10:47:33
|
On 19/01/15 10:45, Florian Krohm wrote: > On 19.01.2015 05:00, John Reiser wrote: >> The code in LibVEX_Alloc works "by accident". The result happens to be correct >> for current architectures (x86, x86_64, PowerPC, PowerPC-64, MIPS, MIPS-64, ARM, >> ARM-64) and compilers (gcc, icc, clang, xlc, ...) However, an adversarial compiler >> could force LibVEX_Alloc to return a value which is not properly aligned. >> In particular, PowerPC-64 is on the cusp. Also, LibVEX_Alloc wastes space. The current code produces an alignment of 8 I think, which is wrong on all -- or, at least, almost all -- targets anyway. Even on x86 the minimum correct alignment is 16, for storage of vector registers (xmm). But LibVEX_Alloc is designed to support vex, and be a very fast and compact bump-allocator. Vex doesn't do any vector loads/stores so an alignment of 8 seems OK. Given that vex allocates at least at hundreds of megabytes per second and given that D1$ space is very limited, especially on lower-end targets, I'd prefer not to change the alignment requirement to 16 unless really necessary. Maybe a simple solution would be to remove from LibVEX_Alloc the "struct align" thing and hardwire 8 as the alignment. And also, in LibVEX_Init, put some assertions to check that 8 is in fact OK for the set of types we're interested in. Given that the base pointer for the area is already 8 aligned, I don't then see how aligning the size rather than the pointer could lead to misaligned results. Aligning the size rather than the pointer has the advantage that it can be done by gcc's constant folding at compile time, for any allocation of constant size, which almost all of them are. As for LibVEX_Alloc(0), that should never happen. I guess we could add an assert in there on the assumption that it will get constant folded out for all constant-size allocation requests. J |