[Substrate-commits] SF.net SVN: substrate: [287] trunk/Foundation
Brought to you by:
landonf
|
From: <la...@us...> - 2006-09-01 00:08:27
|
Revision: 287
http://svn.sourceforge.net/substrate/?rev=287&view=rev
Author: landonf
Date: 2006-08-31 17:08:24 -0700 (Thu, 31 Aug 2006)
Log Message:
-----------
Merge in latest spinlock implementation from Postgres
Modified Paths:
--------------
trunk/Foundation/s_lock.c
trunk/Foundation/s_lock.h
Modified: trunk/Foundation/s_lock.c
===================================================================
--- trunk/Foundation/s_lock.c 2006-08-31 22:59:15 UTC (rev 286)
+++ trunk/Foundation/s_lock.c 2006-09-01 00:08:24 UTC (rev 287)
@@ -30,7 +30,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/lmgr/s_lock.c,v 1.35 2004/12/31 22:01:05 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/lmgr/s_lock.c,v 1.40.2.2 2006/05/11 21:58:29 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -68,6 +68,7 @@
#include <Foundation/s_lock.h>
#include <Foundation/port.h>
+/* A few postgresql defines and macros */
/*
* The random() function is expected to yield values between 0 and
* MAX_RANDOM_VALUE. Currently, all known implementations yield
@@ -78,7 +79,12 @@
* considerably inferior to --- random().
*/
#define MAX_RANDOM_VALUE (0x7FFFFFFF)
+#define Min(x, y) ((x) < (y) ? (x) : (y))
+#define Max(x, y) ((x) > (y) ? (x) : (y))
+
+static int spins_per_delay = DEFAULT_SPINS_PER_DELAY;
+
/*
* s_lock_stuck() - complain about a stuck spinlock
*/
@@ -104,44 +110,52 @@
lf_s_lock(volatile slock_t *lock, const char *file, int line)
{
/*
- * We loop tightly for awhile, then delay using pg_usleep() and try
- * again. Preferably, "awhile" should be a small multiple of the
- * maximum time we expect a spinlock to be held. 100 iterations seems
- * about right. In most multi-CPU scenarios, the spinlock is probably
- * held by a process on another CPU and will be released before we
- * finish 100 iterations. However, on a uniprocessor, the tight loop
- * is just a waste of cycles, so don't iterate thousands of times.
+ * We loop tightly for awhile, then delay using pg_usleep() and try again.
+ * Preferably, "awhile" should be a small multiple of the maximum time we
+ * expect a spinlock to be held. 100 iterations seems about right as an
+ * initial guess. However, on a uniprocessor the loop is a waste of
+ * cycles, while in a multi-CPU scenario it's usually better to spin a bit
+ * longer than to call the kernel, so we try to adapt the spin loop count
+ * depending on whether we seem to be in a uniprocessor or multiprocessor.
*
+ * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
+ * be wrong; there are platforms where that can result in a "stuck
+ * spinlock" failure. This has been seen particularly on Alphas; it seems
+ * that the first TAS after returning from kernel space will always fail
+ * on that hardware.
+ *
* Once we do decide to block, we use randomly increasing pg_usleep()
- * delays. The first delay is 10 msec, then the delay randomly
- * increases to about one second, after which we reset to 10 msec and
- * start again. The idea here is that in the presence of heavy
- * contention we need to increase the delay, else the spinlock holder
- * may never get to run and release the lock. (Consider situation
- * where spinlock holder has been nice'd down in priority by the
- * scheduler --- it will not get scheduled until all would-be
- * acquirers are sleeping, so if we always use a 10-msec sleep, there
- * is a real possibility of starvation.) But we can't just clamp the
- * delay to an upper bound, else it would take a long time to make a
- * reasonable number of tries.
+ * delays. The first delay is 1 msec, then the delay randomly increases to
+ * about one second, after which we reset to 1 msec and start again. The
+ * idea here is that in the presence of heavy contention we need to
+ * increase the delay, else the spinlock holder may never get to run and
+ * release the lock. (Consider situation where spinlock holder has been
+ * nice'd down in priority by the scheduler --- it will not get scheduled
+ * until all would-be acquirers are sleeping, so if we always use a 1-msec
+ * sleep, there is a real possibility of starvation.) But we can't just
+ * clamp the delay to an upper bound, else it would take a long time to
+ * make a reasonable number of tries.
*
* We time out and declare error after NUM_DELAYS delays (thus, exactly
- * that many tries). With the given settings, this will usually take
- * 3 or so minutes. It seems better to fix the total number of tries
- * (and thus the probability of unintended failure) than to fix the
- * total time spent.
+ * that many tries). With the given settings, this will usually take 2 or
+ * so minutes. It seems better to fix the total number of tries (and thus
+ * the probability of unintended failure) than to fix the total time
+ * spent.
*
- * The pg_usleep() delays are measured in centiseconds (0.01 sec) because
- * 10 msec is a common resolution limit at the OS level.
+ * The pg_usleep() delays are measured in milliseconds because 1 msec is a
+ * common resolution limit at the OS level for newer platforms. On older
+ * platforms the resolution limit is usually 10 msec, in which case the
+ * total delay before timeout will be a bit more.
*/
-#define SPINS_PER_DELAY 100
+#define MIN_SPINS_PER_DELAY 10
+#define MAX_SPINS_PER_DELAY 1000
#define NUM_DELAYS 1000
-#define MIN_DELAY_CSEC 1
-#define MAX_DELAY_CSEC 100
+#define MIN_DELAY_MSEC 1
+#define MAX_DELAY_MSEC 1000
int spins = 0;
int delays = 0;
- int cur_delay = MIN_DELAY_CSEC;
+ int cur_delay = 0;
#ifndef WIN32
struct timeval delay;
#endif
@@ -151,21 +165,21 @@
/* CPU-specific delay each time through the loop */
SPIN_DELAY();
- /* Block the process every SPINS_PER_DELAY tries */
- if (++spins > SPINS_PER_DELAY)
+ /* Block the process every spins_per_delay tries */
+ if (++spins >= spins_per_delay)
{
if (++delays > NUM_DELAYS)
s_lock_stuck(lock, file, line);
- if (cur_delay > 0) {
+ if (cur_delay == 0) /* first time to delay? */
+ cur_delay = MIN_DELAY_MSEC;
#ifndef WIN32
- delay.tv_sec = (cur_delay * 10000L) / 1000000L;
- delay.tv_usec = (cur_delay * 10000L) % 1000000L;
- (void) select(0, NULL, NULL, NULL, &delay);
+ delay.tv_sec = 0;
+ delay.tv_usec = cur_delay * 1000L;
+ (void) select(0, NULL, NULL, NULL, &delay);
#else
- SleepEx(((cur_delay * 10000L) < 500 ? 1 : ((cur_delay * 10000L) + 500) / 1000), FALSE);
+ SleepEx(cur_delay, FALSE);
#endif
- }
#if defined(S_LOCK_TEST)
fprintf(stdout, "*");
@@ -174,17 +188,78 @@
/* increase delay by a random fraction between 1X and 2X */
cur_delay += (int) (cur_delay *
- (((double) random()) / ((double) MAX_RANDOM_VALUE)) + 0.5);
+ (((double) random()) / ((double) MAX_RANDOM_VALUE)) + 0.5);
/* wrap back to minimum delay when max is exceeded */
- if (cur_delay > MAX_DELAY_CSEC)
- cur_delay = MIN_DELAY_CSEC;
+ if (cur_delay > MAX_DELAY_MSEC)
+ cur_delay = MIN_DELAY_MSEC;
spins = 0;
}
}
+
+ /*
+ * If we were able to acquire the lock without delaying, it's a good
+ * indication we are in a multiprocessor. If we had to delay, it's a sign
+ * (but not a sure thing) that we are in a uniprocessor. Hence, we
+ * decrement spins_per_delay slowly when we had to delay, and increase it
+ * rapidly when we didn't. It's expected that spins_per_delay will
+ * converge to the minimum value on a uniprocessor and to the maximum
+ * value on a multiprocessor.
+ *
+ * Note: spins_per_delay is local within our current process. We want to
+ * average these observations across multiple backends, since it's
+ * relatively rare for this function to even get entered, and so a single
+ * backend might not live long enough to converge on a good value. That
+ * is handled by the two routines below.
+ */
+ if (cur_delay == 0)
+ {
+ /* we never had to delay */
+ if (spins_per_delay < MAX_SPINS_PER_DELAY)
+ spins_per_delay = Min (spins_per_delay + 100, MAX_SPINS_PER_DELAY);
+ }
+ else
+ {
+ if (spins_per_delay > MIN_SPINS_PER_DELAY)
+ spins_per_delay = Max (spins_per_delay - 1, MIN_SPINS_PER_DELAY);
+ }
}
+
/*
+ * Set local copy of spins_per_delay during backend startup.
+ *
+ * NB: this has to be pretty fast as it is called while holding a spinlock
+ */
+LF_PRIVATE void
+set_spins_per_delay(int shared_spins_per_delay)
+{
+ spins_per_delay = shared_spins_per_delay;
+}
+
+/*
+ * Update shared estimate of spins_per_delay during backend exit.
+ *
+ * NB: this has to be pretty fast as it is called while holding a spinlock
+ */
+LF_PRIVATE int
+update_spins_per_delay(int shared_spins_per_delay)
+{
+ /*
+ * We use an exponential moving average with a relatively slow adaption
+ * rate, so that noise in any one backend's result won't affect the shared
+ * value too much. As long as both inputs are within the allowed range,
+ * the result must be too, so we need not worry about clamping the result.
+ *
+ * We deliberately truncate rather than rounding; this is so that single
+ * adjustments inside a backend can affect the shared estimate (see the
+ * asymmetric adjustment rules above).
+ */
+ return (shared_spins_per_delay * 15 + spins_per_delay) / 16;
+}
+
+
+/*
* Various TAS implementations that cannot live in s_lock.h as no inline
* definition exists (yet).
* In the future, get rid of tas.[cso] and fold it into this file.
@@ -204,7 +279,12 @@
*/
-#if defined(__m68k__)
+/*
+ * Note: all the if-tests here probably ought to be testing gcc version
+ * rather than platform, but I don't have adequate info to know what to
+ * write. Ideally we'd flush all this in favor of the inline version.
+ */
+#if defined(__m68k__) && !defined(__linux__)
/* really means: extern int tas(slock_t* **lock); */
static void
tas_dummy()
@@ -212,7 +292,7 @@
__asm__ __volatile__(
#if defined(__NetBSD__) && defined(__ELF__)
/* no underscore for label and % for registers */
- "\
+ "\
.global tas \n\
tas: \n\
movel %sp@(0x4),%a0 \n\
@@ -224,7 +304,7 @@
moveq #0,%d0 \n\
rts \n"
#else
- "\
+ "\
.global _tas \n\
_tas: \n\
movel sp@(0x4),a0 \n\
@@ -236,39 +316,9 @@
moveq #0,d0 \n\
rts \n"
#endif /* __NetBSD__ && __ELF__ */
-);
+ );
}
-#endif /* __m68k__ */
-
-
-#if defined(__mips__) && !defined(__sgi)
-static void
-tas_dummy()
-{
- __asm__ __volatile__(
- "\
-.global tas \n\
-tas: \n\
- .frame $sp, 0, $31 \n\
- .set push \n\
- .set mips2 \n\
- ll $14, 0($4) \n\
- or $15, $14, 1 \n\
- sc $15, 0($4) \n\
- .set pop \n\
- beq $15, 0, fail\n\
- bne $14, 0, fail\n\
- li $2, 0 \n\
- .livereg 0x2000FF0E,0x00000FFF \n\
- j $31 \n\
-fail: \n\
- li $2, 1 \n\
- j $31 \n\
-");
-}
-#endif /* __mips__ && !__sgi */
-
-
+#endif /* __m68k__ && !__linux__ */
#else /* not __GNUC__ */
/*
@@ -309,15 +359,6 @@
tas_dummy() /* really means: extern int tas(slock_t
* *lock); */
{
-
-#ifdef SUNOS4_CC
- asm(".seg \"data\"");
- asm(".seg \"text\"");
-#else
- asm(".section \"data\"");
- asm(".section \"text\"");
-#endif
-
asm("_tas:");
/*
Modified: trunk/Foundation/s_lock.h
===================================================================
--- trunk/Foundation/s_lock.h 2006-08-31 22:59:15 UTC (rev 286)
+++ trunk/Foundation/s_lock.h 2006-09-01 00:08:24 UTC (rev 287)
@@ -86,7 +86,7 @@
* ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO
* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
*
- * $PostgreSQL: pgsql/src/include/storage/s_lock.h,v 1.133 2004/12/31 22:03:42 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/storage/s_lock.h,v 1.142 2005/10/11 20:41:32 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -123,7 +123,7 @@
*/
-#if defined(__i386__) || defined(__x86_64__) /* AMD Opteron */
+#ifdef __i386__
#define HAS_TEST_AND_SET
typedef unsigned char slock_t;
@@ -135,7 +135,11 @@
{
register slock_t _res = 1;
- /* Use a non-locking test before asserting the bus lock */
+ /*
+ * Use a non-locking test before asserting the bus lock. Note that the
+ * extra test appears to be a small loss on some x86 platforms and a small
+ * win on others; it's by no means clear that we should keep it.
+ */
__asm__ __volatile__(
" cmpb $0,%1 \n"
" jne 1f \n"
@@ -180,10 +184,52 @@
" rep; nop \n");
}
-#endif /* __i386__ || __x86_64__ */
+#endif /* __i386__ */
-#if defined(__ia64__) || defined(__ia64) /* __ia64 used by ICC compiler? */
+#ifdef __x86_64__ /* AMD Opteron, Intel EM64T */
+#define HAS_TEST_AND_SET
+
+typedef unsigned char slock_t;
+
+#define TAS(lock) tas(lock)
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ register slock_t _res = 1;
+
+ /*
+ * On Opteron, using a non-locking test before the locking instruction
+ * is a huge loss. On EM64T, it appears to be a wash or small loss,
+ * so we needn't bother to try to distinguish the sub-architectures.
+ */
+ __asm__ __volatile__(
+ " lock \n"
+ " xchgb %0,%1 \n"
+: "+q"(_res), "+m"(*lock)
+:
+: "memory", "cc");
+ return (int) _res;
+}
+
+#define SPIN_DELAY() spin_delay()
+
+static __inline__ void
+spin_delay(void)
+{
+ /*
+ * Adding a PAUSE in the spin delay loop is demonstrably a no-op on
+ * Opteron, but it may be of some use on EM64T, so we keep it.
+ */
+ __asm__ __volatile__(
+ " rep; nop \n");
+}
+
+#endif /* __x86_64__ */
+
+
+#if defined(__ia64__) || defined(__ia64)
/* Intel Itanium */
#define HAS_TEST_AND_SET
@@ -191,6 +237,8 @@
#define TAS(lock) tas(lock)
+#ifndef __INTEL_COMPILER
+
static __inline__ int
tas(volatile slock_t *lock)
{
@@ -204,6 +252,19 @@
return (int) ret;
}
+#else /* __INTEL_COMPILER */
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ int ret;
+
+ ret = _InterlockedExchange(lock,1); /* this is a xchg asm macro */
+
+ return ret;
+}
+
+#endif /* __INTEL_COMPILER */
#endif /* __ia64__ || __ia64 */
@@ -328,7 +389,7 @@
#endif /* powerpc */
-#if defined(__mc68000__) && defined(__linux__)
+#if (defined(__mc68000__) || defined(__m68k__)) && defined(__linux__)
#define HAS_TEST_AND_SET
typedef unsigned char slock_t;
@@ -350,7 +411,7 @@
return rv;
}
-#endif /* defined(__mc68000__) && defined(__linux__) */
+#endif /* (__mc68000__ || __m68k__) && __linux__ */
#if defined(__vax__)
@@ -453,20 +514,64 @@
#endif /* __alpha || __alpha__ */
-/* These live in s_lock.c, but only for gcc */
+#if defined(__mips__) && !defined(__sgi)
+/* Note: on SGI we use the OS' mutex ABI, see below */
+/* Note: R10000 processors require a separate SYNC */
+#define HAS_TEST_AND_SET
+typedef unsigned int slock_t;
-#if defined(__m68k__)
-#define HAS_TEST_AND_SET
+#define TAS(lock) tas(lock)
-typedef unsigned char slock_t;
-#endif
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ register volatile slock_t *_l = lock;
+ register int _res;
+ register int _tmp;
+ __asm__ __volatile__(
+ " .set push \n"
+ " .set mips2 \n"
+ " .set noreorder \n"
+ " .set nomacro \n"
+ " ll %0, %2 \n"
+ " or %1, %0, 1 \n"
+ " sc %1, %2 \n"
+ " xori %1, 1 \n"
+ " or %0, %0, %1 \n"
+ " sync \n"
+ " .set pop "
+: "=&r" (_res), "=&r" (_tmp), "+R" (*_l)
+:
+: "memory");
+ return _res;
+}
-#if defined(__mips__) && !defined(__sgi)
+/* MIPS S_UNLOCK is almost standard but requires a "sync" instruction */
+#define S_UNLOCK(lock) \
+do \
+{ \
+ __asm__ __volatile__( \
+ " .set push \n" \
+ " .set mips2 \n" \
+ " .set noreorder \n" \
+ " .set nomacro \n" \
+ " sync \n" \
+ " .set pop "); \
+ *((volatile slock_t *) (lock)) = 0; \
+} while (0)
+
+#endif /* __mips__ && !__sgi */
+
+
+/* These live in s_lock.c, but only for gcc */
+
+
+#if defined(__m68k__) && !defined(__linux__)
#define HAS_TEST_AND_SET
-typedef unsigned int slock_t;
+typedef unsigned char slock_t;
#endif
@@ -492,9 +597,9 @@
tas(volatile slock_t *s_lock)
{
/* UNIVEL wants %mem in column 1, so we don't pg_indent this file */
-%mem lf_s_lock
+%mem s_lock
pushl %ebx
- movl lf_s_lock, %ebx
+ movl s_lock, %ebx
movl $255, %eax
lock
xchgb %al, (%ebx)
@@ -713,7 +818,7 @@
/* Blow up if we didn't have any way to do spinlocks */
#ifndef HAS_TEST_AND_SET
-#error libFoundation does not have native spinlock support on this platform. Please report this to lib...@op....
+#error Substrate does not have native spinlock support on this platform. Please report this to sub...@li....
#endif
/*
@@ -757,4 +862,10 @@
*/
extern void lf_s_lock(volatile slock_t *lock, const char *file, int line);
+/* Support for dynamic adjustment of spins_per_delay */
+#define DEFAULT_SPINS_PER_DELAY 100
+
+extern void set_spins_per_delay(int shared_spins_per_delay);
+extern int update_spins_per_delay(int shared_spins_per_delay);
+
#endif /* S_LOCK_H */
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|