From: Arnd B. <ar...@ar...> - 2007-02-15 14:38:08
|
On Thursday 15 February 2007 00:52, Carl Love wrote: > --- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig 2007-01-18 16:43:14.000000000 -0600 > +++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig 2007-02-13 19:04:46.271028904 -0600 > @@ -7,7 +7,8 @@ > > config OPROFILE > tristate "OProfile system profiling (EXPERIMENTAL)" > - depends on PROFILING > + default m > + depends on SPU_FS && PROFILING > help > OProfile is a profiling system capable of profiling the > whole system, include the kernel, kernel modules, libraries, Milton already commented on this being wrong. I think what you want is depends on PROFILING && (SPU_FS = n || SPU_FS) that should make sure that when SPU_FS=y that OPROFILE can not be 'm'. > @@ -15,3 +16,10 @@ > > If unsure, say N. > > +config OPROFILE_CELL > + bool "OProfile for Cell Broadband Engine" > + depends on SPU_FS && OPROFILE > + default y > + help > + OProfile for Cell BE requires special support enabled > + by this option. You should at least mention that this allows profiling the spus. > +#define EFWCALL ENOSYS /* Use an existing error number that is as > + * close as possible for a FW call that failed. > + * The probability of the call failing is > + * very low. Passing up the error number > + * ensures that the user will see an error > + * message saying OProfile did not start. > + * Dmesg will contain an accurate message > + * about the failure. > + */ ENOSYS looks wrong though. It would appear to the user as if the oprofile function in the kernel was not present. I'd suggest EIO, and not use an extra define for that. 
> static int > rtas_ibm_cbe_perftools(int subfunc, int passthru, > void *address, unsigned long length) > { > u64 paddr = __pa(address); > > - return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru, > - paddr >> 32, paddr & 0xffffffff, length); > + pm_rtas_token = rtas_token("ibm,cbe-perftools"); > + > + if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) { > + printk(KERN_ERR > + "%s: rtas token ibm,cbe-perftools unknown\n", > + __FUNCTION__); > + return -EFWCALL; > + } else { > + > + return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, > + passthru, paddr >> 32, paddr & 0xffffffff, length); > + } > } Are you now reading the rtas token every time you call rtas? that seems like a waste of time. > +#define size 24 > +#define ENTRIES (0x1<<8) /* 256 */ > +#define MAXLFSR 0xFFFFFF > + > +int initial_lfsr[] = > +{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937, 2889445, > + 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212, 4996256, > + 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695, 3061843, > + 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063, 10176714, > + 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230, 15603106, > + 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072, 1714539, > + 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273, 12122372, > + 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230, 777380, > + 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896, 16527555, > + 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980, 100120, > + 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693, 3320753, > + 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856, 14254582, > + 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315, 7330509, > + 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398, 11392266, > + 793740, 7653789, 2472670, 14641200, 5164364, 5482529, 10415855, 
1629108, > + 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923, 12750103, > + 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913, 103649, > + 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898, 7104918, > + 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503, 11151952, > + 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358, > + 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515, 13593840, > + 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210, 13745050, > + 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534, 15238251, > + 4947424, 7014722, 3984546, 7168073, 10759589, 16293080, 3757181, 4577717, > + 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232, 15540909, > + 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712, > + 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638, 994152, > + 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428, 1434298, > + 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070, 2451434, > + 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772, 12724304, > + 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246, 1732363, > + 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500, 4110674}; > + > +/* > + * The hardware uses an LFSR counting sequence to determine when to capture > + * the SPU PCs. The SPU PC capture is done when the LFSR sequence reaches the > + * last value in the sequence. An LFSR sequence is like a puesdo random > + * number sequence where each number occurs once in the sequence but the > + * sequence is not in numerical order. To reduce the calculation time, a > + * sequence of 256 precomputed values in the LFSR sequence are stored in a > + * table. The nearest precomputed value is used as the initial point from > + * which to caculate the desired LFSR value that is n from the end of the > + * sequence. 
The lookup table reduces the maximum number of iterations in > + * the loop from 2^24 to 2^16. > + */ > +static int calculate_lfsr(int n) > +{ > + int i; > + > + int start_lfsr_index; > + unsigned int newlfsr0; > + unsigned int lfsr = MAXLFSR; > + unsigned int binsize = (MAXLFSR+1)/ENTRIES; > + unsigned int howmany; > + > + start_lfsr_index = (MAXLFSR - n) / binsize; > + lfsr = initial_lfsr[start_lfsr_index]; > + howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize)); > + > + for (i = 2; i < howmany+2; i++) { > + newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^ > + ((lfsr >> (size - 1 - 1)) & 1) ^ > + (((lfsr >> (size - 1 - 6)) & 1) ^ > + ((lfsr >> (size - 1 - 23)) & 1))); > + > + lfsr >>= 1; > + lfsr = lfsr | (newlfsr0 << (size - 1)); > + } > + return lfsr; > +} I agree with Milton that it would be far nicer even to calculate the value from user space, but since you say that would violate the oprofile interface conventions, let's not go there. In order to make this code nicer on the user, you should probably insert a 'cond_resched()' somewhere in the loop, maybe every 500 iterations or so. it also looks like there is whitespace damage in the code here. > + > +/* This interface allows a profiler (e.g., OProfile) to store > + * spu_context information needed for profiling, allowing it to > + * be saved across context save/restore operation. > + * > + * Assumes the caller has already incremented the ref count to > + * profile_info; then spu_context_destroy must call kref_put > + * on prof_info_kref. 
> + */ > +void spu_set_profile_private(struct spu_context * ctx, void * profile_info, > + struct kref * prof_info_kref, > + void (* prof_info_release) (struct kref * kref)) > +{ > + ctx->profile_private = profile_info; > + ctx->prof_priv_kref = prof_info_kref; > + ctx->prof_priv_release = prof_info_release; > +} > +EXPORT_SYMBOL_GPL(spu_set_profile_private); I think you don't need the profile_private member here, if you just use container_of with ctx->prof_priv_kref in all users. Arnd <>< |
From: Maynard J. <may...@us...> - 2007-02-15 16:15:37
|
Arnd Bergmann wrote: >On Thursday 15 February 2007 00:52, Carl Love wrote: > > > > >>--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig 2007-01-18 16:43:14.000000000 -0600 >>+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig 2007-02-13 19:04:46.271028904 -0600 >>@@ -7,7 +7,8 @@ >> >> config OPROFILE >> tristate "OProfile system profiling (EXPERIMENTAL)" >>- depends on PROFILING >>+ default m >>+ depends on SPU_FS && PROFILING >> help >> OProfile is a profiling system capable of profiling the >> whole system, include the kernel, kernel modules, libraries, >> >> > >Milton already commented on this being wrong. I think what you want >is > depends on PROFILING && (SPU_FS = n || SPU_FS) > >that should make sure that when SPU_FS=y that OPROFILE can not be 'm'. > > Blast it! I did this right on our development system, but neglected to update the patch correctly to remove this dependency and 'default m'. I'll fix in the next patch. > > >>@@ -15,3 +16,10 @@ >> >> If unsure, say N. >> >>+config OPROFILE_CELL >>+ bool "OProfile for Cell Broadband Engine" >>+ depends on SPU_FS && OPROFILE >>+ default y >>+ help >>+ OProfile for Cell BE requires special support enabled >>+ by this option. >> >> > >You should at least mention that this allows profiling the spus. > > OK. > > >>+#define EFWCALL ENOSYS /* Use an existing error number that is as >>+ * close as possible for a FW call that failed. >>+ * The probability of the call failing is >>+ * very low. Passing up the error number >>+ * ensures that the user will see an error >>+ * message saying OProfile did not start. >>+ * Dmesg will contain an accurate message >>+ * about the failure. >>+ */ >> >> > >ENOSYS looks wrong though. It would appear to the user as if the oprofile >function in the kernel was not present. I'd suggest EIO, and not use >an extra define for that. > > Carl will reply to this. 
> > > >> static int >> rtas_ibm_cbe_perftools(int subfunc, int passthru, >> void *address, unsigned long length) >> { >> u64 paddr = __pa(address); >> >>- return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru, >>- paddr >> 32, paddr & 0xffffffff, length); >>+ pm_rtas_token = rtas_token("ibm,cbe-perftools"); >>+ >>+ if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) { >>+ printk(KERN_ERR >>+ "%s: rtas token ibm,cbe-perftools unknown\n", >>+ __FUNCTION__); >>+ return -EFWCALL; >>+ } else { >>+ >>+ return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, >>+ passthru, paddr >> 32, paddr & 0xffffffff, length); >>+ } >> } >> >> > >Are you now reading the rtas token every time you call rtas? that seems >like a waste of time. > > Carl will reply. > > > >>+#define size 24 >>+#define ENTRIES (0x1<<8) /* 256 */ >>+#define MAXLFSR 0xFFFFFF >>+ >>+int initial_lfsr[] = >>+{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937, 2889445, >>+ 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212, 4996256, >>+ 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695, 3061843, >>+ 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063, 10176714, >>+ 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230, 15603106, >>+ 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072, 1714539, >>+ 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273, 12122372, >>+ 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230, 777380, >>+ 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896, 16527555, >>+ 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980, 100120, >>+ 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693, 3320753, >>+ 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856, 14254582, >>+ 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315, 7330509, >>+ 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398, 11392266, >>+ 793740, 7653789, 
2472670, 14641200, 5164364, 5482529, 10415855, 1629108, >>+ 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923, 12750103, >>+ 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913, 103649, >>+ 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898, 7104918, >>+ 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503, 11151952, >>+ 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358, >>+ 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515, 13593840, >>+ 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210, 13745050, >>+ 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534, 15238251, >>+ 4947424, 7014722, 3984546, 7168073, 10759589, 16293080, 3757181, 4577717, >>+ 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232, 15540909, >>+ 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712, >>+ 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638, 994152, >>+ 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428, 1434298, >>+ 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070, 2451434, >>+ 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772, 12724304, >>+ 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246, 1732363, >>+ 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500, 4110674}; >>+ >>+/* >>+ * The hardware uses an LFSR counting sequence to determine when to capture >>+ * the SPU PCs. The SPU PC capture is done when the LFSR sequence reaches the >>+ * last value in the sequence. An LFSR sequence is like a puesdo random >>+ * number sequence where each number occurs once in the sequence but the >>+ * sequence is not in numerical order. To reduce the calculation time, a >>+ * sequence of 256 precomputed values in the LFSR sequence are stored in a >>+ * table. 
The nearest precomputed value is used as the initial point from >>+ * which to caculate the desired LFSR value that is n from the end of the >>+ * sequence. The lookup table reduces the maximum number of iterations in >>+ * the loop from 2^24 to 2^16. >>+ */ >>+static int calculate_lfsr(int n) >>+{ >>+ int i; >>+ >>+ int start_lfsr_index; >>+ unsigned int newlfsr0; >>+ unsigned int lfsr = MAXLFSR; >>+ unsigned int binsize = (MAXLFSR+1)/ENTRIES; >>+ unsigned int howmany; >>+ >>+ start_lfsr_index = (MAXLFSR - n) / binsize; >>+ lfsr = initial_lfsr[start_lfsr_index]; >>+ howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize)); >>+ >>+ for (i = 2; i < howmany+2; i++) { >>+ newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^ >>+ ((lfsr >> (size - 1 - 1)) & 1) ^ >>+ (((lfsr >> (size - 1 - 6)) & 1) ^ >>+ ((lfsr >> (size - 1 - 23)) & 1))); >>+ >>+ lfsr >>= 1; >>+ lfsr = lfsr | (newlfsr0 << (size - 1)); >>+ } >>+ return lfsr; >>+} >> >> > >I agree with Milton that it would be far nicer even to calculate >the value from user space, but since you say that would >violate the oprofile interface conventions, let's not go there. >In order to make this code nicer on the user, you should probably >insert a 'cond_resched()' somewhere in the loop, maybe every >500 iterations or so. > >it also looks like there is whitespace damage in the code here. > > Carl will reply. > > >>+ >>+/* This interface allows a profiler (e.g., OProfile) to store >>+ * spu_context information needed for profiling, allowing it to >>+ * be saved across context save/restore operation. >>+ * >>+ * Assumes the caller has already incremented the ref count to >>+ * profile_info; then spu_context_destroy must call kref_put >>+ * on prof_info_kref. 
>>+ */ >>+void spu_set_profile_private(struct spu_context * ctx, void * profile_info, >>+ struct kref * prof_info_kref, >>+ void (* prof_info_release) (struct kref * kref)) >>+{ >>+ ctx->profile_private = profile_info; >>+ ctx->prof_priv_kref = prof_info_kref; >>+ ctx->prof_priv_release = prof_info_release; >>+} >>+EXPORT_SYMBOL_GPL(spu_set_profile_private); >> >> > >I think you don't need the profile_private member here, if you just use >container_of with ctx->prof_priv_kref in all users. > > Sorry, I don't follow. We want the profile_private to be stored in the spu_context, don't we? How else would I be able to do that? And besides, wouldn't container_of need the struct name of profile_private? SPUFS doesn't have access to the type. -Maynard > Arnd <>< > >------------------------------------------------------------------------- >Take Surveys. Earn Cash. Influence the Future of IT >Join SourceForge.net's Techsay panel and you'll get the chance to share your >opinions on IT & business topics through brief surveys-and earn cash >http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV >_______________________________________________ >oprofile-list mailing list >opr...@li... >https://lists.sourceforge.net/lists/listinfo/oprofile-list > > |
From: Arnd B. <ar...@ar...> - 2007-02-15 18:14:41
|
On Thursday 15 February 2007 17:15, Maynard Johnson wrote: > >>+void spu_set_profile_private(struct spu_context * ctx, void * profile_info, > >>+ struct kref * prof_info_kref, > >>+ void (* prof_info_release) (struct kref * kref)) > >>+{ > >>+ ctx->profile_private = profile_info; > >>+ ctx->prof_priv_kref = prof_info_kref; > >>+ ctx->prof_priv_release = prof_info_release; > >>+} > >>+EXPORT_SYMBOL_GPL(spu_set_profile_private); > >> > >> > > > >I think you don't need the profile_private member here, if you just use > >container_of with ctx->prof_priv_kref in all users. > > > > > Sorry, I don't follow. We want the profile_private to be stored in the > spu_context, don't we? How else would I be able to do that? And > besides, wouldn't container_of need the struct name of profile_private? > SPUFS doesn't have access to the type. The idea was to have spu_get_profile_private return the kref pointer, and then change the user of that to do + if (!spu_info[spu_num] && the_spu) { + spu_info[spu_num] = container_of( + spu_get_profile_private(the_spu->ctx), + struct cached_info, cache_kref); + if (spu_info[spu_num]) + kref_get(&spu_info[spu_num]->cache_ref); |
From: Carl L. <ce...@us...> - 2007-02-15 20:22:24
|
On Thu, 2007-02-15 at 15:37 +0100, Arnd Bergmann wrote: > On Thursday 15 February 2007 00:52, Carl Love wrote: > > > > --- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig 2007-01-18 16:43:14.000000000 -0600 > > +++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig 2007-02-13 19:04:46.271028904 -0600 > > @@ -7,7 +7,8 @@ > > > > config OPROFILE > > tristate "OProfile system profiling (EXPERIMENTAL)" > > - depends on PROFILING > > + default m > > + depends on SPU_FS && PROFILING > > help > > OProfile is a profiling system capable of profiling the > > whole system, include the kernel, kernel modules, libraries, > > Milton already commented on this being wrong. I think what you want > is > depends on PROFILING && (SPU_FS = n || SPU_FS) > > that should make sure that when SPU_FS=y that OPROFILE can not be 'm'. > > > @@ -15,3 +16,10 @@ > > > > If unsure, say N. > > > > +config OPROFILE_CELL > > + bool "OProfile for Cell Broadband Engine" > > + depends on SPU_FS && OPROFILE > > + default y > > + help > > + OProfile for Cell BE requires special support enabled > > + by this option. > > You should at least mention that this allows profiling the spus. > > > +#define EFWCALL ENOSYS /* Use an existing error number that is as > > + * close as possible for a FW call that failed. > > + * The probability of the call failing is > > + * very low. Passing up the error number > > + * ensures that the user will see an error > > + * message saying OProfile did not start. > > + * Dmesg will contain an accurate message > > + * about the failure. > > + */ > > ENOSYS looks wrong though. It would appear to the user as if the oprofile > function in the kernel was not present. I'd suggest EIO, and not use > an extra define for that. > OK, will do. 
> > > static int > > rtas_ibm_cbe_perftools(int subfunc, int passthru, > > void *address, unsigned long length) > > { > > u64 paddr = __pa(address); > > > > - return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru, > > - paddr >> 32, paddr & 0xffffffff, length); > > + pm_rtas_token = rtas_token("ibm,cbe-perftools"); > > + > > + if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) { > > + printk(KERN_ERR > > + "%s: rtas token ibm,cbe-perftools unknown\n", > > + __FUNCTION__); > > + return -EFWCALL; > > + } else { > > + > > + return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, > > + passthru, paddr >> 32, paddr & 0xffffffff, length); > > + } > > } > > Are you now reading the rtas token every time you call rtas? that seems > like a waste of time. There are actually two RTAS calls, i.e. two tokens: one for setting up the debug bus; the other to do the SPU PC collection. Yes, we are getting the token each time using the single global pm_rtas_token. To make sure we had the correct token, I made sure to call it each time. As you point out it is very wasteful. It probably would be best to just have a second global variable, say, spu_rtas_token. Then do a single call for each global variable. Then we just use the global variable in the appropriate rtas_call. This would eliminate a significant number of calls to look up the token. I should have thought of that earlier. 
> > > > +#define size 24 > > +#define ENTRIES (0x1<<8) /* 256 */ > > +#define MAXLFSR 0xFFFFFF > > + > > +int initial_lfsr[] = > > +{16777215, 3797240, 13519805, 11602690, 6497030, 7614675, 2328937, 2889445, > > + 12364575, 8723156, 2450594, 16280864, 14742496, 10904589, 6434212, 4996256, > > + 5814270, 13014041, 9825245, 410260, 904096, 15151047, 15487695, 3061843, > > + 16482682, 7938572, 4893279, 9390321, 4320879, 5686402, 1711063, 10176714, > > + 4512270, 1057359, 16700434, 5731602, 2070114, 16030890, 1208230, 15603106, > > + 11857845, 6470172, 1362790, 7316876, 8534496, 1629197, 10003072, 1714539, > > + 1814669, 7106700, 5427154, 3395151, 3683327, 12950450, 16620273, 12122372, > > + 7194999, 9952750, 3608260, 13604295, 2266835, 14943567, 7079230, 777380, > > + 4516801, 1737661, 8730333, 13796927, 3247181, 9950017, 3481896, 16527555, > > + 13116123, 14505033, 9781119, 4860212, 7403253, 13264219, 12269980, 100120, > > + 664506, 607795, 8274553, 13133688, 6215305, 13208866, 16439693, 3320753, > > + 8773582, 13874619, 1784784, 4513501, 11002978, 9318515, 3038856, 14254582, > > + 15484958, 15967857, 13504461, 13657322, 14724513, 13955736, 5695315, 7330509, > > + 12630101, 6826854, 439712, 4609055, 13288878, 1309632, 4996398, 11392266, > > + 793740, 7653789, 2472670, 14641200, 5164364, 5482529, 10415855, 1629108, > > + 2012376, 13661123, 14655718, 9534083, 16637925, 2537745, 9787923, 12750103, > > + 4660370, 3283461, 14862772, 7034955, 6679872, 8918232, 6506913, 103649, > > + 6085577, 13324033, 14251613, 11058220, 11998181, 3100233, 468898, 7104918, > > + 12498413, 14408165, 1208514, 15712321, 3088687, 14778333, 3632503, 11151952, > > + 98896, 9159367, 8866146, 4780737, 4925758, 12362320, 4122783, 8543358, > > + 7056879, 10876914, 6282881, 1686625, 5100373, 4573666, 9265515, 13593840, > > + 5853060, 1188880, 4237111, 15765555, 14344137, 4608332, 6590210, 13745050, > > + 10916568, 12340402, 7145275, 4417153, 2300360, 12079643, 7608534, 15238251, > > + 4947424, 
7014722, 3984546, 7168073, 10759589, 16293080, 3757181, 4577717, > > + 5163790, 2488841, 4650617, 3650022, 5440654, 1814617, 6939232, 15540909, > > + 501788, 1060986, 5058235, 5078222, 3734500, 10762065, 390862, 5172712, > > + 1070780, 7904429, 1669757, 3439997, 2956788, 14944927, 12496638, 994152, > > + 8901173, 11827497, 4268056, 15725859, 1694506, 5451950, 2892428, 1434298, > > + 9048323, 13558747, 15083840, 8154495, 15830901, 391127, 14970070, 2451434, > > + 2080347, 10775644, 14599429, 12540753, 4813943, 16140655, 2421772, 12724304, > > + 12935733, 7206473, 5697333, 10328104, 2418008, 13547986, 284246, 1732363, > > + 16375319, 8109554, 16372365, 14346072, 1835890, 13059499, 2442500, 4110674}; > > + > > +/* > > + * The hardware uses an LFSR counting sequence to determine when to capture > > + * the SPU PCs. The SPU PC capture is done when the LFSR sequence reaches the > > + * last value in the sequence. An LFSR sequence is like a puesdo random > > + * number sequence where each number occurs once in the sequence but the > > + * sequence is not in numerical order. To reduce the calculation time, a > > + * sequence of 256 precomputed values in the LFSR sequence are stored in a > > + * table. The nearest precomputed value is used as the initial point from > > + * which to caculate the desired LFSR value that is n from the end of the > > + * sequence. The lookup table reduces the maximum number of iterations in > > + * the loop from 2^24 to 2^16. 
> > + */ > > +static int calculate_lfsr(int n) > > +{ > > + int i; > > + > > + int start_lfsr_index; > > + unsigned int newlfsr0; > > + unsigned int lfsr = MAXLFSR; > > + unsigned int binsize = (MAXLFSR+1)/ENTRIES; > > + unsigned int howmany; > > + > > + start_lfsr_index = (MAXLFSR - n) / binsize; > > + lfsr = initial_lfsr[start_lfsr_index]; > > + howmany = (MAXLFSR - n) - (start_lfsr_index * (binsize)); > > + > > + for (i = 2; i < howmany+2; i++) { > > + newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^ > > + ((lfsr >> (size - 1 - 1)) & 1) ^ > > + (((lfsr >> (size - 1 - 6)) & 1) ^ > > + ((lfsr >> (size - 1 - 23)) & 1))); > > + > > + lfsr >>= 1; > > + lfsr = lfsr | (newlfsr0 << (size - 1)); > > + } > > + return lfsr; > > +} > > I agree with Milton that it would be far nicer even to calculate > the value from user space, but since you say that would > violate the oprofile interface conventions, let's not go there. > In order to make this code nicer on the user, you should probably > insert a 'cond_resched()' somewhere in the loop, maybe every > 500 iterations or so. > > it also looks like there is whitespace damage in the code here. I will double check on the whitespace damage. I thought I had gotten all that out. I have done some quick measurements. The above method limits the loop to at most 2^16 iterations. Based on running the algorithm in user space, it takes about 3ms of computation time to do the loop 2^16 times. At the vary least, we need to put the resched in say every 10,000 iterations which would be about every 0.5ms. Should we do a resched more often? Additionally we could up the size of the table to 512 which would reduce the maximum time to about 1.5ms. What do people think about increasing the table size? A little more general discussion about the logarithmic algorithm and limiting the range. The hardware supports a 24 bit LFSR value. This means the user can say is capture a sample every N cycles, where N is in the range of 1 to 2^24. 
The OProfile user tool enforces a minimum value of N to make sure the overhead of OProfile doesn't bring the machine to its knees. The minimum value is not intended to guarantee the performance impact of OProfile will not be significant. It is left as an exercise for the user to pick an N that will give minimal performance impact. We set the lower limit for N for SPU profiling to 100,000. This is actually high enough that we don't seem to see much performance impact when running OProfile. If the user picked N=2^24 then for a 3.2GHz machine you would get about 200 samples per second on each node, where a sample consists of the PC value for all 8 SPUs on the node. If the user wanted to do a relatively long OProfile run, I can see where they might use N=2^24 to avoid gathering too much data. My gut feeling is that the sampling frequency for N=2^24 is not low enough that someone would never want to use it when doing long runs. Hence, we should not arbitrarily reduce the maximum value for N. Although I would expect that the typical value for N will be in the range of several hundred thousand to a few million. As for using a logarithmic spacing of the precomputed values, this approach means that the space between the precomputed values at the high end would be much larger than 2^14, assuming 256 precomputed values. That means it could take much longer than 3ms to get the needed LFSR value for a large N. By evenly spacing the precomputed values, we can ensure that for all N it will take less than 3ms to get the value. Personally, I am more comfortable with a hard limit on the compute time than a variable time that could get much bigger than the 1ms threshold that Arnd wants for resched. Any thoughts? > > > + > > +/* This interface allows a profiler (e.g., OProfile) to store > > + * spu_context information needed for profiling, allowing it to > > + * be saved across context save/restore operation. 
> > + * > > + * Assumes the caller has already incremented the ref count to > > + * profile_info; then spu_context_destroy must call kref_put > > + * on prof_info_kref. > > + */ > > +void spu_set_profile_private(struct spu_context * ctx, void * profile_info, > > + struct kref * prof_info_kref, > > + void (* prof_info_release) (struct kref * kref)) > > +{ > > + ctx->profile_private = profile_info; > > + ctx->prof_priv_kref = prof_info_kref; > > + ctx->prof_priv_release = prof_info_release; > > +} > > +EXPORT_SYMBOL_GPL(spu_set_profile_private); > > I think you don't need the profile_private member here, if you just use > container_of with ctx->prof_priv_kref in all users. > > Arnd <>< |
From: Arnd B. <ar...@ar...> - 2007-02-15 21:04:12
|
On Thursday 15 February 2007 21:21, Carl Love wrote: > I have done some quick measurements. The above method limits the loop > to at most 2^16 iterations. Based on running the algorithm in user > space, it takes about 3ms of computation time to do the loop 2^16 times. > > At the vary least, we need to put the resched in say every 10,000 > iterations which would be about every 0.5ms. Should we do a resched > more often? Yes, just to be on the safe side, I'd suggest to do it every 1000 iterations. > Additionally we could up the size of the table to 512 which would reduce > the maximum time to about 1.5ms. What do people think about increasing > the table size? No, that won't help too much. I'd say 256 or 128 entries is the most we should have. > As for using a logarithmic spacing of the precomputed values, this > approach means that the space between the precomputed values at the high > end would be much larger then 2^14, assuming 256 precomputed values. > That means it could take much longer then 3ms to get the needed LFSR > value for a large N. By evenly spacing the precomputed values, we can > ensure that for all N it will take less then 3ms to get the value. > Personally, I am more comfortable with a hard limit on the compute time > then a variable time that could get much bigger then the 1ms threshold > that Arnd wants for resched. Any thoughts? When using precomputed values on a logarithmic scale, I'd recommend just rounding to the closest value and accepting the relative inaccuracy, instead of using the precomputed value as the base and then calculating from there. Arnd <>< |
From: Paul E. M. <pa...@li...> - 2007-02-16 04:21:03
|
On Thu, Feb 15, 2007 at 12:21:58PM -0800, Carl Love wrote: > On Thu, 2007-02-15 at 15:37 +0100, Arnd Bergmann wrote: [ . . . ] > > I agree with Milton that it would be far nicer even to calculate > > the value from user space, but since you say that would > > violate the oprofile interface conventions, let's not go there. > > In order to make this code nicer on the user, you should probably > > insert a 'cond_resched()' somewhere in the loop, maybe every > > 500 iterations or so. > > > > it also looks like there is whitespace damage in the code here. > > I will double check on the whitespace damage. I thought I had gotten > all that out. > > I have done some quick measurements. The above method limits the loop > to at most 2^16 iterations. Based on running the algorithm in user > space, it takes about 3ms of computation time to do the loop 2^16 times. > > At the vary least, we need to put the resched in say every 10,000 > iterations which would be about every 0.5ms. Should we do a resched > more often? > > Additionally we could up the size of the table to 512 which would reduce > the maximum time to about 1.5ms. What do people think about increasing > the table size? Is this 1.5ms with interrupts disabled? This time period is problematic from a realtime perspective if so -- need to be able to preempt. Thanx, Paul > A little more general discussion about the logarithmic algorithm and > limiting the range. The hardware supports a 24 bit LFSR value. This > means the user can say is capture a sample every N cycles, where N is in > the range of 1 to 2^24. The OProfile user tool enforces a minimum value > of N to make sure the overhead of OProfile doesn't bring the machine to > its knees. The minimum values is not intended to guarantee the > performance impact of OProfile will not be significant. It is left as > an exercise for the user to pick an N that will give minimal performance > impact. We set the lower limit for N for SPU profiling to 100,000. 
This > is actually high enough that we don't seem to see much performance > impact when running OProfile. If the user picked N=2^24 then for a > 3.2GHz machine you would get about 200 samples per second on each node. > Where a sample consists of the PC value for all 8 SPUs on the node. If > the user wanted to do a relatively long OProfile run, I can see where > they might use N=2^24 to avoid gathering too much data. My gut feeling > is that the sampling frequency for N=2^24 is not low enough that someone > would never want to use it when doing long runs. Hence, we should not > arbitrarily reduce the maximum value for N. Although I would expect > that the typical value for N will be in the range of several hundred > thousand to a few million. > > As for using a logarithmic spacing of the precomputed values, this > approach means that the space between the precomputed values at the high > end would be much larger then 2^14, assuming 256 precomputed values. > That means it could take much longer then 3ms to get the needed LFSR > value for a large N. By evenly spacing the precomputed values, we can > ensure that for all N it will take less then 3ms to get the value. > Personally, I am more comfortable with a hard limit on the compute time > then a variable time that could get much bigger then the 1ms threshold > that Arnd wants for resched. Any thoughts? > > > > > > + > > > +/* This interface allows a profiler (e.g., OProfile) to store > > > + * spu_context information needed for profiling, allowing it to > > > + * be saved across context save/restore operation. > > > + * > > > + * Assumes the caller has already incremented the ref count to > > > + * profile_info; then spu_context_destroy must call kref_put > > > + * on prof_info_kref. 
> > > + */ > > > +void spu_set_profile_private(struct spu_context * ctx, void * profile_info, > > > + struct kref * prof_info_kref, > > > + void (* prof_info_release) (struct kref * kref)) > > > +{ > > > + ctx->profile_private = profile_info; > > > + ctx->prof_priv_kref = prof_info_kref; > > > + ctx->prof_priv_release = prof_info_release; > > > +} > > > +EXPORT_SYMBOL_GPL(spu_set_profile_private); > > > > I think you don't need the profile_private member here, if you just use > > container_of with ctx->prof_priv_kref in all users. > > > > Arnd <>< > > _______________________________________________ > cbe-oss-dev mailing list > cbe...@oz... > https://ozlabs.org/mailman/listinfo/cbe-oss-dev |
From: Arnd B. <ar...@ar...> - 2007-02-16 00:33:41
|
On Thursday 15 February 2007 22:50, Paul E. McKenney wrote: > Is this 1.5ms with interrupts disabled? This time period is problematic > from a realtime perspective if so -- need to be able to preempt. No, interrupts should be enabled here. Still, 1.5ms is probably a little too long without a cond_resched() in case kernel preemption is disabled. Arnd <>< |
From: Maynard J. <may...@us...> - 2007-02-16 00:33:04
|
Arnd Bergmann wrote: >On Thursday 15 February 2007 00:52, Carl Love wrote: > > > > >>--- linux-2.6.20-rc1.orig/arch/powerpc/oprofile/Kconfig 2007-01-18 16:43:14.000000000 -0600 >>+++ linux-2.6.20-rc1/arch/powerpc/oprofile/Kconfig 2007-02-13 19:04:46.271028904 -0600 >>@@ -7,7 +7,8 @@ >> >> config OPROFILE >> tristate "OProfile system profiling (EXPERIMENTAL)" >>- depends on PROFILING >>+ default m >>+ depends on SPU_FS && PROFILING >> help >> OProfile is a profiling system capable of profiling the >> whole system, include the kernel, kernel modules, libraries, >> >> > >Milton already commented on this being wrong. I think what you want >is > depends on PROFILING && (SPU_FS = n || SPU_FS) > >that should make sure that when SPU_FS=y that OPROFILE can not be 'm'. > > The above suggestion would not work if SPU_FS is not defined, since the entire config option is ignored if an undefined symbol is used. So, here's what I propose instead: - Leave the existing 'config OPROFILE' unchanged from its current form in mainline (shown below) - Add the new 'config OPROFILE_CELL' (shown below) - In arch/powerpc/configs/cell-defconfig, set CONFIG_OPROFILE=m, to correspond to setting for CONFIG_SPU_FS - In arch/powerpc/oprofile/Makefile, do the following: oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \ cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o =========== config OPROFILE tristate "OProfile system profiling (EXPERIMENTAL)" depends on PROFILING help OProfile is a profiling system capable of profiling the whole system, include the kernel, kernel modules, libraries, and applications. If unsure, say N. config OPROFILE_CELL bool "OProfile for Cell Broadband Engine" depends on OPROFILE && SPU_FS default y if ((SPU_FS = y && OPROFILE = y) || (SPU_FS = m && OPROFILE = m)) help Profiling of Cell BE SPUs requires special support enabled by this option. Both SPU_FS and OPROFILE options must be set 'y' or both be set 'm'. 
============= Can anyone see a problem with any of this . . . or perhaps a suggestion of a better way? Thanks. -Maynard |
From: Arnd B. <ar...@ar...> - 2007-02-16 17:15:16
|
On Friday 16 February 2007 01:32, Maynard Johnson wrote: > config OPROFILE_CELL > bool "OProfile for Cell Broadband Engine" > depends on OPROFILE && SPU_FS > default y if ((SPU_FS = y && OPROFILE = y) || (SPU_FS = m && > OPROFILE = m)) > help > Profiling of Cell BE SPUs requires special support enabled > by this option. Both SPU_FS and OPROFILE options must be > set 'y' or both be set 'm'. > ============= > > Can anyone see a problem with any of this . . . or perhaps a suggestion > of a better way? The text suggests it doesn't allow SPU_FS=y with OPROFILE=m, which I think should be allowed. I also don't see any place in the code where you actually use CONFIG_OPROFILE_CELL. Ideally, you should be able to have an oprofile_spu module that can be loaded after spufs.ko and oprofile.ko. In that case you only need config OPROFILE_SPU depends on OPROFILE && SPU_FS default y and it will automatically build oprofile_spu as a module if one of the two is a module and won't build it if one of them is disabled. Arnd <>< |
From: Maynard J. <may...@us...> - 2007-02-16 21:44:03
|
Arnd Bergmann wrote: > On Friday 16 February 2007 01:32, Maynard Johnson wrote: > >>config OPROFILE_CELL >> bool "OProfile for Cell Broadband Engine" >> depends on OPROFILE && SPU_FS >> default y if ((SPU_FS = y && OPROFILE = y) || (SPU_FS = m && >>OPROFILE = m)) >> help >> Profiling of Cell BE SPUs requires special support enabled >> by this option. Both SPU_FS and OPROFILE options must be >> set 'y' or both be set 'm'. >>============= >> >>Can anyone see a problem with any of this . . . or perhaps a suggestion >>of a better way? > > > The text suggests it doesn't allow SPU_FS=y with OPROFILE=m, which I think > should be allowed. Right, good catch. I'll add another OR to the 'default y' and correct the text. > I also don't see any place in the code where you actually > use CONFIG_OPROFILE_CELL. As I mentioned, I will use CONFIG_OPROFILE_CELL in the arch/powerpc/oprofile/Makefile as follows: oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \ cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o > > Ideally, you should be able to have an oprofile_spu module that can be > loaded after spufs.ko and oprofile.ko. In that case you only need > > config OPROFILE_SPU > depends on OPROFILE && SPU_FS > default y > > and it will automatically build oprofile_spu as a module if one of the two > is a module and won't build it if one of them is disabled. Hmmm . . . I guess that would entail splitting out the SPU-related stuff from op_model_cell.c into a new file. Maybe more -- that's just what comes to mind right now. Could be very tricky, and I wonder if it's worth the bother. > > Arnd <>< |
From: Maynard J. <may...@us...> - 2007-02-18 23:21:43
|
Maynard Johnson wrote: >Arnd Bergmann wrote: > > > >>On Friday 16 February 2007 01:32, Maynard Johnson wrote: >> >> >> >>>config OPROFILE_CELL >>> bool "OProfile for Cell Broadband Engine" >>> depends on OPROFILE && SPU_FS >>> default y if ((SPU_FS = y && OPROFILE = y) || (SPU_FS = m && >>>OPROFILE = m)) >>> help >>> Profiling of Cell BE SPUs requires special support enabled >>> by this option. Both SPU_FS and OPROFILE options must be >>> set 'y' or both be set 'm'. >>>============= >>> >>>Can anyone see a problem with any of this . . . or perhaps a suggestion >>>of a better way? >>> >>> >>The text suggests it doesn't allow SPU_FS=y with OPROFILE=m, which I think >>should be allowed. >> >> >Right, good catch. I'll add another OR to the 'default y' and correct >the text. > > Actually, it makes more sense to do the following: config OPROFILE_CELL bool "OProfile for Cell Broadband Engine" depends on (SPU_FS = y && OPROFILE = m) || (SPU_FS = y && OPROFILE = y) || (SPU_FS = m && OPROFILE = m) default y help Profiling of Cell BE SPUs requires special support enabled by this option. > > I also don't see any place in the code where you actually > > >>use CONFIG_OPROFILE_CELL. >> >> >As I mentioned, I will use CONFIG_OPROFILE_CELL in the >arch/powerpc/oprofile/Makefile as follows: > oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \ > cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o > > > > [snip] >> Arnd <>< >> >> > > >_______________________________________________ >Linuxppc-dev mailing list >Lin...@oz... >https://ozlabs.org/mailman/listinfo/linuxppc-dev > > |
From: Carl L. <ce...@us...> - 2007-02-22 00:18:17
|
Reposting to oprofile list. Original posting had an error in the oprofile mail list address. This is the third update to the patch previously posted by Maynard Johnson as "PATCH 4/4. Add support to OProfile for profiling CELL". This posting has the following changes: - Added lib support but it is untested. Waiting on a test case. - LFSR calculation is completely table driven - Detecting overlay switches and discarding samples collected when the overlay occurred. - Fixed the Kconfig file. - RTAS token call and returning error value reworked - Added lock around the samples array access. - SPU overlay support validated (there wasn't a bug) - Misc changes per other minor review comments The following are still outstanding issues: - Samples from dynamic code on the stack (stubs) are still being silently dropped. Still plan on putting them into anonymous bucket. - Working on draining samples when context switch occurs. - File renaming and refactoring suggestions have not been done. There are still ongoing discussions about this. - Moving file offset code from kernel to user space has not been done. This is still being discussed. Would really like to have Anton Blanchard or similar person look over the non CELL specific OProfile code changes. Subject: Add support to OProfile for profiling Cell BE SPUs From: Maynard Johnson <may...@us...> This patch updates the existing arch/powerpc/oprofile/op_model_cell.c to add in the SPU profiling capabilities. In addition, a 'cell' subdirectory was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling code. 
Signed-off-by: Carl Love <ca...@us...> Signed-off-by: Maynard Johnson <mp...@us...> Index: linux-2.6.20/arch/powerpc/configs/cell_defconfig =================================================================== --- linux-2.6.20.orig/arch/powerpc/configs/cell_defconfig 2007-02-20 13:49:02.021236368 -0600 +++ linux-2.6.20/arch/powerpc/configs/cell_defconfig 2007-02-20 13:49:52.760242968 -0600 @@ -1415,7 +1415,7 @@ # Instrumentation Support # CONFIG_PROFILING=y -CONFIG_OPROFILE=y +CONFIG_OPROFILE=m # CONFIG_KPROBES is not set # Index: linux-2.6.20/arch/powerpc/oprofile/cell/pr_util.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.20/arch/powerpc/oprofile/cell/pr_util.h 2007-02-21 17:28:54.609263688 -0600 @@ -0,0 +1,88 @@ + /* + * Cell Broadband Engine OProfile Support + * + * (C) Copyright IBM Corporation 2006 + * + * Author: Maynard Johnson <may...@us...> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef PR_UTIL_H +#define PR_UTIL_H + +#include <linux/cpumask.h> +#include <linux/oprofile.h> +#include <asm/cell-pmu.h> +#include <asm/spu.h> + +static inline int number_of_online_nodes(void) +{ + u32 cpu; u32 tmp; + int nodes = 0; + for_each_online_cpu(cpu) { + tmp = cbe_cpu_to_node(cpu) + 1; + if (tmp > nodes) + nodes++; + } + return nodes; +} + +/* Defines used for sync_start */ +#define SKIP_GENERIC_SYNC 0 +#define SYNC_START_ERROR -1 +#define DO_GENERIC_SYNC 1 + +struct spu_overlay_info +{ + unsigned int vma; + unsigned int size; + unsigned int offset; + unsigned int buf; +}; + +struct vma_to_fileoffset_map +{ + struct vma_to_fileoffset_map *next; + unsigned int vma; + unsigned int size; + unsigned int offset; + unsigned int guard_ptr; + unsigned int guard_val; +}; + +/* The three functions below are for maintaining and accessing + * the vma-to-fileoffset map. + */ +struct vma_to_fileoffset_map * create_vma_map(const struct spu * spu, + u64 objectid); +unsigned int vma_map_lookup(struct vma_to_fileoffset_map *map, + unsigned int vma, const struct spu * aSpu, + int * grd_val); +void vma_map_free(struct vma_to_fileoffset_map *map); + +/* + * Entry point for SPU profiling. + * cycles_reset is the SPU_CYCLES count value specified by the user. + */ +void start_spu_profiling(unsigned int cycles_reset); + +void stop_spu_profiling(void); + + +/* add the necessary profiling hooks */ +int spu_sync_start(void); + +/* remove the hooks */ +int spu_sync_stop(void); + +/* Record SPU program counter samples to the oprofile event buffer. 
*/ +void spu_sync_buffer(int spu_num, unsigned int * samples, + int num_samples); + +void set_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset); + +#endif // PR_UTIL_H Index: linux-2.6.20/arch/powerpc/oprofile/cell/spu_profiler.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.20/arch/powerpc/oprofile/cell/spu_profiler.c 2007-02-21 17:28:54.610263536 -0600 @@ -0,0 +1,220 @@ +/* + * Cell Broadband Engine OProfile Support + * + * (C) Copyright IBM Corporation 2006 + * + * Authors: Maynard Johnson <may...@us...> + * Carl Love <ca...@us...> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/hrtimer.h> +#include <linux/smp.h> +#include <linux/slab.h> +#include <asm/cell-pmu.h> +#include <asm/time.h> +#include "pr_util.h" + +#define TRACE_ARRAY_SIZE 1024 +#define SCALE_SHIFT 14 + +static u32 * samples; + +static int spu_prof_running = 0; +static unsigned int profiling_interval = 0; + +extern int spu_prof_num_nodes; + + +#define NUM_SPU_BITS_TRBUF 16 +#define SPUS_PER_TB_ENTRY 4 +#define SPUS_PER_NODE 8 + +#define SPU_PC_MASK 0xFFFF + +static spinlock_t sample_array_lock=SPIN_LOCK_UNLOCKED; +unsigned long sample_array_lock_flags; + +void set_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset) +{ + unsigned long nsPerCyc; + if (!freq_khz) + freq_khz = ppc_proc_freq/1000; + + /* To calculate a timeout in nanoseconds, the basic + * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency). + * To avoid floating point math, we use the scale math + * technique as described in linux/jiffies.h. 
We use + * a scale factor of SCALE_SHIFT,which provides 4 decimal places + * of precision, which is close enough for the purpose at hand. + * + * The value of the timeout should be small enough that the hw + * trace buffer will not get more then a bout 1/3 full for the + * maximum user specified (the LFSR value) hw sampling frequency. + * This is to ensure the trace buffer will never fill even if the + * kernel thread scheduling varies under a heavey system load. + */ + + nsPerCyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz; + profiling_interval = (nsPerCyc * cycles_reset) >> SCALE_SHIFT; + +} + +/* + * Extract SPU PC from trace buffer entry + */ +static void spu_pc_extract(int cpu, int entry) +{ + /* the trace buffer is 128 bits */ + u64 trace_buffer[2]; + u64 spu_mask; + int spu; + + spu_mask = SPU_PC_MASK; + + /* Each SPU PC is 16 bits; hence, four spus in each of + * the two 64-bit buffer entries that make up the + * 128-bit trace_buffer entry. Process two 64-bit values + * simultaneously. 
+ * trace[0] SPU PC contents are: 0 1 2 3 + * trace[1] SPU PC contents are: 4 5 6 7 + */ + + cbe_read_trace_buffer(cpu, trace_buffer); + + for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) { + /* spu PC trace entry is upper 16 bits of the + * 18 bit SPU program counter + */ + samples[spu * TRACE_ARRAY_SIZE + entry] + = (spu_mask & trace_buffer[0]) << 2; + samples[(spu + SPUS_PER_TB_ENTRY) * TRACE_ARRAY_SIZE + entry] + = (spu_mask & trace_buffer[1]) << 2; + + trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF; + trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF; + } +} + +static int cell_spu_pc_collection(int cpu) +{ + u32 trace_addr; + int entry; + + /* process the collected SPU PC for the node */ + + entry = 0; + + trace_addr = cbe_read_pm(cpu, trace_address); + while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY)) + { + /* there is data in the trace buffer to process */ + spu_pc_extract(cpu, entry); + + entry++; + + if (entry >= TRACE_ARRAY_SIZE) + /* spu_samples is full */ + break; + + trace_addr = cbe_read_pm(cpu, trace_address); + } + + return(entry); +} + + +static int profile_spus(struct hrtimer * timer) +{ + ktime_t kt; + int cpu, node, k, num_samples, spu_num; + + if (!spu_prof_running) + goto stop; + + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + node = cbe_cpu_to_node(cpu); + + /* There should only be on kernel thread at a time processing + * the samples. In the very unlikely case that the processing + * is taking a very long time and multiple kernel threads are + * started to process the samples. Make sure only one kernel + * thread is working on the samples array at a time. The + * sample array must be loaded and then processed for a given + * cpu. The sample array is not per cpu. 
+ */ + spin_lock_irqsave(&sample_array_lock, + sample_array_lock_flags); + num_samples = cell_spu_pc_collection(cpu); + + if (num_samples == 0) { + spin_unlock_irqrestore(&sample_array_lock, + sample_array_lock_flags); + continue; + } + + for (k = 0; k < SPUS_PER_NODE; k++) { + spu_num = k + (node * SPUS_PER_NODE); + spu_sync_buffer(spu_num, + samples + (k * TRACE_ARRAY_SIZE), + num_samples); + } + + spin_unlock_irqrestore(&sample_array_lock, + sample_array_lock_flags); + + } + smp_wmb(); + + kt = ktime_set(0, profiling_interval); + if (!spu_prof_running) + goto stop; + hrtimer_forward(timer, timer->base->get_time(), kt); + return HRTIMER_RESTART; + + stop: + printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n"); + return HRTIMER_NORESTART; +} + +static struct hrtimer timer; +/* + * Entry point for SPU profiling. + * NOTE: SPU profiling is done system-wide, not per-CPU. + * + * cycles_reset is the count value specified by the user when + * setting up OProfile to count SPU_CYCLES. + */ +void start_spu_profiling(unsigned int cycles_reset) { + + ktime_t kt; + + pr_debug("timer resolution: %lu\n", + TICK_NSEC); + kt = ktime_set(0, profiling_interval); + hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_REL); + timer.expires = kt; + timer.function = profile_spus; + + /* Allocate arrays for collecting SPU PC samples */ + samples = (u32 *) kzalloc(SPUS_PER_NODE * + TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL); + + spu_prof_running = 1; + hrtimer_start(&timer, kt, HRTIMER_REL); +} + +void stop_spu_profiling(void) +{ + spu_prof_running = 0; + hrtimer_cancel(&timer); + kfree(samples); + pr_debug("SPU_PROF: stop_spu_profiling issued\n"); +} Index: linux-2.6.20/arch/powerpc/oprofile/cell/spu_task_sync.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.20/arch/powerpc/oprofile/cell/spu_task_sync.c 2007-02-21 17:28:54.610263536 -0600 @@ -0,0 +1,487 @@ +/* + * Cell Broadband Engine OProfile Support + * 
+ * (C) Copyright IBM Corporation 2006 + * + * Author: Maynard Johnson <may...@us...> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* The purpose of this file is to handle SPU event task switching + * and to record SPU context information into the OProfile + * event buffer. + * + * Additionally, the spu_sync_buffer function is provided as a helper + * for recoding actual SPU program counter samples to the event buffer. + */ +#include <linux/dcookies.h> +#include <linux/kref.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/numa.h> +#include <linux/oprofile.h> +#include <linux/spinlock.h> +#include "pr_util.h" + +#define RELEASE_ALL 9999 + +static spinlock_t buffer_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED; +static int num_spu_nodes; +int spu_prof_num_nodes; +int last_guard_val[MAX_NUMNODES * 8]; + +/* Container for caching information about an active SPU task. */ +struct cached_info { + struct vma_to_fileoffset_map * map; + struct spu * the_spu; /* needed to access pointer to local_store */ + struct kref cache_ref; +}; + +static struct cached_info * spu_info[MAX_NUMNODES * 8]; + +static void destroy_cached_info(struct kref * kref) +{ + struct cached_info * info; + info = container_of(kref, struct cached_info, cache_ref); + vma_map_free(info->map); + kfree(info); + module_put(THIS_MODULE); +} + +/* Return the cached_info for the passed SPU number. + * ATTENTION: Callers are responsible for obtaining the + * cache_lock if needed prior to invoking this function. 
+ */ +static struct cached_info * get_cached_info(struct spu * the_spu, int spu_num) +{ + struct kref * ref; + struct cached_info * ret_info = NULL; + if (spu_num >= num_spu_nodes) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: Invalid index %d into spu info cache\n", + __FUNCTION__, __LINE__, spu_num); + goto out; + } + if (!spu_info[spu_num] && the_spu) { + ref = spu_get_profile_private_kref(the_spu->ctx); + if (ref) { + spu_info[spu_num] = container_of(ref, struct cached_info, cache_ref); + kref_get(&spu_info[spu_num]->cache_ref); + } + } + + ret_info = spu_info[spu_num]; + out: + return ret_info; +} + + +/* Looks for cached info for the passed spu. If not found, the + * cached info is created for the passed spu. + * Returns 0 for success; otherwise, -1 for error. + */ +static int +prepare_cached_spu_info(struct spu * spu, unsigned int objectId) +{ + unsigned long flags = 0; + struct vma_to_fileoffset_map * new_map; + int retval = 0; + struct cached_info * info; + + /* We won't bother getting cache_lock here since + * don't do anything with the cached_info that's returned. + */ + info = get_cached_info(spu, spu->number); + + if (info) { + pr_debug("Found cached SPU info.\n"); + goto out; + } + + /* Create cached_info and set spu_info[spu->number] to point to it. + * spu->number is a system-wide value, not a per-node value. + */ + info = kzalloc(sizeof(struct cached_info), GFP_KERNEL); + if (!info) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: create vma_map failed\n", + __FUNCTION__, __LINE__); + goto err_alloc; + } + new_map = create_vma_map(spu, objectId); + if (!new_map) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: create vma_map failed\n", + __FUNCTION__, __LINE__); + goto err_alloc; + } + + pr_debug("Created vma_map\n"); + info->map = new_map; + info->the_spu = spu; + kref_init(&info->cache_ref); + spin_lock_irqsave(&cache_lock, flags); + spu_info[spu->number] = info; + /* Increment count before passing off ref to SPUFS. 
*/ + kref_get(&info->cache_ref); + + /* We increment the module refcount here since SPUFS is + * responsible for the final destruction of the cached_info, + * and it must be able to access the destroy_cached_info() + * function defined in the OProfile module. We decrement + * the module refcount in destroy_cached_info. + */ + try_module_get(THIS_MODULE); + spu_set_profile_private_kref(spu->ctx, &info->cache_ref, + destroy_cached_info); + spin_unlock_irqrestore(&cache_lock, flags); + goto out; + +err_alloc: + retval = -1; +out: + return retval; +} + +/* + * NOTE: The caller is responsible for locking the + * cache_lock prior to calling this function. + */ +static int release_cached_info(int spu_index) +{ + int index, end; + if (spu_index == RELEASE_ALL) { + end = num_spu_nodes; + index = 0; + } else { + if (spu_index >= num_spu_nodes) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: " + "Invalid index %d into spu info cache\n", + __FUNCTION__, __LINE__, spu_index); + goto out; + } + end = spu_index +1; + index = spu_index; + } + for (; index < end; index++) { + if (spu_info[index]) { + kref_put(&spu_info[index]->cache_ref, + destroy_cached_info); + spu_info[index] = NULL; + } + } + +out: + return 0; +} + +/* The source code for fast_get_dcookie was "borrowed" + * from drivers/oprofile/buffer_sync.c. + */ + +/* Optimisation. We can manage without taking the dcookie sem + * because we cannot reach this code without at least one + * dcookie user still being registered (namely, the reader + * of the event buffer). + */ +static inline unsigned long fast_get_dcookie(struct dentry * dentry, + struct vfsmount * vfsmnt) +{ + unsigned long cookie; + + if (dentry->d_cookie) + return (unsigned long)dentry; + get_dcookie(dentry, vfsmnt, &cookie); + return cookie; +} + +/* Look up the dcookie for the task's first VM_EXECUTABLE mapping, + * which corresponds loosely to "application name". Also, determine + * the offset for the SPU ELF object. 
If computed offset is + * non-zero, it implies an embedded SPU object; otherwise, it's a + * separate SPU binary, in which case we retrieve it's dcookie. + * For the embedded case, we must determine if SPU ELF is embedded + * in the executable application or another file (i.e., shared lib). + * If embedded in a shared lib, we must get the dcookie and return + * that to the caller. + */ +static unsigned long +get_exec_dcookie_and_offset(struct spu * spu, unsigned int * offsetp, + unsigned long * spu_bin_dcookie, + unsigned long * shlib_dcookie, + unsigned int spu_ref) +{ + unsigned long app_cookie = 0; + unsigned long * image_cookie = NULL; + unsigned int my_offset = 0; + struct file * app = NULL; + struct vm_area_struct * vma; + struct mm_struct * mm = spu->mm; + + if (!mm) + goto out; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (!(vma->vm_flags & VM_EXECUTABLE)) + continue; + app_cookie = fast_get_dcookie(vma->vm_file->f_dentry, + vma->vm_file->f_vfsmnt); + pr_debug("got dcookie for %s\n", + vma->vm_file->f_dentry->d_name.name); + app = vma->vm_file; + break; + } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_start > spu_ref || vma->vm_end <= spu_ref) + continue; + my_offset = spu_ref - vma->vm_start; + if (!vma->vm_file) + goto fail_no_image_cookie; + + pr_debug("Found spu ELF at %X(object-id:%X) for file %s \n", + my_offset, spu_ref, + vma->vm_file->f_dentry->d_name.name); + *offsetp = my_offset; + if (my_offset == 0) + image_cookie = spu_bin_dcookie; + else if (vma->vm_file != app) + image_cookie = shlib_dcookie; + break; + } + + if (image_cookie) { + *image_cookie = fast_get_dcookie(vma->vm_file->f_dentry, + vma->vm_file- >f_vfsmnt); + pr_debug("got dcookie for %s\n", + vma->vm_file->f_dentry->d_name.name); + } + + out: + return app_cookie; + + fail_no_image_cookie: + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: Cannot find dcookie for SPU binary\n", + __FUNCTION__, __LINE__); + goto out; +} + + 
+ +/* This function finds or creates cached context information for the + * passed SPU and records SPU context information into the OProfile + * event buffer. + */ +static int process_context_switch(struct spu * spu, unsigned int objectId) +{ + unsigned long flags; + int retval = 0; + unsigned int offset = 0; + unsigned long spu_cookie = 0, app_dcookie = 0, shlib_cookie = 0; + retval = prepare_cached_spu_info(spu, objectId); + if (retval == -1) { + goto out; + } + /* Get dcookie first because a mutex_lock is taken in that + * code path, so interrupts must not be disabled. + */ + app_dcookie = get_exec_dcookie_and_offset(spu, &offset, &spu_cookie, + &shlib_cookie, objectId); + + /* Record context info in event buffer */ + spin_lock_irqsave(&buffer_lock, flags); + add_event_entry(ESCAPE_CODE); + add_event_entry(SPU_CTX_SWITCH_CODE); + add_event_entry(spu->number); + add_event_entry(spu->pid); + add_event_entry(spu->tgid); + add_event_entry(app_dcookie); + + if (offset) { + /* When offset is non-zero, the SPU ELF was embedded; + * otherwise, it was loaded from a separate binary file. For + * embedded case, we record the offset into the embedding file + * where the SPU ELF was placed. The embedding file may be + * either the executable application binary or shared library. + * For the non-embedded case, we record a dcookie that + * points to the location of the separate SPU binary that was + * loaded. + */ + if (shlib_cookie) { + add_event_entry(SPU_SHLIB_COOKIE_CODE); + add_event_entry(shlib_cookie); + } + add_event_entry(SPU_OFFSET_CODE); + add_event_entry(offset); + } else { + add_event_entry(SPU_COOKIE_CODE); + add_event_entry(spu_cookie); + } + spin_unlock_irqrestore(&buffer_lock, flags); + smp_wmb(); +out: + return retval; +} + +/* + * This function is invoked on either a bind_context or unbind_context. + * If called for an unbind_context, the val arg is 0; otherwise, + * it is the object-id value for the spu context. + * The data arg is of type 'struct spu *'. 
+ */ +static int spu_active_notify(struct notifier_block * self, unsigned long val, + void * data) +{ + int retval; + unsigned long flags = 0; + struct spu * the_spu = data; + pr_debug("SPU event notification arrived\n"); + if (!val){ + spin_lock_irqsave(&cache_lock, flags); + retval = release_cached_info(the_spu->number); + spin_unlock_irqrestore(&cache_lock, flags); + } else { + retval = process_context_switch(the_spu, val); + } + return retval; +} + +static struct notifier_block spu_active = { + .notifier_call = spu_active_notify, +}; + +/* The main purpose of this function is to synchronize + * OProfile with SPUFS by registering to be notified of + * SPU task switches. + * + * NOTE: When profiling SPUs, we must ensure that only + * spu_sync_start is invoked and not the generic sync_start + * in drivers/oprofile/oprof.c. A return value of + * SKIP_GENERIC_SYNC or SYNC_START_ERROR will + * accomplish this. + */ +int spu_sync_start(void) { + int k; + int ret = SKIP_GENERIC_SYNC; + int register_ret; + unsigned long flags = 0; + spu_prof_num_nodes = number_of_online_nodes(); + num_spu_nodes = spu_prof_num_nodes * 8; + + spin_lock_irqsave(&buffer_lock, flags); + add_event_entry(ESCAPE_CODE); + add_event_entry(SPU_PROFILING_CODE); + add_event_entry(num_spu_nodes); + spin_unlock_irqrestore(&buffer_lock, flags); + + /* Register for SPU events */ + register_ret = spu_switch_event_register(&spu_active); + if (register_ret) { + ret = SYNC_START_ERROR; + goto out; + } + + for (k = 0; k < (MAX_NUMNODES * 8); k++) + last_guard_val[k] = 0; + pr_debug("spu_sync_start -- running.\n"); +out: + return ret; +} + +/* Record SPU program counter samples to the oprofile event buffer. 
*/ +void spu_sync_buffer(int spu_num, unsigned int * samples, + int num_samples) +{ + unsigned long long file_offset; + unsigned long cache_lock_flags = 0; + unsigned long buffer_lock_flags = 0; + int i; + struct vma_to_fileoffset_map * map; + struct spu * the_spu; + unsigned long long spu_num_ll = spu_num; + unsigned long long spu_num_shifted = spu_num_ll << 32; + struct cached_info * c_info; + + /* We need to obtain the cache_lock here because it's + * possible that after getting the cached_info, the SPU job + * corresponding to this cached_info may end, thus resulting + * in the destruction of the cached_info. + */ + spin_lock_irqsave(&cache_lock, cache_lock_flags); + c_info = get_cached_info(NULL, spu_num); + if (c_info == NULL) { + /* This legitimately happens when the SPU task ends before all + * samples are recorded. No big deal -- so we just drop a few samples. + */ + pr_debug("SPU_PROF: No cached SPU contex " + "for SPU #%d. Dropping samples.\n", spu_num); + spin_unlock_irqrestore(&cache_lock, cache_lock_flags); + return ; + } + + map = c_info->map; + the_spu = c_info->the_spu; + spin_lock_irqsave(&buffer_lock, buffer_lock_flags); + for (i = 0; i < num_samples; i++) { + unsigned int sample = *(samples+i); + int grd_val = 0; + file_offset = 0; + if (sample == 0) + continue; + file_offset = vma_map_lookup( + map, sample, the_spu, &grd_val); + + /* If overlays are used by this SPU application, the guard + * value is non-zero, indicating which overlay section is in + * use. We need to discard samples taken during the time + * period which an overlay occurs (i.e., guard value changes). + */ + if (grd_val && grd_val != last_guard_val[spu_num]) { + last_guard_val[spu_num] = grd_val; + /* Drop the rest of the samples. */ + break; + } + + /* For now, we'll drop samples that can't be mapped. + * This can happen for generated stubs executed from + * the SPU stack. Do we need to record these somehow? 
+ */ + if (unlikely(file_offset == 0xffffffff)) + continue; + add_event_entry(file_offset | spu_num_shifted); + } + spin_unlock_irqrestore(&buffer_lock, buffer_lock_flags); + spin_unlock_irqrestore(&cache_lock, cache_lock_flags); +} + + +int spu_sync_stop(void) +{ + unsigned long flags = 0; + int ret = spu_switch_event_unregister(&spu_active); + if (ret) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: spu_switch_event_unregister returned %d\n", + __FUNCTION__, __LINE__, ret); + goto out; + } + + spin_lock_irqsave(&cache_lock, flags); + ret = release_cached_info(RELEASE_ALL); + spin_unlock_irqrestore(&cache_lock, flags); +out: + pr_debug("spu_sync_stop -- done.\n"); + return ret; +} + + Index: linux-2.6.20/arch/powerpc/oprofile/cell/vma_map.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.20/arch/powerpc/oprofile/cell/vma_map.c 2007-02-20 13:49:52.776240536 -0600 @@ -0,0 +1,279 @@ + /* + * Cell Broadband Engine OProfile Support + * + * (C) Copyright IBM Corporation 2006 + * + * Author: Maynard Johnson <may...@us...> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* The code in this source file is responsible for generating + * vma-to-fileOffset maps for both overlay and non-overlay SPU + * applications. 
+ */ + +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/elf.h> +#include "pr_util.h" + + +void vma_map_free(struct vma_to_fileoffset_map *map) +{ + while (map) { + struct vma_to_fileoffset_map *next = map->next; + kfree(map); + map = next; + } +} + +unsigned int +vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma, + const struct spu * aSpu, int * grd_val) +{ + u32 offset = 0xffffffff; + u32 ovly_grd; + for (; map; map = map->next) { + if (vma < map->vma || vma >= map->vma + map->size) + continue; + + if (map->guard_ptr) { + ovly_grd = *(u32 *)(aSpu->local_store + map- >guard_ptr); + if (ovly_grd != map->guard_val) + continue; + *grd_val = ovly_grd; + } + offset = vma - map->vma + map->offset; + break; + } + + return offset; +} + +static struct vma_to_fileoffset_map * +vma_map_add(struct vma_to_fileoffset_map * map, unsigned int vma, + unsigned int size, unsigned int offset, unsigned int guard_ptr, + unsigned int guard_val) +{ + struct vma_to_fileoffset_map * new = + kzalloc(sizeof(struct vma_to_fileoffset_map), GFP_KERNEL); + if (!new) { + printk(KERN_ERR "SPU_PROF: %s, line %d: malloc failed \n", + __FUNCTION__, __LINE__); + vma_map_free(map); + return NULL; + } + + new->next = map; + new->vma = vma; + new->size = size; + new->offset = offset; + new->guard_ptr = guard_ptr; + new->guard_val = guard_val; + + return new; +} + + +/* Parse SPE ELF header and generate a list of vma_maps. + * A pointer to the first vma_map in the generated list + * of vma_maps is returned. 
*/ +struct vma_to_fileoffset_map * create_vma_map(const struct spu * aSpu, + unsigned long spu_elf_start) +{ + static const unsigned char expected[EI_PAD] = { + [EI_MAG0] = ELFMAG0, + [EI_MAG1] = ELFMAG1, + [EI_MAG2] = ELFMAG2, + [EI_MAG3] = ELFMAG3, + [EI_CLASS] = ELFCLASS32, + [EI_DATA] = ELFDATA2MSB, + [EI_VERSION] = EV_CURRENT, + [EI_OSABI] = ELFOSABI_NONE + }; + + int grd_val; + struct vma_to_fileoffset_map * map = NULL; + struct spu_overlay_info ovly; + unsigned int overlay_tbl_offset = -1; + unsigned long phdr_start, shdr_start; + Elf32_Ehdr ehdr; + Elf32_Phdr phdr; + Elf32_Shdr shdr, shdr_str; + Elf32_Sym sym; + int i, j; + char name[32]; + + unsigned int ovly_table_sym = 0; + unsigned int ovly_buf_table_sym = 0; + unsigned int ovly_table_end_sym = 0; + unsigned int ovly_buf_table_end_sym = 0; + unsigned long ovly_table; + unsigned int n_ovlys; + + /* Get and validate ELF header. */ + + if (copy_from_user(&ehdr, (void *) spu_elf_start, sizeof (ehdr))) + goto fail; + + if (memcmp(ehdr.e_ident, expected, EI_PAD) != 0) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: Unexpected e_ident parsing SPU ELF \n", + __FUNCTION__, __LINE__); + goto fail; + } + if (ehdr.e_machine != EM_SPU) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: Unexpected e_machine parsing SPU ELF\n", + __FUNCTION__, __LINE__); + goto fail; + } + if (ehdr.e_type != ET_EXEC) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: Unexpected e_type parsing SPU ELF \n", + __FUNCTION__, __LINE__); + goto fail; + } + phdr_start = spu_elf_start + ehdr.e_phoff; + shdr_start = spu_elf_start + ehdr.e_shoff; + + /* Traverse program headers. 
*/ + for (i = 0; i < ehdr.e_phnum; i++) { + if (copy_from_user(&phdr, + (void *) (phdr_start + i * sizeof (phdr)), + sizeof(phdr))) + goto fail; + + if (phdr.p_type != PT_LOAD) + continue; + if (phdr.p_flags & (1 << 27)) + continue; + + map = vma_map_add(map, phdr.p_vaddr, phdr.p_memsz, + phdr.p_offset, 0, 0); + if (!map) + goto fail; + } + + pr_debug("SPU_PROF: Created non-overlay maps\n"); + /* Traverse section table and search for overlay-related symbols. */ + for (i = 0; i < ehdr.e_shnum; i++) { + if (copy_from_user(&shdr, + (void *) (shdr_start + i * sizeof (shdr)), + sizeof(shdr))) + goto fail; + + if (shdr.sh_type != SHT_SYMTAB) + continue; + if (shdr.sh_entsize != sizeof (sym)) + continue; + + if (copy_from_user(&shdr_str, + (void *) (shdr_start + shdr.sh_link * + sizeof(shdr)), + sizeof(shdr))) + goto fail; + + if (shdr_str.sh_type != SHT_STRTAB) + goto fail;; + + for (j = 0; j < shdr.sh_size / sizeof (sym); j++) { + if (copy_from_user(&sym, (void *) (spu_elf_start + + shdr.sh_offset + j * + sizeof (sym)), + sizeof (sym))) + goto fail; + + if (copy_from_user(name, (void *) + (spu_elf_start + shdr_str.sh_offset + + sym.st_name), + 20)) + goto fail; + + if (memcmp(name, "_ovly_table", 12) == 0) + ovly_table_sym = sym.st_value; + if (memcmp(name, "_ovly_buf_table", 16) == 0) + ovly_buf_table_sym = sym.st_value; + if (memcmp(name, "_ovly_table_end", 16) == 0) + ovly_table_end_sym = sym.st_value; + if (memcmp(name, "_ovly_buf_table_end", 20) == 0) + ovly_buf_table_end_sym = sym.st_value; + } + } + + /* If we don't have overlays, we're done. */ + if (ovly_table_sym == 0 || ovly_buf_table_sym == 0 + || ovly_table_end_sym == 0 || ovly_buf_table_end_sym == 0) { + pr_debug("SPU_PROF: No overlay table found\n"); + goto out; + } + else { + pr_debug("SPU_PROF: Overlay table found\n"); + } + + /* The _ovly_table symbol represents a table with one entry + * per overlay section. The _ovly_buf_table symbol represents + * a table with one entry per overlay region. 
+ * The struct spu_overlay_info gives the structure of the _ovly_table + * entries. The structure of _ovly_table_buf is simply one + * u32 word per entry. + */ + overlay_tbl_offset = vma_map_lookup(map, ovly_table_sym, aSpu, &grd_val); + if (overlay_tbl_offset < 0) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: Error finding SPU overlay table\n", + __FUNCTION__, __LINE__); + goto fail; + } + ovly_table = spu_elf_start + overlay_tbl_offset; + + n_ovlys = (ovly_table_end_sym - + ovly_table_sym) / sizeof (ovly); + + /* Traverse overlay table. */ + for (i = 0; i < n_ovlys; i++) { + if (copy_from_user(&ovly, (void *) + (ovly_table + i * sizeof (ovly)), + sizeof (ovly))) + goto fail; + + /* The ovly.vma/size/offset arguments are analogous to the same + * arguments used above for non-overlay maps. The final two + * args are referred to as the guard pointer and the guard + * value. + * The guard pointer is an entry in the _ovly_buf_table, + * computed using ovly.buf as the index into the table. Since + * ovly.buf values begin at '1' to reference the first (or 0th) + * entry in the _ovly_buf_table, the computation subtracts 1 + * from ovly.buf. + * The guard value is stored in the _ovly_buf_table entry and + * is an index (starting at 1) back to the _ovly_table entry + * that is pointing at this _ovly_buf_table entry. So, for + * example, for an overlay scenario with one overlay segment + * and two overlay sections: + * - Section 1 points to the first entry of the + * _ovly_buf_table, which contains a guard value + * of '1', referencing the first (index=0) entry of + * _ovly_table. + * - Section 2 points to the second entry of the + * _ovly_buf_table, which contains a guard value + * of '2', referencing the second (index=1) entry of + * _ovly_table. 
+ */ + map = vma_map_add(map, ovly.vma, ovly.size, ovly.offset, + ovly_buf_table_sym + (ovly.buf - 1) * 4, i + 1); + if (!map) + goto fail; + } + goto out; + + fail: + map = NULL; + out: + return map; +} Index: linux-2.6.20/arch/powerpc/oprofile/common.c =================================================================== --- linux-2.6.20.orig/arch/powerpc/oprofile/common.c 2007-02-20 13:49:02.029235152 -0600 +++ linux-2.6.20/arch/powerpc/oprofile/common.c 2007-02-20 16:42:26.626176048 -0600 @@ -29,6 +29,8 @@ static struct op_counter_config ctr[OP_MAX_COUNTER]; static struct op_system_config sys; +static int op_powerpc_flag; + static void op_handle_interrupt(struct pt_regs *regs) { model->handle_interrupt(regs, ctr); @@ -36,25 +38,41 @@ static void op_powerpc_cpu_setup(void *dummy) { - model->cpu_setup(ctr); + int ret; + + ret = model->cpu_setup(ctr); + + if (ret != 0) + op_powerpc_flag = ret; } static int op_powerpc_setup(void) { int err; + op_powerpc_flag = 0; + /* Grab the hardware */ err = reserve_pmc_hardware(op_handle_interrupt); if (err) return err; /* Pre-compute the values to stuff in the hardware registers. */ - model->reg_setup(ctr, &sys, model->num_counters); + op_powerpc_flag = model->reg_setup(ctr, &sys, model- >num_counters); - /* Configure the registers on all cpus. */ + if (op_powerpc_flag) + goto out; + + /* Configure the registers on all cpus. If an error occurs on one + * of the cpus, op_powerpc_flag will be set to the error */ on_each_cpu(op_powerpc_cpu_setup, NULL, 0, 1); - return 0; +out: if (op_powerpc_flag) { + /* error on setup release the performance counter hardware */ + release_pmc_hardware(); + } + + return op_powerpc_flag; } static void op_powerpc_shutdown(void) @@ -64,16 +82,29 @@ static void op_powerpc_cpu_start(void *dummy) { - model->start(ctr); + /* If any of the cpus have return an error, set the + * global flag to the error so it can be returned + * to the generic OProfile caller. 
+ */ + int ret; + + ret = model->start(ctr); + if (ret != 0) + op_powerpc_flag = ret; } static int op_powerpc_start(void) { + op_powerpc_flag = 0; + if (model->global_start) - model->global_start(ctr); - if (model->start) + return model->global_start(ctr); + if (model->start) { on_each_cpu(op_powerpc_cpu_start, NULL, 0, 1); - return 0; + return op_powerpc_flag; + } + return -EIO; /* No start function is defined for this + power architecture */ } static inline void op_powerpc_cpu_stop(void *dummy) @@ -150,6 +181,8 @@ #ifdef CONFIG_PPC_CELL_NATIVE case PPC_OPROFILE_CELL: model = &op_model_cell; + ops->sync_start = model->sync_start; + ops->sync_stop = model->sync_stop; break; #endif case PPC_OPROFILE_RS64: Index: linux-2.6.20/arch/powerpc/oprofile/Kconfig =================================================================== --- linux-2.6.20.orig/arch/powerpc/oprofile/Kconfig 2007-02-20 13:49:02.028235304 -0600 +++ linux-2.6.20/arch/powerpc/oprofile/Kconfig 2007-02-20 13:49:52.779240080 -0600 @@ -7,7 +7,7 @@ config OPROFILE tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING + depends on PROFILING help OProfile is a profiling system capable of profiling the whole system, include the kernel, kernel modules, libraries, @@ -15,3 +15,10 @@ If unsure, say N. +config OPROFILE_CELL + bool "OProfile for Cell Broadband Engine" + depends on (SPU_FS = y && OPROFILE = m) || (SPU_FS = y && OPROFILE = y) || (SPU_FS = m && OPROFILE = m) + default y + help + Profiling of Cell BE SPUs requires special support enabled + by this option. 
Index: linux-2.6.20/arch/powerpc/oprofile/Makefile =================================================================== --- linux-2.6.20.orig/arch/powerpc/oprofile/Makefile 2007-02-20 13:49:02.027235456 -0600 +++ linux-2.6.20/arch/powerpc/oprofile/Makefile 2007-02-20 13:49:52.781239776 -0600 @@ -11,7 +11,8 @@ timer_int.o ) oprofile-y := $(DRIVER_OBJS) common.o backtrace.o -oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o +oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \ + cell/spu_profiler.o cell/vma_map.o cell/spu_task_sync.o oprofile-$(CONFIG_PPC64) += op_model_rs64.o op_model_power4.o oprofile-$(CONFIG_FSL_BOOKE) += op_model_fsl_booke.o oprofile-$(CONFIG_6xx) += op_model_7450.o Index: linux-2.6.20/arch/powerpc/oprofile/op_model_cell.c =================================================================== --- linux-2.6.20.orig/arch/powerpc/oprofile/op_model_cell.c 2007-02-20 13:49:02.030235000 -0600 +++ linux-2.6.20/arch/powerpc/oprofile/op_model_cell.c 2007-02-20 16:49:48.719198544 -0600 @@ -37,11 +37,21 @@ #include <asm/system.h> #include "../platforms/cell/interrupt.h" +#include "cell/pr_util.h" + +/* spu_cycle_reset is the number of cycles between samples. + * This variable is used for SPU profiling and should ONLY be set + * at the beginning of cell_reg_setup; otherwise, it's read-only. 
+ */ +static unsigned int spu_cycle_reset = 0; + +#define NUM_SPUS_PER_NODE 8 +#define SPU_CYCLES_EVENT_NUM 2 /* event number for SPU_CYCLES */ #define PPU_CYCLES_EVENT_NUM 1 /* event number for CYCLES */ #define PPU_CYCLES_GRP_NUM 1 /* special group number for identifying - * PPU_CYCLES event - */ + * PPU_CYCLES event + */ #define CBE_COUNT_ALL_CYCLES 0x42800000 /* PPU cycle event specifier */ #define NUM_THREADS 2 /* number of physical threads in @@ -50,6 +60,7 @@ #define NUM_TRACE_BUS_WORDS 4 #define NUM_INPUT_BUS_WORDS 2 +#define MAX_SPU_COUNT 0xFFFFFF /* maximum 24 bit LFSR value */ struct pmc_cntrl_data { unsigned long vcntr; @@ -64,7 +75,7 @@ struct pm_signal { u16 cpu; /* Processor to modify */ - u16 sub_unit; /* hw subunit this applies to (if applicable) */ + u16 sub_unit; /* hw subunit this applies to (if applicable)*/ short int signal_group; /* Signal Group to Enable/Disable */ u8 bus_word; /* Enable/Disable on this Trace/Trigger/Event * Bus Word(s) (bitmask) @@ -111,6 +122,20 @@ static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS]; +/* The CELL profiling code makes rtas calls to set up the debug bus to + * route the performance signals. Additionally, SPU profiling requires + * a second rtas call to set up the hardware to capture the SPU PCs. + * The EIO error value is returned if the token lookups or the rtas + * call fail. The EIO error number is the best choice of the existing + * error numbers. The probability of rtas related error is very low. But + * by returning EIO and printing additional information to dmesg the user + * will know that OProfile did not start and dmesg will tell them why. + * OProfile does not support returning errors on Stop. Not a huge issue + * since failure to reset the debug bus or stop the SPU PC collection is + * not a fatal issue. Chances are if the Stop failed, Start doesn't work + * either. + */ + /* Interpetation of hdw_thread: * 0 - even virtual cpus 0, 2, 4,... * 1 - odd virtual cpus 1, 3, 5, ...
@@ -125,7 +150,8 @@ * is available. */ static struct pm_signal pm_signal[NR_PHYS_CTRS]; -static int pm_rtas_token; +static int pm_rtas_token; /* token for debug bus setup call */ +static int spu_rtas_token; /* token for SPU cycle profiling */ static u32 reset_value[NR_PHYS_CTRS]; static int num_counters; @@ -140,14 +166,15 @@ /* * Firmware interface functions */ + static int rtas_ibm_cbe_perftools(int subfunc, int passthru, void *address, unsigned long length) { u64 paddr = __pa(address); - return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru, - paddr >> 32, paddr & 0xffffffff, length); + return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, + passthru, paddr >> 32, paddr & 0xffffffff, length); } static void pm_rtas_reset_signals(u32 node) @@ -174,24 +201,28 @@ &pm_signal_local, sizeof(struct pm_signal)); - if (ret) + if (unlikely(ret)) + /* Not a fatal error. For Oprofile stop, the oprofile + * functions do not support returning an error for + * failure to stop OProfile. + */ printk(KERN_WARNING "%s: rtas returned: %d\n", __FUNCTION__, ret); } -static void pm_rtas_activate_signals(u32 node, u32 count) +static int pm_rtas_activate_signals(u32 node, u32 count) { int ret; int i, j; struct pm_signal pm_signal_local[NR_PHYS_CTRS]; /* There is no debug setup required for the cycles event. - * Note that only events in the same group can be used. - * Otherwise, there will be conflicts in correctly routing - * the signals on the debug bus. It is the responsiblity - * of the OProfile user tool to check the events are in - * the same group. - */ + * Note that only events in the same group can be used. + * Otherwise, there will be conflicts in correctly routing + * the signals on the debug bus. It is the responsiblity + * of the OProfile user tool to check the events are in + * the same group. 
+ */ i = 0; for (j = 0; j < count; j++) { @@ -212,10 +243,14 @@ pm_signal_local, i * sizeof(struct pm_signal)); - if (ret) + if (unlikely(ret)) { printk(KERN_WARNING "%s: rtas returned: %d\n", __FUNCTION__, ret); + return -EIO; + } } + + return 0; } /* @@ -297,6 +332,7 @@ input_bus[j] = i; pm_regs.group_control |= (i << (31 - i)); + break; } } @@ -386,9 +422,8 @@ u32 cpu; unsigned long flags; - /* Make sure that the interrupt_hander and - * the virt counter are not both playing with - * the counters on the same node. + /* Make sure that the interrupt_handler and the virt counter are + * not both playing with the counters on the same node. */ spin_lock_irqsave(&virt_cntr_lock, flags); @@ -481,17 +516,41 @@ } /* This function is called once for all cpus combined */ -static void +static int cell_reg_setup(struct op_counter_config *ctr, struct op_system_config *sys, int num_ctrs) { int i, j, cpu; + spu_cycle_reset = 0; + + if (ctr[0].event == SPU_CYCLES_EVENT_NUM) { + spu_cycle_reset = ctr[0].count; + + /* Each node will need to make the rtas call to start + * and stop SPU profiling. Get the token once and store it. + */ + spu_rtas_token = rtas_token("ibm,cbe-spu-perftools"); + + if (unlikely(spu_rtas_token == RTAS_UNKNOWN_SERVICE)) { + printk(KERN_ERR + "%s: rtas token ibm,cbe-spu-perftools unknown\n", + __FUNCTION__); + return -EIO; + } + } pm_rtas_token = rtas_token("ibm,cbe-perftools"); - if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) { - printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n", + + /* For all events except PPU CYCLEs, each node will need to make + * the rtas cbe-perftools call to set up and reset the debug bus. + * Make the token lookup call once and store it in the global + * variable pm_rtas_token.
+ */ + if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) { + printk(KERN_ERR + "%s: rtas token ibm,cbe-perftools unknown\n", __FUNCTION__); - goto out; + return -EIO; } num_counters = num_ctrs; @@ -568,28 +627,27 @@ for (i = 0; i < num_counters; ++i) { per_cpu(pmc_values, cpu)[i] = reset_value[i]; } -out: - ; + + return 0; } + + /* This function is called once for each cpu */ -static void cell_cpu_setup(struct op_counter_config *cntr) +static int cell_cpu_setup(struct op_counter_config *cntr) { u32 cpu = smp_processor_id(); u32 num_enabled = 0; int i; + if (spu_cycle_reset) + return 0; + /* There is one performance monitor per processor chip (i.e. node), * so we only need to perform this function once per node. */ if (cbe_get_hw_thread_id(cpu)) - goto out; - - if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) { - printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n", - __FUNCTION__); - goto out; - } + return 0; /* Stop all counters */ cbe_disable_pm(cpu); @@ -608,16 +666,283 @@ } } - pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled); + /* the pm_rtas_activate_signals will return -EIO if the FW + * call failed. 
+ */ + return (pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled)); + +} + +#define ENTRIES 303 +#define MAXLFSR 0xFFFFFF + +/* precomputed table of 24 bit LFSR values */ +int initial_lfsr[] = +{8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424, + 15507573, 7458917, 285419, 2641121, 9780088, 3915503, 6668768, 1548716, + 4885000, 8774424, 9650099, 2044357, 2304411, 9326253, 10332526, 4421547, + 3440748, 10179459, 13332843, 10375561, 1313462, 8375100, 5198480, 6071392, + 9341783, 1526887, 3985002, 1439429, 13923762, 7010104, 11969769, 4547026, + 2040072, 4025602, 3437678, 7939992, 11444177, 4496094, 9803157, 10745556, + 3671780, 4257846, 5662259, 13196905, 3237343, 12077182, 16222879, 7587769, + 14706824, 2184640, 12591135, 10420257, 7406075, 3648978, 11042541, 15906893, + 11914928, 4732944, 10695697, 12928164, 11980531, 4430912, 11939291, 2917017, + 6119256, 4172004, 9373765, 8410071, 14788383, 5047459, 5474428, 1737756, + 15967514, 13351758, 6691285, 8034329, 2856544, 14394753, 11310160, 12149558, + 7487528, 7542781, 15668898, 12525138, 12790975, 3707933, 9106617, 1965401, + 16219109, 12801644, 2443203, 4909502, 8762329, 3120803, 6360315, 9309720, + 15164599, 10844842, 4456529, 6667610, 14924259, 884312, 6234963, 3326042, + 15973422, 13919464, 5272099, 6414643, 3909029, 2764324, 5237926, 4774955, + 10445906, 4955302, 5203726, 10798229, 11443419, 2303395, 333836, 9646934, + 3464726, 4159182, 568492, 995747, 10318756, 13299332, 4836017, 8237783, + 3878992, 2581665, 11394667, 5672745, 14412947, 3159169, 9094251, 16467278, + 8671392, 15230076, 4843545, 7009238, 15504095, 1494895, 9627886, 14485051, + 8304291, 252817, 12421642, 16085736, 4774072, 2456177, 4160695, 15409741, + 4902868, 5793091, 13162925, 16039714, 782255, 11347835, 14884586, 366972, + 16308990, 11913488, 13390465, 2958444, 10340278, 1177858, 1319431, 10426302, + 2868597, 126119, 5784857, 5245324, 10903900, 16436004, 3389013, 1742384, + 14674502, 10279218, 8536112, 
10364279, 6877778, 14051163, 1025130, 6072469, + 1988305, 8354440, 8216060, 16342977, 13112639, 3976679, 5913576, 8816697, + 6879995, 14043764, 3339515, 9364420, 15808858, 12261651, 2141560, 5636398, + 10345425, 10414756, 781725, 6155650, 4746914, 5078683, 7469001, 6799140, + 10156444, 9667150, 10116470, 4133858, 2121972, 1124204, 1003577, 1611214, + 14304602, 16221850, 13878465, 13577744, 3629235, 8772583, 10881308, 2410386, + 7300044, 5378855, 9301235, 12755149, 4977682, 8083074, 10327581, 6395087, + 9155434, 15501696, 7514362, 14520507, 15808945, 3244584, 4741962, 9658130, + 14336147, 8654727, 7969093, 15759799, 14029445, 5038459, 9894848, 8659300, + 13699287, 8834306, 10712885, 14753895, 10410465, 3373251, 309501, 9561475, + 5526688, 14647426, 14209836, 5339224, 207299, 14069911, 8722990, 2290950, + 3258216, 12505185, 6007317, 9218111, 14661019, 10537428, 11731949, 9027003, + 6641507, 9490160, 200241, 9720425, 16277895, 10816638, 1554761, 10431375, + 7467528, 6790302, 3429078, 14633753, 14428997, 11463204, 3576212, 2003426, + 6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607}; + +/* + * The hardware uses an LFSR counting sequence to determine when to capture + * the SPU PCs. An LFSR sequence is like a pseudo-random number sequence + * where each number occurs once in the sequence but the sequence is not in + * numerical order. The SPU PC capture is done when the LFSR sequence reaches + * the last value in the sequence. Hence the user specified value N + * corresponds to the LFSR number that is N from the end of the sequence. + * + * To avoid the time to compute the LFSR, a lookup table is used. The 24 bit + * LFSR sequence is broken into four ranges. The spacing of the precomputed + * values is adjusted in each range so the error between the user specified + * number (N) of events between samples and the actual number of events based + * on the precomputed value will be less than about 6.2%.
Note, if the user + * specifies N < 2^16, the LFSR value that is 2^16 from the end will be used. + * This is to prevent the loss of samples because the trace buffer is full. + * + * User specified N Step between Index in + * precomputed values precomputed + * table + * 0 to 2^16-1 ---- 0 + * 2^16 to 2^16+2^19-1 2^12 1 to 128 + * 2^16+2^19 to 2^16+2^19+2^22-1 2^15 129 to 256 + * 2^16+2^19+2^22 to 2^24-1 2^18 257 to 302 + * + * + * For example, the LFSR values in the second range are computed for 2^16, + * 2^16+2^12, ... , 2^19-2^16, 2^19 and stored in the table at indices + * 1, 2,..., 127, 128. + * + * The 24 bit LFSR value for the nth number in the sequence can be + * calculated using the following code: + * + * #define size 24 + * int calculate_lfsr(int n) + * { + * int i; + * unsigned int newlfsr0; + * unsigned int lfsr = 0xFFFFFF; + * unsigned int howmany = n; + * + * for (i = 2; i < howmany + 2; i++) { + * newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^ + * ((lfsr >> (size - 1 - 1)) & 1) ^ + * (((lfsr >> (size - 1 - 6)) & 1) ^ + * ((lfsr >> (size - 1 - 23)) & 1))); + * + * lfsr >>= 1; + * lfsr = lfsr | (newlfsr0 << (siz... [truncated message content] |