commit 541b3c903f6265f7b663135d80423b27a2922064
Author: Josef Weidendorfer <Josef.Weidendorfer@gmx.de>
Date:   Fri Nov 16 21:47:38 2012 +0100

    Allow LL set count to be non-power-of-two
    
    For LL simulation, use the modulo operator ('%') to map from address
    to set number, allowing arbitrary (non-power-of-two) set counts.

diff --git a/cachegrind/cg-arch.c b/cachegrind/cg-arch.c
index da20441..ca297dd 100644
--- a/cachegrind/cg-arch.c
+++ b/cachegrind/cg-arch.c
@@ -41,10 +41,17 @@ static void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc,
 
 // Checks cache config is ok.  Returns NULL if ok, or a pointer to an error
 // string otherwise.
-static const HChar* check_cache(cache_t* cache)
+static const HChar* check_cache(cache_t* cache,
+				Bool check_power_of_two_set_count)
 {
-   // Simulator requires set count to be a power of two.
-   if ((cache->size % (cache->line_size * cache->assoc) != 0) ||
+   // Simulator requires that parameters result in an integer set count.
+   if (cache->size % (cache->line_size * cache->assoc) != 0)
+   {
+      return "Cache set count is not an integer.\n";
+   }
+
+   // Simulator may require set count to be a power of two.
+   if (check_power_of_two_set_count &&
        (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc)))
    {
       return "Cache set count is not a power of two.\n";
@@ -77,7 +84,8 @@ static const HChar* check_cache(cache_t* cache)
 
 
 static void parse_cache_opt ( cache_t* cache, const HChar* opt,
-                              const HChar* optval )
+                              const HChar* optval,
+			      Bool check_power_of_two_set_count )
 {
    Long i1, i2, i3;
    HChar* endptr;
@@ -96,7 +104,7 @@ static void parse_cache_opt ( cache_t* cache, const HChar* opt,
    if (cache->assoc     != i2) goto overflow;
    if (cache->line_size != i3) goto overflow;
 
-   checkRes = check_cache(cache);
+   checkRes = check_cache(cache, check_power_of_two_set_count);
    if (checkRes) {
       VG_(fmsg)("%s", checkRes);
       goto bad;
@@ -121,14 +129,14 @@ Bool VG_(str_clo_cache_opt)(const HChar *arg,
    const HChar* tmp_str;
 
    if      VG_STR_CLO(arg, "--I1", tmp_str) {
-      parse_cache_opt(clo_I1c, arg, tmp_str);
+      parse_cache_opt(clo_I1c, arg, tmp_str, 1);
       return True;
    } else if VG_STR_CLO(arg, "--D1", tmp_str) {
-      parse_cache_opt(clo_D1c, arg, tmp_str);
+      parse_cache_opt(clo_D1c, arg, tmp_str, 1);
       return True;
    } else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
               VG_STR_CLO(arg, "--LL", tmp_str)) {
-      parse_cache_opt(clo_LLc, arg, tmp_str);
+      parse_cache_opt(clo_LLc, arg, tmp_str, 0);
       return True;
    } else
       return False;
@@ -142,11 +150,13 @@ static void umsg_cache_img(const HChar* desc, cache_t* c)
 
 // Verifies if c is a valid cache.
 // An invalid value causes an assert, unless clo_redefined is True.
-static void check_cache_or_override(const HChar* desc, cache_t* c, Bool clo_redefined)
+static void check_cache_or_override(const HChar* desc, cache_t* c,
+				    Bool clo_redefined,
+				    Bool check_power_of_two_set_count )
 {
    const HChar* checkRes;
 
-   checkRes = check_cache(c);
+   checkRes = check_cache(c, check_power_of_two_set_count);
    if (checkRes) {
       VG_(umsg)("Auto-detected %s cache configuration not supported: %s",
                 desc, checkRes);
@@ -160,63 +170,6 @@ static void check_cache_or_override(const HChar* desc, cache_t* c, Bool clo_rede
 }
 
 
-/* If the LL cache config isn't something the simulation functions
-   can handle, try to adjust it so it is.  Caches are characterised
-   by (total size T, line size L, associativity A), and then we
-   have
-
-     number of sets S = T / (L * A)
-
-   The required constraints are:
-
-   * L must be a power of 2, but it always is in practice, so
-     no problem there
-
-   * A can be any value >= 1
-
-   * T can be any value, but ..
-
-   * S must be a power of 2.
-
-   That sometimes gives a problem.  For example, some Core iX based
-   Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288
-   sets.  The "fix" in this case is to increase the associativity
-   by 50% to 24, which reduces the number of sets to 8192, making
-   it a power of 2.  That's what the following code does (handing
-   the "3/2 rescaling case".)  We might need to deal with other
-   ratios later (5/4 ?).
-
-   The "fix" is "justified" (cough, cough) by alleging that
-   increases of associativity above about 4 have very little effect
-   on the actual miss rate.  It would be far more inaccurate to
-   fudge this by changing the size of the simulated cache --
-   changing the associativity is a much better option.
-*/
-
-static void
-maybe_tweak_LLc(cache_t *LLc)
-{
-  if (LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0) {
-      Long nSets = (Long)LLc->size / (Long)(LLc->line_size * LLc->assoc);
-      if (/* stay sane */
-          nSets >= 4
-          /* nSets is not a power of 2 */
-          && VG_(log2_64)( (ULong)nSets ) == -1
-          /* nSets is 50% above a power of 2 */
-          && VG_(log2_64)( (ULong)((2 * nSets) / (Long)3) ) != -1
-          /* associativity can be increased by exactly 50% */
-          && (LLc->assoc % 2) == 0
-         ) {
-         /* # sets is 1.5 * a power of two, but the associativity is
-            even, so we can increase that up by 50% and implicitly
-            scale the # sets down accordingly. */
-         Int new_assoc = LLc->assoc + (LLc->assoc / 2);
-         VG_(dmsg)("warning: pretending that LL cache has associativity"
-                   " %d instead of actual %d\n", new_assoc, LLc->assoc);
-         LLc->assoc = new_assoc;
-      }
-   }
-}
 
 void VG_(post_clo_init_configure_caches)(cache_t* I1c,
                                          cache_t* D1c,
@@ -237,14 +190,12 @@ void VG_(post_clo_init_configure_caches)(cache_t* I1c,
    // architecture).
    configure_caches( I1c, D1c, LLc, all_caches_clo_defined );
 
-   maybe_tweak_LLc( LLc );
-
    // Check the default/auto-detected values.
    // Allow the user to override invalid auto-detected caches
    // with command line.
-   check_cache_or_override ("I1", I1c, DEFINED(clo_I1c));
-   check_cache_or_override ("D1", D1c, DEFINED(clo_D1c));
-   check_cache_or_override ("LL", LLc, DEFINED(clo_LLc));
+   check_cache_or_override ("I1", I1c, DEFINED(clo_I1c), 1);
+   check_cache_or_override ("D1", D1c, DEFINED(clo_D1c), 1);
+   check_cache_or_override ("LL", LLc, DEFINED(clo_LLc), 0);
 
    // Then replace with any defined on the command line.  (Already checked in
    // VG(parse_clo_cache_opt)().)
diff --git a/cachegrind/cg_sim.c b/cachegrind/cg_sim.c
index 8d17a68..34568b9 100644
--- a/cachegrind/cg_sim.c
+++ b/cachegrind/cg_sim.c
@@ -121,14 +121,33 @@ Bool cachesim_setref_is_miss(cache_t2* c, UInt set_no, UWord tag)
    return True;
 }
 
+
+/* Calculate cache set number from memory block number.
+ * Two modes, selected by use_modulo:
+ * - false: fast bit masking; requires the set count to be a power of two
+ * - true: slower modulo operation; allows an arbitrary set count
+ */
+__attribute__((always_inline))
+static __inline__
+UInt block2set(cache_t2* c, UWord block, Bool use_modulo)
+{
+   if (use_modulo)
+      return block % c->sets;
+   else
+      return block & c->sets_min_1;
+}
+
+// if use_modulo is true, use slower but general modulo operation
+// to map from address to set number in which we check for a hit
 __attribute__((always_inline))
 static __inline__
-Bool cachesim_ref_is_miss(cache_t2* c, Addr a, UChar size)
+Bool cachesim_ref_is_miss(cache_t2* c, Addr a, UChar size,
+			  Bool use_modulo)
 {
    /* A memory block has the size of a cache line */
    UWord block1 =  a         >> c->line_size_bits;
    UWord block2 = (a+size-1) >> c->line_size_bits;
-   UInt  set1   = block1 & c->sets_min_1;
+   UInt  set1   = block2set(c, block1, use_modulo);
 
    /* Tags used in real caches are minimal to save space.
     * As the last bits of the block number of addresses mapping
@@ -145,7 +164,7 @@ Bool cachesim_ref_is_miss(cache_t2* c, Addr a, UChar size)
 
    /* Access straddles two lines. */
    else if (block1 + 1 == block2) {
-      UInt  set2 = block2 & c->sets_min_1;
+      UInt  set2 = block2set(c, block2, use_modulo);
       UWord tag2 = block2;
 
       /* always do both, as state is updated as side effect */
@@ -178,9 +197,9 @@ __attribute__((always_inline))
 static __inline__
 void cachesim_I1_doref_Gen(Addr a, UChar size, ULong* m1, ULong *mL)
 {
-   if (cachesim_ref_is_miss(&I1, a, size)) {
+   if (cachesim_ref_is_miss(&I1, a, size, 0)) {
       (*m1)++;
-      if (cachesim_ref_is_miss(&LL, a, size))
+      if (cachesim_ref_is_miss(&LL, a, size, 1))
          (*mL)++;
    }
 }
@@ -191,11 +210,11 @@ static __inline__
 void cachesim_I1_doref_NoX(Addr a, UChar size, ULong* m1, ULong *mL)
 {
    UWord block  = a >> I1.line_size_bits;
-   UInt  I1_set = block & I1.sets_min_1;
+   UInt  I1_set = block2set(&I1, block, 0);
 
    // use block as tag
    if (cachesim_setref_is_miss(&I1, I1_set, block)) {
-      UInt  LL_set = block & LL.sets_min_1;
+      UInt  LL_set = block2set(&LL, block, 1);
       (*m1)++;
       // can use block as tag as L1I and LL cache line sizes are equal
       if (cachesim_setref_is_miss(&LL, LL_set, block))
@@ -207,9 +226,9 @@ __attribute__((always_inline))
 static __inline__
 void cachesim_D1_doref(Addr a, UChar size, ULong* m1, ULong *mL)
 {
-   if (cachesim_ref_is_miss(&D1, a, size)) {
+   if (cachesim_ref_is_miss(&D1, a, size, 0)) {
       (*m1)++;
-      if (cachesim_ref_is_miss(&LL, a, size))
+      if (cachesim_ref_is_miss(&LL, a, size, 1))
          (*mL)++;
    }
 }
