[Math-atlas-commits] CVS: AtlasBase/Clint atlas-lvl2.base, 1.74, 1.75
Brought to you by: rwhaley, tonyc040457
From: R. C. W. <rw...@us...> - 2009-07-28 23:18:26
|
Update of /cvsroot/math-atlas/AtlasBase/Clint In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv12521/Clint Modified Files: atlas-lvl2.base Log Message: Index: atlas-lvl2.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas-lvl2.base,v retrieving revision 1.74 retrieving revision 1.75 diff -C2 -d -r1.74 -r1.75 *** atlas-lvl2.base 28 Jul 2009 17:02:21 -0000 1.74 --- atlas-lvl2.base 28 Jul 2009 23:18:11 -0000 1.75 *************** *** 12569,12572 **** --- 12569,12573 ---- r1bestL1 = Clone@up@(rt)Node(r1bestL1); r1bestL2 = Clone@up@(rt)Node(r1bestL2); + r1bestL2->CacheElts = r1bestL2b->CacheElts; r1bestOC = Clone@up@(rt)Node(r1bestOC); r1bestOC->CacheElts = 0; *************** *** 12792,12796 **** * on rest of NU-wide column panel. */ ! void UnrollSYR2r ( FILE *fpout, /* stream to print to */ --- 12793,12797 ---- * on rest of NU-wide column panel. */ ! void UnrollSYR2 ( FILE *fpout, /* stream to print to */ *************** *** 12801,12810 **** ) /* ! * Real precision unroll of 'Upper' SYR2 */ { int i, j; ! fprintf(fpout, "#define %s(A_, lda_, x_, y_, xt_, yt_) \\\n{\n", name); fprintf(fpout, " TYPE *aa=(A_); \\\n"); fprintf(fpout, " const TYPE"); --- 12802,12811 ---- ) /* ! * Real precision unroll of SYR2 */ { int i, j; ! fprintf(fpout, "#define %s(A_, lda_, x_, y_) \\\n{\n", name); fprintf(fpout, " TYPE *aa=(A_); \\\n"); fprintf(fpout, " const TYPE"); *************** *** 12813,12823 **** fprintf(fpout, ";\\\n const TYPE"); for (i=0; i < nu; i++) - fprintf(fpout, " xt%d=(xt_)[%d]", i, i); - fprintf(fpout, ";\\\n const TYPE"); - for (i=0; i < nu; i++) fprintf(fpout, " y%d=(y_)[%d]", i, i); - fprintf(fpout, ";\\\n const TYPE"); - for (i=0; i < nu; i++) - fprintf(fpout, " yt%d=(yt_)[%d]", i, i); fprintf(fpout, ";\\\n"); if (Uplo == AtlasUpper) --- 12814,12818 ---- *************** *** 12825,12829 **** for (j=0; j < nu; j++) for (i=0; i <= j; i++) ! 
fprintf(fpout, " aa[%s+%d] += x%d*yt%d + y%d*xt%d; \\\n", GetMul(j, "lda_"), i, i, j, i, j); } --- 12820,12824 ---- for (j=0; j < nu; j++) for (i=0; i <= j; i++) ! fprintf(fpout, " aa[%s+%d] += x%d*y%d + y%d*x%d; \\\n", GetMul(j, "lda_"), i, i, j, i, j); } *************** *** 12832,12841 **** for (j=0; j < nu; j++) for (i=j; i < nu; i++) ! fprintf(fpout, " aa[%s+%d] += x%d*yt%d + y%d*xt%d; \\\n", GetMul(j, "lda_"), i, i, j, i, j); } fprintf(fpout, "}\n"); } ! void UnrollSYR2c ( FILE *fpout, /* stream to print to */ --- 12827,12836 ---- for (j=0; j < nu; j++) for (i=j; i < nu; i++) ! fprintf(fpout, " aa[%s+%d] += x%d*y%d + y%d*x%d; \\\n", GetMul(j, "lda_"), i, i, j, i, j); } fprintf(fpout, "}\n"); } ! void UnrollHER2 ( FILE *fpout, /* stream to print to */ *************** *** 12901,12906 **** void s2hgen ( ! ATL_r1node_t *r1R, /* restricted L1-blocked GER kernel */ ! ATL_r1node_t *r1G, /* general L1-blocked GER kernel */ char pre, char *path /* path to generate header files in */ --- 12896,12901 ---- void s2hgen ( ! ATL_r1node_t *r1B, /* standard 8-entry R1SUMM kernel list */ ! int L1Elts, /* number of elements in L1 cache */ char pre, char *path /* path to generate header files in */ *************** *** 12912,12917 **** char ln[1024]; FILE *fpout; char PRE = toupper(pre); ! /* * Can't have a restricted kernel if it is a repeat of the general kernel. --- 12907,12942 ---- char ln[1024]; FILE *fpout; + ATL_r1node_t *r1p, *r1IC, *r1OC, *r1ICr, *r1OCr, CacheElts; char PRE = toupper(pre); ! /* ! * Default to blocking for L1 ! */ ! r1OCr = r1B->next->next->next->next->next->next; ! r1OC = r1OCr->next; ! r1ICr = r1B->next->next->next->next; ! r1IC = r1ICr->next; ! CacheElts = r1OC->CacheElts; ! /* ! * If best out-of-cache (OC) kernels use no blocking or L2 blocking, we ! * need to check if L2-blocking will be faster for SYR2 ! */ ! r1p = r1B->next; ! if (r1p->CacheElts == 0 || r1p->CacheElts > L1Elts) ! { ! double mfL1, mfL2; ! 
mfL1 = (r1OCr->mflop[0] + r1IC->mflop[4])/2.0; ! mfL2 = (r1p->CacheElts) ? r1p->mflop[1] : r1p->mflop[2]; ! mfL2 = (mfL2+r1B->next->next->next->mflop[3])/2.0; ! if (mfL2 >= 1.02*mfL1) ! { ! r1OCr = r1B; ! r1OC = r1OCr->next; ! r1ICr = r1B->next->next; ! r1IC = r1ICr->next; ! CacheElts = (r1OC->CacheElts) ? r1OC->CacheElts : r1IC->CacheElts; ! // HERE HERE: make sure search fills in in-L2's CE! ! } ! } ! /* * Can't have a restricted kernel if it is a repeat of the general kernel. *************** *** 12945,12955 **** if (pre == 's' || pre == 'd') { ! UnrollSYR2r(fpout, "ATL_SYR2U_nu", pre, AtlasUpper, r1G->YU); ! UnrollSYR2r(fpout, "ATL_SYR2L_nu", pre, AtlasLower, r1G->YU); } else { ! UnrollSYR2c(fpout, "ATL_HER2U_nu", pre, AtlasUpper, r1G->YU); ! UnrollSYR2c(fpout, "ATL_HER2L_nu", pre, AtlasLower, r1G->YU); } --- 12970,12980 ---- if (pre == 's' || pre == 'd') { ! UnrollSYR2(fpout, "ATL_SYR2U_nu", pre, AtlasUpper, r1G->YU); ! UnrollSYR2(fpout, "ATL_SYR2L_nu", pre, AtlasLower, r1G->YU); } else { ! UnrollHER2(fpout, "ATL_HER2U_nu", pre, AtlasUpper, r1G->YU); ! UnrollHER2(fpout, "ATL_HER2L_nu", pre, AtlasLower, r1G->YU); } |