[Math-atlas-commits] CVS: AtlasBase/Clint atlas-parse.base, 1.50, 1.51 atlas.base, 1.211, 1.212
Brought to you by:
rwhaley,
tonyc040457
From: R. C. W. <rw...@us...> - 2010-01-22 17:44:52
|
Update of /cvsroot/math-atlas/AtlasBase/Clint In directory sfp-cvsdas-1.v30.ch3.sourceforge.com:/tmp/cvs-serv10922/Clint Modified Files: atlas-parse.base atlas.base Log Message: Index: atlas-parse.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas-parse.base,v retrieving revision 1.50 retrieving revision 1.51 diff -C2 -d -r1.50 -r1.51 *** atlas-parse.base 21 Jan 2010 18:03:51 -0000 1.50 --- atlas-parse.base 22 Jan 2010 17:44:39 -0000 1.51 *************** *** 1492,1502 **** else be = "bX"; ! /* dmm%d_TNMBxNBxKB_muxnuxku_ldc_rtMxrtNxrtK_a1_bX */ ! sprintf(fnam, "res/%cmm%s%d_%c%c%dx%dx%d_%dx%dx%d_%d_%dx%dx%d_a1_%s", pre, FLAG_IS_SET(mmp->flag, MMF_AOUTER) ? "MNK" : "NMK", mmp->ID, 'T', 'N', mb, nb, kb, mmp->mu, mmp->nu, mmp->ku, ldc, FLAG_IS_SET(mmp->flag, MMF_MRUNTIME), FLAG_IS_SET(mmp->flag, MMF_NRUNTIME), ! FLAG_IS_SET(mmp->flag, MMF_KRUNTIME), be); if (FORCETIME || !FileExists(fnam)) --- 1492,1503 ---- else be = "bX"; ! /* dmm%d_TNMBxNBxKB_muxnuxku_ldc_rtMxrtNxrtK_LDTOP_pf_a1_bX_flushKB */ ! sprintf(fnam, "res/%cmm%s%d_%c%c%dx%dx%d_%dx%dx%d_%d_%dx%dx%d_%d_%d_a1_%s_%d", pre, FLAG_IS_SET(mmp->flag, MMF_AOUTER) ? "MNK" : "NMK", mmp->ID, 'T', 'N', mb, nb, kb, mmp->mu, mmp->nu, mmp->ku, ldc, FLAG_IS_SET(mmp->flag, MMF_MRUNTIME), FLAG_IS_SET(mmp->flag, MMF_NRUNTIME), ! FLAG_IS_SET(mmp->flag, MMF_KRUNTIME), ! FLAG_IS_SET(mmp->flag, MMF_LDCTOP), mmp->pref, be, cflush); if (FORCETIME || !FileExists(fnam)) *************** *** 1511,1514 **** --- 1512,1518 ---- ch, mmp->comp, ch, mmp->cflags); } + if (!cflush) + i += sprintf(ln+i, "moves=\"\" "); + @skip i += sprintf(ln+i, "moves=\"-DMoveA -DMoveC\" "); i += sprintf(ln+i, "casnam=%s ", fnam); *************** *** 1577,1580 **** --- 1581,1585 ---- int iftch, /* # of initial fetches to do */ int nftch, /* # of fetches to do thereafter */ + int LDTOP, /* 1: load C at top, 0: at bottom */ int pf /* prefetch strategy */ ) *************** *** 1589,1592 **** --- 1594,1600 ---- char ln[1024]; int i; + + if (!LDTOP) + pf |= 512; i = sprintf(ln, "make mmgencase pre=%c muladd=%d lat=%d beta=%d mb=%d nb=%d kb=%d mu=%d nu=%d ku=%d if=%d nf=%d ff=%d, pfA=%d, csC=%d", pre, MACC, lat, beta, nb, nb, nb, mu, nu, ku, iftch, nftch, *************** *** 1603,1607 **** void FillInGMMNode(int verb, ATL_mmnode_t *mmp, char pre, int MACC, int lat, int beta, int nb, int mu, int nu, int ku, ! int fftch, int iftch, int nftch, int pf) /* * Take emit_mm's flags and fill in the standard ATL_mmnode_t struct mmp --- 1611,1615 ---- void FillInGMMNode(int verb, ATL_mmnode_t *mmp, char pre, int MACC, int lat, int beta, int nb, int mu, int nu, int ku, ! int fftch, int iftch, int nftch, int LDTOP, int pf) /* * Take emit_mm's flags and fill in the standard ATL_mmnode_t struct mmp *************** *** 1643,1647 **** mmp->rout[0] = pre; mmp->genstr = GetGmmGenString(verb, pre, MACC, lat, beta, nb, mu, nu, ku, ! fftch, iftch, nftch, pf); mmp->auth = DupString("Whaley/emit_mm"); i = 0; --- 1651,1655 ---- mmp->rout[0] = pre; mmp->genstr = GetGmmGenString(verb, pre, MACC, lat, beta, nb, mu, nu, ku, ! fftch, iftch, nftch, LDTOP, pf); mmp->auth = DupString("Whaley/emit_mm"); i = 0; Index: atlas.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas.base,v retrieving revision 1.211 retrieving revision 1.212 diff -C2 -d -r1.211 -r1.212 *** atlas.base 22 Jan 2010 01:05:55 -0000 1.211 --- atlas.base 22 Jan 2010 17:44:40 -0000 1.212 *************** *** 34739,34742 **** --- 34739,34743 ---- int iftch, /* # of initial fetches to do */ int nftch, /* # of fetches to do thereafter */ + int LDTOP, /* 1: load C at top, else at bottom */ int pf /* prefetch strategy */ ) *************** *** 34755,34759 **** lat = GetGoodLat(MACC, nb, mu, nu, ku, lat0); FillInGMMNode(verb, mmp, pre, MACC, lat, beta, nb, mu, nu, ku, ! fftch, iftch, nftch, pf); mf = TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, beta, -1, -1); } --- 34756,34760 ---- lat = GetGoodLat(MACC, nb, mu, nu, ku, lat0); FillInGMMNode(verb, mmp, pre, MACC, lat, beta, nb, mu, nu, ku, ! fftch, iftch, nftch, LDTOP, pf); mf = TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, beta, -1, -1); } *************** *** 34762,34770 **** lat = GetGoodLat(MACC, nb, mu, nu, 1, lat0); FillInGMMNode(verb, mmp, pre, MACC, lat, beta, nb, mu, nu, 1, ! fftch, iftch, nftch, pf); mf = TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, beta, -1, -1); FillInGMMNode(verb, mmp, pre, MACC, lat0, beta, nb, mu, nu, nb, ! fftch, iftch, nftch, pf); mf1= TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, beta, -1, -1); if (mf >= mf1) --- 34763,34771 ---- lat = GetGoodLat(MACC, nb, mu, nu, 1, lat0); FillInGMMNode(verb, mmp, pre, MACC, lat, beta, nb, mu, nu, 1, ! fftch, iftch, nftch, LDTOP, pf); mf = TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, beta, -1, -1); FillInGMMNode(verb, mmp, pre, MACC, lat0, beta, nb, mu, nu, nb, ! fftch, iftch, nftch, LDTOP, pf); mf1= TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, beta, -1, -1); if (mf >= mf1) *************** *** 34776,34779 **** --- 34777,34806 ---- } + double TryPFs + ( + ATL_mmnode_t *mmp, + char pre, /* precision */ + int verb, /* verbosity level */ + int MACC, /* 0 : separate mult&add, else MACC */ + int lat, /* multiply latency */ + int beta, /* 0,1 beta, else beta=X */ + int nb, /* blocking factor */ + int mu, int nu, int ku, /* unrolling factors */ + int fftch, /* do bogus fetch of C at top of loop? */ + int iftch, /* # of initial fetches to do */ + int nftch, /* # of fetches to do thereafter */ + int LDTOP /* 1: load C at top, else at bottom */ + ) + { + double mf0, mf1; + + mf0 = TryKUs(mmp, pre, verb, MACC, lat, beta, nb, mu, nu, ku, fftch, iftch, + nftch, LDTOP, 0); + mf1 = TryKUs(mmp, pre, verb, MACC, lat, beta, nb, mu, nu, ku, fftch, iftch, + nftch, LDTOP, 1); + mmp->pref = (mf1 > mf0); + return((mmp->pref) ? mf1 : mf0); + } + double FindNumRegsByMACC(char pre, int verb, int nb, int ku, int MACC, int lat, int *NREGS, int *MU, int *NU) *************** *** 34787,34794 **** { GetSafeMUNU(i, MACC, lat, &mu, &nu); ! mf = TryKUs(mmp, pre, verb, MACC, lat, 1, nb, mu, nu, ku, 0, mu+nu, 1, 0); if (verb) printf( ! " nreg=%3d: nb=%2d, mu=%2d, nu=%2d, ku=%2d, MACC=%2d, lat=%2d, mf=%.2f\n", i, nb, mu, nu, mmp->ku, MACC, lat, mf); if (mf > mfB) --- 34814,34822 ---- { GetSafeMUNU(i, MACC, lat, &mu, &nu); ! mf = TryKUs(mmp, pre, verb, MACC, lat, 1, nb, mu, nu, ku, 0, mu+nu, 1, ! 0, 0); if (verb) printf( ! " nreg=%3d: nb=%2d, mu=%2d, nu=%2d, ku=%2d, MACC=%1d, lat=%2d, mf=%.2f\n", i, nb, mu, nu, mmp->ku, MACC, lat, mf); if (mf > mfB) *************** *** 34811,34816 **** } ! int FindNumRegs(char pre, int verb, int nb, int ku, int *MACC, int *lat, ! int *MU, int *NU) /* * Finds an estimate for the number of registers the compiler will let --- 34839,34843 ---- } ! int FindNumRegs(char pre, int verb, int nb, int ku, int *MACC, int *lat) /* * Finds an estimate for the number of registers the compiler will let *************** *** 34833,34837 **** if (!ForceMACC) GetMulAdd(pre, MACC, lat); ! mf = FindNumRegsByMACC(pre, verb, nb, ku, *MACC, *lat, &nregs, MU, NU); /* * Using separate multiply and add is expensive in terms of registers, --- 34860,34864 ---- if (!ForceMACC) GetMulAdd(pre, MACC, lat); ! mf = FindNumRegsByMACC(pre, verb, nb, ku, *MACC, *lat, &nregs, &mu, &nu); /* * Using separate multiply and add is expensive in terms of registers, *************** *** 34847,34852 **** if (mf1 >= mf) /* latency of 1 just as good as longer latency */ { - *MU = mu; - *NU = nu; nregs = nr; *lat = 1; --- 34874,34877 ---- *************** *** 34857,34862 **** if (mfmacc > mf && mfmacc >= mf1) /* MACC is better */ { - *MU = mu; - *NU = nu; nregs = nr; *MACC = 1; --- 34882,34885 ---- *************** *** 34892,34896 **** { int i, L1Elts; ! const int imul = (pre == 'c' || pre == 'z') ? 5 : 3; if (pre == 'd' || pre == 'c') L1Elts = 1024/8; --- 34915,34919 ---- { int i, L1Elts; ! const int imul = (pre == 'c' || pre == 'z') ? 6 : 3; if (pre == 'd' || pre == 'c') L1Elts = 1024/8; *************** *** 34908,34911 **** --- 34931,35303 ---- } + ATL_mmnode_t *FindBestNB + ( + char pre, /* precision, one of s,d,c,z */ + int verb, /* verbosity */ + ATL_mmnode_t *mmp, /* input/output struct for best case found so far */ + int ku /* 0: tune ku, else we must use this ku */ + ) + /* + * This function tries to find the NB to use. It varies NB, prefetch, + * and ku (if allowed, but only between 1 and full unrolling) + * RETURNS: matmul struct of best found case + */ + { + int bN, b0, binc, nbB, muB, nuB, pfB, MACC, lat, KUISKB=0, i; + double mf, mfB, mf1; + + mfB = mmp->mflop[0]; + muB = mmp->mu; + nuB = mmp->nu; + pfB = mmp->pref; + MACC = mmp->muladd; + lat = mmp->lat; + /* + * Find largest block factor to tune; Since L1 estimate may be wrong, + * make sure that larger block factors aren't competitive, but max + * NB will be 80 regardless to avoid cleanup nightmare + */ + if (verb) + printf("\nFINDING UPPER BOUND ON NB:\n"); + bN = GetBigNB(pre); /* our guess for largest useful NB */ + while (bN < 80) + { + mf = TryKUs(mmp, pre, verb, MACC, lat, 1, bN+4, muB, nuB, ku, + 0, muB+nuB, 1, 0, 0); + printf(" nb=%3d, mu=%3d, nu=%3d, ku=%3d, MACC=%d, lat=%d, mf=%.2f\n", + bN+4, muB, nuB, mmp->ku, MACC, lat, mf); + if (mf > mfB) + { + mfB = mf; + nbB = bN+4; + } + else + break; + bN += 4; + } + if (bN > 80) + bN = 80; + if (verb) + printf("NB UPPER BOUND CHOSEN AS : %d (%.2f)\n", bN, mfB); + /* + * See if lowering NB past when all matrices should fit is useful + * (again, L1 detection could be wrong) + */ + if (verb) + printf("\nFINDING LOWER BOUND ON NB:\n"); + b0 = GetSmallNB(pre); + mf1 = TryKUs(mmp, pre, verb, MACC, lat, 1, b0, muB, nuB, ku, + 0, muB+nuB, 1, 0, 0); + printf(" nb=%3d, mu=%3d, nu=%3d, ku=%3d, MACC=%d, lat=%d, mf=%.2f\n", + b0, muB, nuB, mmp->ku, MACC, lat, mf1); + while(b0 > 20) + { + mf = TryKUs(mmp, pre, verb, MACC, lat, 1, b0-4, muB, nuB, ku, + 0, muB+nuB, 1, 0, 0); + printf(" nb=%3d, mu=%3d, nu=%3d, ku=%3d, MACC=%d, lat=%d, mf=%.2f\n", + b0-4, muB, nuB, mmp->ku, MACC, lat, mf); + if (mf < mf1) + break; + else if (mf > mfB) + { + mfB = mf; + nbB = b0-4; + } + b0 -= 4; + } + if (verb) + printf("NB LOWER BOUND CHOSEN AS : %d\n", b0); + + /* + * Now try all NBs with varying prefetch + */ + binc = (pre == 's' || pre == 'c') ? 4 : 2; + KUISKB = (!ku && mmp->ku == mmp->nbB); + b0 = (b0/binc)*binc; + bN = (bN/binc)*binc; + if (verb) + printf("\nFINDING BEST NB AND PREFETCH SETTING IN RANGE [%d,%d,%d]:\n", + b0, bN, binc); + + for (i=b0; i <= bN; i += binc) + { + mf = TryPFs(mmp, pre, verb, MACC, lat, 1, i, muB, nuB, KUISKB ? i:ku, + 0, muB+nuB, 1, 0); + printf( + " nb=%3d, pf=%d, mu=%3d, nu=%3d, ku=%3d, MACC=%d, lat=%d, mf=%.2f\n", + i, mmp->pref, muB, nuB, mmp->ku, MACC, lat, mf); + if (mf > mfB) + { + mfB = mf; + nbB = i; + pfB = mmp->pref; + } + } + if (verb) + printf("BEST NB=%d, BEST PREFETCH=%d (%.2f)\n", nbB, pfB, mfB); + mmp->mflop[0] = mfB; + mmp->mbB = mmp->nbB = mmp->kbB = nbB; + mmp->pref = pfB; + return(mmp); + } + ATL_mmnode_t *FindBestKU + ( + char pre, /* precision, one of s,d,c,z */ + int verb, /* verbosity */ + ATL_mmnode_t *mmp /* input/output struct for best case found so far */ + ) + /* + * Find best K unrolling. There is no data cache dependence here, so time + * with in-cache operands for increases speed and accuracy + */ + { + int k, kuB, latB, kN, incK, lat; + int nb, LAT, MACC, mu, nu, pf; + double mf, mfB; + + LAT = mmp->lat; /* canonical latency */ + MACC = mmp->muladd; + mu = mmp->mu; + nu = mmp->nu; + nb = mmp->nbB; + pf = mmp->pref; + if (verb) + printf("TRYING KUs FOR NB=%d, PF=%d, MU=%d, NU=%d MACC=%d, LAT=%d:\n", + nb, pf, mu, nu, MACC, LAT); + /* + * Try ku=1 as default + */ + kuB = 1; + latB = lat = GetGoodLat(MACC, nb, mu, nu, 1, LAT); + FillInGMMNode(verb, mmp, pre, MACC, lat, 1, nb, mu, nu, 1, + 0, mu+nu, 1, 0, pf); + mfB = TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, 1, -1, 0); + if (verb) + printf(" in-cache KU=%d, lat=%d, mf=%.2f\n", 1, lat, mfB); + /* + * Try NB/2 as maximal unrolling that actually has a loop + */ + k = nb>>1; + lat = GetGoodLat(MACC, nb, mu, nu, k, LAT); + FillInGMMNode(verb, mmp, pre, MACC, lat, 1, nb, mu, nu, k, + 0, mu+nu, 1, 0, pf); + mf = TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, 1, -1, 0); + if (verb) + printf(" in-cache KU=%d, lat=%d, mf=%.2f\n", k, LAT, mf); + if (mf > mfB) + { + mfB = mf; + kuB = nb; + latB = lat; + } + /* + * Try fully unrolled + */ + FillInGMMNode(verb, mmp, pre, MACC, LAT, 1, nb, mu, nu, nb, + 0, mu+nu, 1, 0, pf); + mf = TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, 1, -1, 0); + if (verb) + printf(" in-cache KU=%d, lat=%d, mf=%.2f\n", nb, LAT, mf); + if (mf > mfB) + { + mfB = mf; + kuB = nb; + latB = LAT; + } + /* + * Have already tried 1 & KB, so now try 2, 4, 6, 8 + */ + for (k=2; k <= 8; k += 2) + { + lat = GetGoodLat(MACC, nb, mu, nu, k, LAT); + FillInGMMNode(verb, mmp, pre, MACC, lat, 1, nb, mu, nu, k, + 0, mu+nu, 1, 0, pf); + mf = TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, 1, -1, 0); + if (verb) + printf(" in-cache KU=%d, lat=%d, mf=%.2f\n", k, LAT, mf); + if (mf > mfB) + { + mfB = mf; + kuB = nb; + latB = lat; + } + } + /* + * Try all cases in range [8,nb/2,4] + */ + kN = nb>>1; + if (!mmp->muladd && mmp->lat > 2) + { + incK = mmp->lat; + k = (incK >= 8) ? incK : (8/incK)*incK; + } + else + { + incK = 4; + k = 8; + } + for (; k < kN; k += incK) + { + FillInGMMNode(verb, mmp, pre, MACC, LAT, 1, nb, mu, nu, k, + 0, mu+nu, 1, 0, pf); + mf = TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, 1, -1, 0); + if (verb) + printf(" in-cache KU=%d, lat=%d, mf=%.2f\n", k, LAT, mf); + if (mf > mfB) + { + mfB = mf; + kuB = k; + latB = LAT; + } + } + /* + * Time the best found case out-of-cache so we it can be compared to others + */ + FillInGMMNode(verb, mmp, pre, MACC, latB, 1, nb, mu, nu, kuB, + 0, mu+nu, 1, 0, pf); + mfB = TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, 1, -1, -1); + mmp->mflop[0] = mfB; + if (verb) + printf("BEST KB=%d, lat=%d (%.2f)\n", kuB, latB, mfB); + return(mmp); + } + + ATL_mmnode_t *FindBestRest + ( + char pre, /* precision, one of s,d,c,z */ + int verb, /* verbosity */ + ATL_mmnode_t *mmp /* input/output struct for best case found so far */ + ) /* tunes iftch, nftch, fftch, LDTOP, tries opposite muladd */ + { + return(mmp); + } + + ATL_mmnode_t *FindBestGenGemm + ( + char pre, /* precision, one of s,d,c,z */ + int verb, /* verbosity */ + int nregs, /* max # of registers to use */ + int MACC, /* 1: machine has multiply & accumulate, else separate */ + int lat, /* latency on multiply */ + int FNB, /* is it required to use NB, or can we tune? */ + int NB, /* suggested nb */ + int ku /* 0: tune ku, else we must use this ku */ + ) + /* + * This routine finds the best copy matmul case that can be generated by + * emit_mm.c. It will search over the following parameters: + * (nb,pf), (mu,nu), ku, nftch, iftch, fftch, LDTOP + * + * pf is currently 1 or 0, and it controls whether the next block of A is + * prefetched or not. + * + * LDTOP determines if we load C values before entering the K loop (TOP) + * or after. After gives better error bound, so give it slight advantage + * + * nftch,iftch are crude load scheduling parameters, and they tend to + * have little affect on most machines (the compiler usually reschedules + * the loads on its own). + * + * fftch causes the generator to load C at the top of the loop even + * when we are don't need the values there, so that C is in cache at + * the bottom of the loop when we need it. + * + * RETURNS: filled structure with best gemm case found + */ + { + ATL_mmnode_t *mmp; + int nb, N, Ng, i, j, mu, nu, nbB, muB, nuB; + int *mus, *nus; + double mf, mfB, mf1; + double *fpls; + #ifdef ATL_GAS_x8664 + #define NEXMU 5 + int exmu[NEXMU] = {4, 6, 8, 10, 12}; + #elif defined(ATL_GAS_x8632) + #define NEXMU 3 + int exmu[NEXMU] = {3, 4, 6}; + #endif + + mmp = GetMMNode(); + /* + * Use either required nb, or one that is a multiple of a lot of our + * unrolling factors; Use a big block factor so that our register blocking + * matters more (cache is covering less of costs) + */ + if (FNB) + nb = NB; + else + { + nb = (GetBigNB(pre)/12)*12; + if (nb < 24) + nb = 24; + } + /* + * Get all MU/NU unrollings, Ng of them are competitive on flops/load ratio. + * For x86, always include extra 1-D blockings in mix, even if they + * are not judged competive (because if reg-reg moves aren't free, which + * is true for older x86 machines, 2-D register blocks don't really work + * due to 2-operand assembly) + */ + GetMuNus(nregs, MACC, lat, &Ng, &N, &mus, &nus, &fpls); + free(fpls); + #ifdef NEXMU + for (j=0; j < NEXMU; j++) + { + mu = exmu[j]; + for (i=0; i < Ng; i++) + if (mus[i] == mu && nus[i] == 1) break; + if (i == Ng) + { + mus[Ng] = mu; + nus[Ng] = 1; + Ng++; + } + } + #endif + if (verb) + printf("PROBING FOR M AND N UNROLLING FACTORS:\n"); + /* + * Try all competitive unrolling factors + */ + mfB = 0; + muB = nuB = 1; + for (i=0; i < Ng; i++) + { + mf = TryKUs(mmp, pre, verb, MACC, lat, 1, nb, mus[i], nus[i], ku, + 0, mus[i]+nus[i], 1, 0, 0); + + printf(" nb=%3d, mu=%3d, nu=%3d, ku=%3d, MACC=%d, lat=%d, mf=%.2f\n", + nb, mus[i], nus[i], mmp->ku, MACC, lat, mf); + if (mf > mfB) + { + muB = mus[i]; + nuB = nus[i]; + mfB = mf; + } + } + printf("SELECTED MU=%d, NU=%d (%.2f)\n", muB, nuB, mfB); + free(mus); + free(nus); + nbB = nb; + /* + * If we are allowed, try to tune NB + */ + if (!FNB) + { + FillInGMMNode(verb, mmp, pre, MACC, lat, 1, nb, muB, nuB, 1, + 0, muB+nuB, 1, 0, 0); + mmp->mflop[0] = mfB; + mmp = FindBestNB(pre, verb, mmp, ku); + } + /* + * If we are allowed, tune ku + */ + if (!ku) + mmp = FindBestKU(pre, verb, mmp); /* tunes ku */ + mmp = FindBestRest(pre, verb, mmp); /* tunes iftch, nftch, fftch, LDTOP */ + return(mmp); + } + int main(int nargs, char **args) { *************** *** 34923,34927 **** } if (!nregs) ! nregs = FindNumRegs(pre, verb, nb, ku, &MACC, &lat, &mu, &nu); #if 0 if (MACC < 0) --- 35315,35326 ---- } if (!nregs) ! nregs = FindNumRegs(pre, verb, nb, ku, &MACC, &lat); ! #if 0 ! else ! ConfirmMACC(&MACC, lat); ! #endif ! mmp = FindBestGenGemm(pre, verb, nregs, MACC, lat, FNB, nb, ku); ! WriteMMFileWithPath(pre, "res/", "gMMRES.sum", mmp); ! KillMMNode(mmp); #if 0 if (MACC < 0) *************** *** 34929,34935 **** if (!nb) nb = FindNB(pre, verb, nregs, ku, MACC); - mmp = FindBestGenGemm(pre, verb, nregs, FNB, nb, ku, MACC); - WriteMMFileWithPath(pre, res, "gMMRES.sum", mmp); - KillMMNode(mmp); #endif exit(0); --- 35328,35331 ---- |