[Math-atlas-commits] CVS: AtlasBase/Clint atlas-make.base, 1.252, 1.253 atlas-parse.base, 1.49, 1.50 atlas.base, 1.209, 1.210
Brought to you by:
rwhaley,
tonyc040457
From: R. C. W. <rw...@us...> - 2010-01-21 18:04:02
|
Update of /cvsroot/math-atlas/AtlasBase/Clint In directory sfp-cvsdas-1.v30.ch3.sourceforge.com:/tmp/cvs-serv25743/Clint Modified Files: atlas-make.base atlas-parse.base atlas.base Log Message: Index: atlas-make.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas-make.base,v retrieving revision 1.252 retrieving revision 1.253 diff -C2 -d -r1.252 -r1.253 *** atlas-make.base 20 Jan 2010 17:37:53 -0000 1.252 --- atlas-make.base 21 Jan 2010 18:03:50 -0000 1.253 *************** *** 1738,1743 **** - rm -f ATL_tMpFiLe ! @multidef dep @^ scases.dsc@^dcases.dsc@^ccases.dsc@^zcases.dsc @^ @^ xusercomb ! @whiledef rout mmsearch ummsearch usercomb userflag userindex x@(rout) : $(INCAdir)/atlas_type.h @(rout).o @(dep) $(parsedeps) $(XCC) $(XCCFLAGS) -o $@ @(rout).o --- 1738,1743 ---- - rm -f ATL_tMpFiLe ! @multidef dep @^ @^ scases.dsc@^dcases.dsc@^ccases.dsc@^zcases.dsc @^ @^ xusercomb ! @whiledef rout mmsearch gmmsearch ummsearch usercomb userflag userindex x@(rout) : $(INCAdir)/atlas_type.h @(rout).o @(dep) $(parsedeps) $(XCC) $(XCCFLAGS) -o $@ @(rout).o *************** *** 2203,2226 **** # mmgencase : xemit_mm xccobj ! rm -f $(pre)mm.c ! echo "#ifdef BETA0" >> $(pre)mm.c ./xemit_mm -p $(pre) -ta $(ta) -tb $(tb) -muladd $(muladd) -l $(lat) \ -L $(loopO) -M $(mb) -N $(nb) -K $(kb) -m $(mu) -n $(nu) -k $(ku) \ -lda $(lda) -ldb $(ldb) -ldc $(ldc) -csA $(csA) -csB $(csB) -csC $(csC)\ -F0 $(if) -FN $(nf) -FF $(ff) -b 0 -a $(alpha) -Z 1 \ ! -C $(cleanup) -pfA $(pfA) >> $(pre)mm.c ! echo "#elif defined(BETAX)" >> $(pre)mm.c ./xemit_mm -p $(pre) -ta $(ta) -tb $(tb) -muladd $(muladd) -l $(lat) \ -L $(loopO) -M $(mb) -N $(nb) -K $(kb) -m $(mu) -n $(nu) -k $(ku) \ -lda $(lda) -ldb $(ldb) -ldc $(ldc) -csA $(csA) -csB $(csB) -csC $(csC)\ -F0 $(if) -FN $(nf) -FF $(ff) -b 8 -a $(alpha) -Z 1 \ ! -C $(cleanup) -pfA $(pfA) >> $(pre)mm.c ! 
echo "#else" >> $(pre)mm.c ./xemit_mm -p $(pre) -ta $(ta) -tb $(tb) -muladd $(muladd) -l $(lat) \ -L $(loopO) -M $(mb) -N $(nb) -K $(kb) -m $(mu) -n $(nu) -k $(ku) \ -lda $(lda) -ldb $(ldb) -ldc $(ldc) -csA $(csA) -csB $(csB) -csC $(csC)\ -F0 $(if) -FN $(nf) -FF $(ff) -b 1 -a $(alpha) -Z 1 \ ! -C $(cleanup) -pfA $(pfA) >> $(pre)mm.c ! echo "#endif" >> $(pre)mm.c mmcase : xemit_mm xccobj rm -f $(pre)mm.c --- 2203,2226 ---- # mmgencase : xemit_mm xccobj ! rm -f $(pre)gmm.c ! echo "#ifdef BETA0" >> $(pre)gmm.c ./xemit_mm -p $(pre) -ta $(ta) -tb $(tb) -muladd $(muladd) -l $(lat) \ -L $(loopO) -M $(mb) -N $(nb) -K $(kb) -m $(mu) -n $(nu) -k $(ku) \ -lda $(lda) -ldb $(ldb) -ldc $(ldc) -csA $(csA) -csB $(csB) -csC $(csC)\ -F0 $(if) -FN $(nf) -FF $(ff) -b 0 -a $(alpha) -Z 1 \ ! -C $(cleanup) -pfA $(pfA) >> $(pre)gmm.c ! echo "#elif defined(BETAX)" >> $(pre)gmm.c ./xemit_mm -p $(pre) -ta $(ta) -tb $(tb) -muladd $(muladd) -l $(lat) \ -L $(loopO) -M $(mb) -N $(nb) -K $(kb) -m $(mu) -n $(nu) -k $(ku) \ -lda $(lda) -ldb $(ldb) -ldc $(ldc) -csA $(csA) -csB $(csB) -csC $(csC)\ -F0 $(if) -FN $(nf) -FF $(ff) -b 8 -a $(alpha) -Z 1 \ ! -C $(cleanup) -pfA $(pfA) >> $(pre)gmm.c ! echo "#else" >> $(pre)gmm.c ./xemit_mm -p $(pre) -ta $(ta) -tb $(tb) -muladd $(muladd) -l $(lat) \ -L $(loopO) -M $(mb) -N $(nb) -K $(kb) -m $(mu) -n $(nu) -k $(ku) \ -lda $(lda) -ldb $(ldb) -ldc $(ldc) -csA $(csA) -csB $(csB) -csC $(csC)\ -F0 $(if) -FN $(nf) -FF $(ff) -b 1 -a $(alpha) -Z 1 \ ! -C $(cleanup) -pfA $(pfA) >> $(pre)gmm.c ! 
echo "#endif" >> $(pre)gmm.c mmcase : xemit_mm xccobj rm -f $(pre)mm.c Index: atlas-parse.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas-parse.base,v retrieving revision 1.49 retrieving revision 1.50 diff -C2 -d -r1.49 -r1.50 *** atlas-parse.base 20 Jan 2010 17:37:57 -0000 1.49 --- atlas-parse.base 21 Jan 2010 18:03:51 -0000 1.50 *************** *** 1564,1567 **** --- 1564,1658 ---- beta, mflop, cflush); } + + char *GetGmmGenString + ( + int verb, /* verbosity */ + char pre, /* precision */ + int MACC, /* 0 : separate mult&add, else MACC */ + int lat, /* multiply latency */ + int beta, /* 0,1 beta, else beta=X */ + int nb, /* blocking factor */ + int mu, int nu, int ku, /* unrolling factors */ + int Fftch, /* do bogus fetch of C at top of loop? */ + int iftch, /* # of initial fetches to do */ + int nftch, /* # of fetches to do thereafter */ + int pf /* prefetch strategy */ + ) + /* + * returns a string that will result in generating a user-style kernel + * specialized for non-cleanup cases by invoking a make target that + * in turn invokes the scalar generator routine, emit_mm.c + * Because it is specialized for kernel cases, we don't specify leading + * dimensions, transpose cases, etc, but just take the defaults. + */ + { + char ln[1024]; + int i; + i = sprintf(ln, "make mmgencase pre=%c muladd=%d lat=%d beta=%d mb=%d nb=%d kb=%d mu=%d nu=%d ku=%d if=%d nf=%d ff=%d, pfA=%d, csC=%d", + pre, MACC, lat, beta, nb, nb, nb, mu, nu, ku, iftch, nftch, + Fftch, pf, (pre == 'c' || pre == 'z') ? 
2 : 1); + if (verb < 3) + i += sprintf(ln+i, " > /dev/null 2>&1"); + else + i += sprintf(ln+i, "\n"); + if (verb > 2) + printf("genstr='%s'\n", ln); + return(DupString(ln)); + } + + void FillInGMMNode(int verb, ATL_mmnode_t *mmp, char pre, int MACC, int lat, + int beta, int nb, int mu, int nu, int ku, + int fftch, int iftch, int nftch, int pf) + /* + * Take emit_mm's flags and fill in the standard ATL_mmnode_t struct mmp + * making all the correct assumptions for the standard copy code + */ + { + int i; + + if (ku > (nb>>1)) + ku = nb; + mmp->mbB = mmp->nbB = mmp->kbB = nb; + mmp->mu = mu; + mmp->nu = nu; + mmp->ku = ku; + mmp->muladd = MACC; + mmp->lat = lat; + mmp->fftch = fftch; + mmp->iftch = iftch; + mmp->nftch = nftch; + mmp->pref = pf; + mmp->SSE = 0; + mmp->TA = AtlasTrans; + mmp->TB = AtlasNoTrans; + mmp->asmbits = 0; + if (mmp->auth) + free(mmp->auth); + if (mmp->rout) + free(mmp->rout); + if (mmp->comp) + free(mmp->comp); + if (mmp->cflags) + free(mmp->cflags); + if (mmp->str) + free(mmp->str); + if (mmp->genstr) + free(mmp->genstr); + mmp->comp = mmp->cflags = mmp->str = NULL; + mmp->rout = DupString("dgmm.c"); + mmp->rout[0] = pre; + mmp->genstr = GetGmmGenString(verb, pre, MACC, lat, beta, nb, mu, nu, ku, + fftch, iftch, nftch, pf); + mmp->auth = DupString("Whaley/emit_mm"); + i = 0; + SET_FLAG(i, MMF_KUISKB, (ku == nb)); + SET_FLAG(i, MMF_LDISKB, 1); + SET_FLAG(i, MMF_LDAB, 1); + if (pre == 's' || pre == 'c') + SET_FLAG(i, MMF_SINGLE, 1); + if (pre == 'c' || pre == 'z') + SET_FLAG(i, MMF_COMPLEX, 1); + } + #endif /* end guard around atlas_@(rt)testtime.h */ @ROUT mvread Index: atlas.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas.base,v retrieving revision 1.209 retrieving revision 1.210 diff -C2 -d -r1.209 -r1.210 *** atlas.base 20 Jan 2010 17:37:57 -0000 1.209 --- atlas.base 21 Jan 2010 18:03:51 -0000 1.210 *************** *** 996,999 **** --- 996,1008 ---- } + @ROUT 
gmmsearch + #include <stdio.h> + #include <stdlib.h> + #include <assert.h> + #include "atlas_misc.h" + #include "atlas_mmtesttime.h" + + #define MAXLAT 6 + @ROUT mmsearch gmmsearch int GetGoodLat(int MULADD, int kb, int mu, int nu, int ku, int lat) { *************** *** 1016,1026 **** } - @ROUT gmmsearch - #include <stdio.h> - #include <stdlib.h> - #include <assert.h> - #include "atlas_misc.h" - #include "atlas_mmtesttime.h" - @ROUT mmsearch gmmsearch int GetUniqueMuNus(int nregs, int muladd, int lat, int *mus, int *nus) --- 1025,1028 ---- *************** *** 1048,1055 **** } void SortByFlpLd(int N, int *mus, int *nus, double *FPL) /* * Simple selection sort, sorting from best (greatest) flops/load to worst ! * ties in mflop are broken by taking the most square one. */ { --- 1050,1072 ---- } + #ifdef DEBUG + void PrintMUNUs(int N, int *mus, int *nus, double *fpls) + { + int i; + for (i=0; i < N; i++) + { + if (fpls) + printf("%3d. MU=%d, NU=%d, fpl=%.3f\n", i, mus[i], nus[i], fpls[i]); + else + printf("%3d. MU=%d, NU=%d\n", i, mus[i], nus[i]); + } + } + #endif + void SortByFlpLd(int N, int *mus, int *nus, double *FPL) /* * Simple selection sort, sorting from best (greatest) flops/load to worst ! * ties in mflop are broken by taking the most square one, and if they ! * are equally square, then take the one with the bigger mu. */ { *************** *** 1057,1060 **** --- 1074,1081 ---- double fpl, fplB; + #ifdef DEBUG + printf("\nUNSORTED:\n"); + PrintMUNUs(N, mus, nus, NULL); + #endif for (i=0; i < N-1; i++) { *************** *** 1069,1072 **** --- 1090,1094 ---- imax = j; fplB = fpl; + mindimB = (mus[j] <= nus[j]) ? 
mus[j] : nus[j]; } else if (fpl == fplB) *************** *** 1076,1082 **** { imax = j; - fplB = fpl; mindimB = mindim; } } } --- 1098,1111 ---- { imax = j; mindimB = mindim; } + /* + * For symmetric shapes, choose the one with a bigger mu + */ + else if (mindim == mindimB) + { + if (mus[j] > mindim) + imax = j; + } } } *************** *** 1094,1097 **** --- 1123,1130 ---- } FPL[i] = (2.0 * mus[i] * nus[i]) / (mus[i] + nus[i]); + #ifdef DEBUG + printf("\n\nSORTED:\n"); + PrintMUNUs(N, mus, nus, FPL); + #endif } *************** *** 1132,1135 **** --- 1165,1183 ---- return(0); } + + void GetSafeMUNU(int nreg, int muladd, int lat, int *MU, int *NU) + { + int N, Ng, i; + int *mus, *nus; + double *fpls; + + GetMuNus(nreg, muladd, lat, &Ng, &N, &mus, &nus, &fpls); + i = GetSafeGoodMuNu(nreg, muladd, lat, N, mus, nus, fpls); + *MU = mus[i]; + *NU = nus[i]; + free(mus); + free(nus); + free(fpls); + } @ROUT mmsearch *************** *** 34621,34635 **** fprintf(stderr, " -k <ku> : K unrolling factor \n"); fprintf(stderr, " -r <nreg> : number of registers to assume\n"); fprintf(stderr, " -M <muladd> : -1: search 0: separate mul&add : else MACC\n"); exit(ierr ? ierr : -1); } char GetFlags(int nargs, char **args, int *verb, int *nregs, int *nb, ! int *ku, int *MACC) { ! char pre; *MACC = -1; ! *nregs = *nb = *ku = 0; ! pre = d; for (i=1; i < nargs; i++) { --- 34669,34685 ---- fprintf(stderr, " -k <ku> : K unrolling factor \n"); fprintf(stderr, " -r <nreg> : number of registers to assume\n"); + fprintf(stderr, " -l <lat> : multiply latency to assume\n"); fprintf(stderr, " -M <muladd> : -1: search 0: separate mul&add : else MACC\n"); exit(ierr ? ierr : -1); } char GetFlags(int nargs, char **args, int *verb, int *nregs, int *nb, ! int *ku, int *MACC, int *lat) { ! char pre, ch; ! int i; *MACC = -1; ! *lat = *nregs = *nb = *ku = 0; ! 
pre = 'd'; for (i=1; i < nargs; i++) { *************** *** 34656,34659 **** --- 34706,34714 ---- *nb = atoi(args[i]); break; + case 'l': + if (++i >= nargs) + PrintUsage(args[0], i, NULL); + *lat = atoi(args[i]); + break; case 'r': if (++i >= nargs) *************** *** 34670,34715 **** } ! @beginskip ! char *GetGmmGenString ( char pre, /* precision */ int MACC, /* 0 : separate mult&add, else MACC */ ! int lat, /* multiply latency */ int beta, /* 0,1 beta, else beta=X */ int nb, /* blocking factor */ ! int mu, nu, ku, /* unrolling factors */ int iftch, /* # of initial fetches to do */ int nftch, /* # of fetches to do thereafter */ - int Fftch, /* do bogus fetch of C at top of loop? */ int pf /* prefetch strategy */ ) /* ! * returns a string that will result in generating a user-style kernel ! * specialized for non-cleanup cases by invoking a make target that ! * in turn invokes the scalar generator routine, emit_mm.c ! * Because it is specialized for kernel cases, we don't specify leading ! * dimensions, transpose cases, etc, but just take the defaults. */ { ! char ln[1024]; ! sprintf(ln, "make mmgencase pre=%c muladd=%d lat=%d beta=%d mb=%d nb=%d kb=%d mu=%d nu=%d ku=%d if=%d nf=%d ff=%d, pfA=%d, csC=%d\n", ! pre, MACC, lat, beta, nb, nb, nb, mu, nu, ku, iftch, nftch, Fftch, ! pf, (pre == 'c' || pre == 'z') ? 2 : 1); ! return(DupString(ln); } - @endskip int main(int nargs, char **args) { char pre; ! int verb, nregs, FNB, nb, ku, MACC; ATL_mmnode_t *mmp; ! pre = GetFlags(nargs, args, &verb, &nregs, &nb, &ku, &MACC); ! FNB = (nb > 0); /* is NB forced to a particular value? */ if (!nregs) ! nregs = FindNumRegs(pre, verb, nb, ku, MACC); if (MACC < 0) ! MACC = UseMACC(pre, verb, nregs, nb, ku); if (!nb) nb = FindNB(pre, verb, nregs, ku, MACC); --- 34725,34904 ---- } ! double TryKUs ( + ATL_mmnode_t *mmp, char pre, /* precision */ + int verb, /* verbosity level */ int MACC, /* 0 : separate mult&add, else MACC */ ! 
int lat0, /* multiply latency */ int beta, /* 0,1 beta, else beta=X */ int nb, /* blocking factor */ ! int mu, int nu, int ku, /* unrolling factors */ ! int fftch, /* do bogus fetch of C at top of loop? */ int iftch, /* # of initial fetches to do */ int nftch, /* # of fetches to do thereafter */ int pf /* prefetch strategy */ ) /* ! * If ku is set, times only that value, else tries both ku=1 & ku=nb ! * RETURNS: best performance of timed problems, with ku set correctly, ! * but the generator flags may be bad! */ { ! double mf, mf1; ! int lat; ! ! if (ku) ! { ! lat = GetGoodLat(MACC, nb, mu, nu, ku, lat0); ! FillInGMMNode(verb, mmp, pre, MACC, lat, beta, nb, mu, nu, ku, ! fftch, iftch, nftch, pf); ! mf = TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, beta, -1, -1); ! } ! else ! { ! lat = GetGoodLat(MACC, nb, mu, nu, 1, lat0); ! FillInGMMNode(verb, mmp, pre, MACC, lat, beta, nb, mu, nu, 1, ! fftch, iftch, nftch, pf); ! mf = TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, beta, -1, -1); ! ! FillInGMMNode(verb, mmp, pre, MACC, lat0, beta, nb, mu, nu, nb, ! fftch, iftch, nftch, pf); ! mf1= TimeMMKernel(verb, 0, mmp, pre, nb, nb, nb, nb, nb, 0, beta, -1, -1); ! if (mf >= mf1) ! mmp->ku = 1; ! else ! mf = mf1; ! } ! return(mf); ! } ! ! double FindNumRegsByMACC(char pre, int verb, int nb, int ku, int MACC, int lat, ! int *NREGS, int *MU, int *NU) ! { ! int i, mu, nu, muB, nuB, ForceMACC; ! double mf, mfB; ! ATL_mmnode_t *mmp; ! ! mmp = GetMMNode(); ! for (i=8; i < 1024; i *= 2) ! { ! GetSafeMUNU(i, MACC, lat, &mu, &nu); ! mf = TryKUs(mmp, pre, verb, MACC, lat, 1, nb, mu, nu, ku, 0, mu+nu, 1, 0); ! if (verb) ! printf(" nreg=%d: nb=%d, mu=%d, nu=%d, MACC=%d, lat=%d, mf=%.2f\n", ! i, nb, mu, nu, MACC, lat, mf); ! if (mf > mfB) ! { ! mfB = mf; ! muB = mu; ! nuB = nu; ! } ! /* ! * Call a 8% decline in performance evidence of register overflow ! */ ! ! else if (1.08*mf < mfB) ! break; ! } ! *NREGS = i>>1; ! *MU = muB; ! *NU = nuB; ! return(mfB); ! } ! ! 
int FindNumRegs(char pre, int verb, int nb, int ku, int *MACC, int *lat, ! int *MU, int *NU) ! /* ! * Finds an estimate for the number of registers the compiler will let ! * you use in a generated matmul ! */ ! { ! int nregs, nr, ForceMACC, mu, nu; ! double mf, mf1, mfmacc; ! ! ForceMACC = (*MACC >= 0); ! if (ForceMACC && *MACC && !(*lat)) ! { ! fprintf(stderr, ! "If you force no MACC, then you must also force latency!\n"); ! exit(-1); ! } ! assert (pre == 'd' || pre == 's'); ! if (verb) ! printf("\nESTIMATING THE NUMBER OF USEABLE REGISTERS FOR GEMM:\n"); ! if (!ForceMACC) ! GetMulAdd(pre, MACC, lat); ! mf = FindNumRegsByMACC(pre, verb, nb, ku, *MACC, *lat, &nregs, MU, NU); ! /* ! * Using separate multiply and add is expensive in terms of registers, ! * and is often messed up by compilers, so let's try lat=1 (for dynamically ! * scheduled machines), and using a MACC, and see what happens ! */ ! if (!ForceMACC && *MACC == 0) ! { ! if (*lat > 1) ! { ! printf("\n"); ! mf1 = FindNumRegsByMACC(pre, verb, nb, ku, 0, 1, &nr, &mu, &nu); ! if (mf1 >= mf) /* latency of 1 just as good as longer latency */ ! { ! *MU = mu; ! *NU = nu; ! nregs = nr; ! *lat = 1; ! } ! } ! printf("\n"); ! mfmacc = FindNumRegsByMACC(pre, verb, nb, ku, 1, *lat, &nr, &mu, &nu); ! if (mfmacc > mf && mfmacc >= mf1) /* MACC is better */ ! { ! *MU = mu; ! *NU = nu; ! nregs = nr; ! *MACC = 1; ! mf = mf1; ! } ! } ! ! if (verb) ! printf("NUMBER OF ESTIMATED GEMM REGISTERS = %d, MACC=%d, lat=%d:\n", ! nregs, *MACC, *lat); ! return(nregs); ! } ! ! int GetBigNB(char pre) ! { ! int i, L1Elts; ! if (pre == 'd' || pre == 'c') ! L1Elts = 1024/8; ! else if (pre == 's') ! L1Elts = 1024/4; ! else ! L1Elts = 1024/16; ! L1Elts *= GetL1CacheSize(); ! for (i=16; i*i < L1Elts; i += 4); ! return((i*i <= L1Elts) ? i : i-4); } int main(int nargs, char **args) { char pre; ! int verb, nregs, FNB, nb, ku, MACC, lat, mu, nu; ATL_mmnode_t *mmp; ! pre = GetFlags(nargs, args, &verb, &nregs, &nb, &ku, &MACC, &lat); ! if (nb > 0) ! 
FNB = 1; ! else ! { ! nb = GetBigNB(pre); ! FNB = 0; ! } if (!nregs) ! nregs = FindNumRegs(pre, verb, nb, ku, &MACC, &lat, &mu, &nu); ! #if 0 if (MACC < 0) ! MACC = UseMACC(pre, verb, nregs, nb, ku, &lat); if (!nb) nb = FindNB(pre, verb, nregs, ku, MACC); *************** *** 34717,34720 **** --- 34906,34910 ---- WriteMMFileWithPath(pre, res, "gMMRES.sum", mmp); KillMMNode(mmp); + #endif exit(0); } |