[Math-atlas-commits] CVS: AtlasBase/Clint atlas-lvl2.base, 1.24, 1.25 atlas-make.base, 1.191, 1.192
Brought to you by:
rwhaley,
tonyc040457
From: R. C. W. <rw...@us...> - 2009-04-17 17:02:57
|
Update of /cvsroot/math-atlas/AtlasBase/Clint In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv28383/Clint Modified Files: atlas-lvl2.base atlas-make.base atlas-thr.base Log Message: Index: atlas-lvl2.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas-lvl2.base,v retrieving revision 1.24 retrieving revision 1.25 diff -C2 -d -r1.24 -r1.25 *** atlas-lvl2.base 16 Apr 2009 16:58:29 -0000 1.24 --- atlas-lvl2.base 17 Apr 2009 17:02:44 -0000 1.25 *************** *** 7,11 **** @ROUT ! r1test mvtest ATL_gpmvN_32x4_1 ATL_gpmvN_1x1_1a @\ mvscases.dsc mvdcases.dsc mvccases.dsc mvzcases.dsc @\ ! r1scases.dsc r1dcases.dsc r1ccases.dsc r1zcases.dsc @extract -b @(topd)/gen.inc what=cw @(cw99) --- 7,11 ---- @ROUT ! r1test mvtest ATL_gpmvN_32x4_1 ATL_gpmvN_1x1_1a @\ mvscases.dsc mvdcases.dsc mvccases.dsc mvzcases.dsc @\ ! r1scases.dsc r1dcases.dsc r1ccases.dsc r1zcases.dsc mvktime @extract -b @(topd)/gen.inc what=cw @(cw99) *************** *** 10354,10357 **** --- 10354,10362 ---- @ROUT mvktime @extract -b @(topd)/gen.inc what=cw @(cw09) + #include <stdio.h> + #include <stdlib.h> + #include <assert.h> + #include <string.h> + #include "atlas_misc.h" void ATL_UGEMV(ATL_CINT M, ATL_CINT N, const TYPE *A, ATL_CINT lda, *************** *** 10362,10367 **** size_t celts, /* # of elts in cache size we are blocking for */ size_t pgelts, /* # of elts on a virtual mem page (best guess) */ - ATL_CINT yu, /* unrolling on Y's loop by this kernel */ ATL_CINT xu, /* unrolling on X's loop by this kernel */ enum ATLAS_TRANS TA, ATL_CINT M, --- 10367,10372 ---- size_t celts, /* # of elts in cache size we are blocking for */ size_t pgelts, /* # of elts on a virtual mem page (best guess) */ ATL_CINT xu, /* unrolling on X's loop by this kernel */ + ATL_CINT yu, /* unrolling on Y's loop by this kernel */ enum ATLAS_TRANS TA, ATL_CINT M, *************** *** 10370,10382 **** ATL_CINT lda, const TYPE *X, ! SCALAR beta, ! TYPE *y) /* ! @mif = "T * This routine assumes the transpose case, where we write Y in the outer * loop, and apply X to each column of A in the inner loop, and we therefore * cut M in order encourage cache reuse of X @endmif ! @mif = "N * This routine assumes the Notranspose case, where we read X in the outer * loop, and do a axpy with each column of A into Y in the inner loop, --- 10375,10387 ---- ATL_CINT lda, const TYPE *X, ! const TYPE *beta, ! TYPE *Y) /* ! @mif TA = "T * This routine assumes the transpose case, where we write Y in the outer * loop, and apply X to each column of A in the inner loop, and we therefore * cut M in order encourage cache reuse of X @endmif ! @mif TA = "N * This routine assumes the Notranspose case, where we read X in the outer * loop, and do a axpy with each column of A into Y in the inner loop, *************** *** 10385,10388 **** --- 10390,10399 ---- */ { + ATL_INT Mp, m, i; + #ifdef TREAL + #define BETA *beta + #else + #define BETA beta + #endif /* * Compute where to cut M in order to get to reuse the vector in *************** *** 10405,10413 **** m = M - i; m = (m > Mp) ? Mp : m; ! @mif = "T ! ATL_UGEMV(m, N, A+i, lda, X+i, beta, Y); @endmif ! @mif = "N ! ATL_UGEMV(m, N, A+i, lda, X, beta, Y+i); @endmif } --- 10416,10424 ---- m = M - i; m = (m > Mp) ? Mp : m; ! @mif TA = "T ! ATL_UGEMV(m, N, A+i, lda, X+i, BETA, Y); @endmif ! @mif TA = "N ! ATL_UGEMV(m, N, A+i, lda, X, BETA, Y+i); @endmif } *************** *** 10416,10420 **** @undef UR @endwhile ! double *mvtime_OC( int nreps, /* number of reps to do for one timing sample */ ATL_INT flushelts, /* size of area to flush to avoid cache reuse */ --- 10427,10431 ---- @undef UR @endwhile ! double mvtime_OC( int nreps, /* number of reps to do for one timing sample */ ATL_INT flushelts, /* size of area to flush to avoid cache reuse */ *************** *** 10424,10432 **** ATL_INT M, /* # of rows of array A */ ATL_INT N, /* # of cols of array A */ ! ATL_int lda, /* leading dim */ ! SCALAR beta, ! short alignA, /* Alignment required for each operand */ ! short alignX, /* if set to zero, will be native alignment */ ! short alignY) /* cannot be greater than ATL_cachelen */ /* * Times the kernel for out-of-cache (where flushsz sets the cache that it --- 10435,10448 ---- ATL_INT M, /* # of rows of array A */ ATL_INT N, /* # of cols of array A */ ! ATL_INT lda, /* leading dim */ ! TYPE *beta, ! int xu, /* unrolling on X */ ! int yu, /* unrolling on Y */ ! int FAa, /* if (FA. = 0) enforce no alignment */ ! int MAa, /* else force op to be aligned to at least FA bytes */ ! int FAx, /* if MA. != 0, disallow op to be aligned to MA. bytes */ ! int MAx, ! int FAy, ! int MAy) /* * Times the kernel for out-of-cache (where flushsz sets the cache that it *************** *** 10436,10486 **** { double t0, t1; ! ATL_INT Aelts, Xelts, Yelts, i, j; ! size_t addrX, addrY, addrA, setsz, nsets, memsz; ! TYPE *mem, *A, *X, *Y; ! char *cp; ! void *(mvsim)(size_t celts, size_t pgelts, ATL_CINT yu, ATL_CINT xu, enum ATLAS_TRANS TA, ATL_CINT M, ATL_CINT N, const TYPE *A, ! ATL_CINT lda, const TYPE *X, SCALAR beta, TYPE *y); ! mvsim = (TA == AtlasNoTrans || TA == AtlasConj) mvsimN : mvsimT; ! alignA = (alignA) ? alignA : ATL_sizeof; ! alignX = (alignX) ? alignX : ATL_sizeof; ! alignY = (alignY) ? alignY : ATL_sizeof; Aelts = lda * N; Xelts = (TA == AtlasNoTrans || TA == AtlasConj) ? N : M; Yelts = (TA == AtlasNoTrans || TA == AtlasConj) ? M : N; ! addrA = alignA; ! for (addrY=addrA+ATL_MulBySize(Aelts); addrY%alignY; addrY++); ! for (addrX=addrY+ATL_MulBySize(Yelts); addrX%alignX; addrX++); ! for (setsz=addrX+ATL_MulBySize(Xelts); setsz%alignA; setsz++); ! assert(setsz%ATL_sizeof == 0); nsets = (ATL_MulBySize(flushelts)+setsz-1)/setsz; ! mem = malloc(ATL_Cachelen + nsets*setsz); ! assert(mem); /* ! * Make sure all operands start at requested alignment */ ! if (alignA) ! for (addrA=(size_t) mem; addrA%alignA; addrA++); ! else ! addrA = (size_t) mem; ! a = A = (TYPE*) addrA; ! y = Y = (TYPE*) (addrA + addrY); ! x = X = (TYPE*) (addrA + addrY + addrX); ! setsz = ATL_DivBySize(setsz); ! memsz = setsz * nsets; ! for (i=0; i < memsz; i++) ! A[i] = 0.0; j=0; t0 = time00(); ! for (i=nrep; i; i--) { ! mvsim(celts, pgelts, TA, M, N, a, lda, x, beta, y); ! if (++j < nsets) { a += setsz; x += setsz; y += setsz; } else { a = A; x = X; y = Y; } } ! t1 = time00() ! t1 = (t1-t0) / nrep; return(t1); } --- 10452,10576 ---- { double t0, t1; ! TYPE *A, *X, *Y, *a, *x, *y; ! void *vmem; ! void (*mvsim)(size_t celts, size_t pgelts, ATL_CINT xu, ATL_CINT yu, enum ATLAS_TRANS TA, ATL_CINT M, ATL_CINT N, const TYPE *A, ! ATL_CINT lda, const TYPE *X, const TYPE *beta, TYPE *Y); ! ATL_INT Aelts, Xelts, Yelts, setspan, ygap, xgap, agap, pregap, setsz, nsets; ! ATL_INT i, j; ! size_t ptr_st; ! int maxalign; ! mvsim = (TA == AtlasNoTrans || TA == AtlasConj) ? mvsimN : mvsimT; ! /* ! * Find basic length of each operand in elements ! */ Aelts = lda * N; Xelts = (TA == AtlasNoTrans || TA == AtlasConj) ? N : M; Yelts = (TA == AtlasNoTrans || TA == AtlasConj) ? M : N; ! /* ! * Map memory so that we can enforce all required alignments while moving ! * through memory; mem starts with maxalign-aligned memory, so that we can ! * guarantee all further alignments ! */ ! maxalign = (FAx >= FAa) ? FAx : FAa; ! maxalign = (maxalign >= FAy) ? maxalign : FAy; ! if (!maxalign && (MAx | MAy | MAa)) ! { ! maxalign = (MAx >= MAa) ? MAx : MAa; ! maxalign = (maxalign >= MAy) ? maxalign : MAy; ! } ! if (MAx) ! { ! j = (FAx) ? FAx : ATL_sizeof; ! for (i=0; (i % j != 0 || i%MAx == 0); i += j); ! pregap = i; ! } ! else pregap = 0; ! xgap = ATL_MulBySize(Xelts); ! if (FAy || MAy) ! { ! j = (FAy) ? FAy : ATL_sizeof; ! for (i=pregap+xgap; (i%j != 0 || i%MAy == 0); i += j); ! xgap = i - pregap; ! } ! ygap = ATL_MulBySize(Yelts); ! if (FAa || MAa) ! { ! j = (FAa) ? FAa : ATL_sizeof; ! for (i=pregap+xgap+ygap; (i%j != 0 || i%MAa == 0); i += j); ! ygap = i - pregap - xgap; ! } ! agap = ATL_MulBySize(Aelts); ! ! j = pregap; ! for (i=pregap+xgap+ygap+agap; i%maxalign != 0; i++); ! agap = i-xgap-ygap; ! ! setspan = xgap + ygap + agap; ! assert(setspan%ATL_sizeof == 0); ! setsz = ATL_MulBySize(M+N+M*N); nsets = (ATL_MulBySize(flushelts)+setsz-1)/setsz; ! vmem = malloc(maxalign + nsets*setspan); ! assert(vmem); ! for (ptr_st = (size_t)vmem; ptr_st%maxalign; ptr_st++); /* start maxaligned */ ! X = (TYPE*) (ptr_st + pregap); ! Y = (TYPE*) (ptr_st + pregap + xgap); ! A = (TYPE*) (ptr_st + pregap + xgap + ygap); /* ! * Initialize memory from greatest to least; just zero for now */ ! y = (TYPE*) (ptr_st + setspan); ! setspan /= ATL_sizeof; ! for (i=setspan; i; i--) *y-- = ATL_rzero; ! /* ! * Set ptrs to last set in memory ! */ ! A += (nsets-1) * setspan; ! X += (nsets-1) * setspan; ! Y += (nsets-1) * setspan; ! #define DEBUG_FA ! #ifdef DEBUG_FA ! if (FAa) ! assert(((size_t)A)%FAa == 0); ! if (FAx) ! assert(((size_t)X)%FAx == 0); ! if (FAy) ! assert(((size_t)Y)%FAy == 0); ! if (MAa) ! assert(((size_t)A)%MAa != 0); ! if (MAx) ! assert(((size_t)X)%MAx != 0); ! if (MAy) ! assert(((size_t)Y)%MAy != 0); ! #endif ! a = A; ! x = X; ! y = Y; ! j=0; t0 = time00(); ! for (i=nreps; i; i--) { ! mvsim(celts, pgelts, xu, yu, TA, M, N, a, lda, x, beta, y); ! if (++j < nsets) { a -= setspan; x -= setspan; y -= setspan; } else { a = A; x = X; y = Y; } + #ifdef DEBUG_FA + if (FAa) + assert(((size_t)a)%FAa == 0); + if (FAx) + assert(((size_t)x)%FAx == 0); + if (FAy) + assert(((size_t)y)%FAy == 0); + if (MAa) + assert(((size_t)a)%MAa != 0); + if (MAx) + assert(((size_t)x)%MAx != 0); + if (MAy) + assert(((size_t)y)%MAy != 0); + #endif } ! t1 = time00(); ! t1 = (t1-t0) / nreps; return(t1); } *************** *** 10492,10496 **** { double min; ! int imin; for (i=0; i < N-1; i++) { --- 10582,10587 ---- { double min; ! int imin, i, j; ! for (i=0; i < N-1; i++) { *************** *** 10537,10552 **** int DoTimes(ATL_INT flshelts, ATL_INT pgelts, ATL_INT ntim, ATL_INT nrep, ! ATL_INT xu, ATL_INT yu, ! enum ATLAS_TRANS TA, ATL_INT M, ATL_INT N, ATL_INT lda) { double *times; times = malloc(ntim * sizeof(double)); assert(times); ! fprintf(stdout, "GEMV: M=%d, N=%d, lda=%d, alignAXY=[%d,%d,%d], beta=%e:\n", ! M, N, lda, alignA, alignX, alignY, beta); for (i=0; i < ntim; i++) ! times[i] = mvtime_OC(nrep, flshelts, pgelts, TA, M, N, lda, beta, ! alignA, alignX, alignY); SortDoubles(ntim, times); fprintf(fpout, "NSAMPLES=%d, MAX=%.2f, MIN=%.2f, AVG=%.2f, MED=%.2f\n", --- 10628,10653 ---- int DoTimes(ATL_INT flshelts, ATL_INT pgelts, ATL_INT ntim, ATL_INT nrep, ! ATL_INT xu, ATL_INT yu, enum ATLAS_TRANS TA, ATL_INT M, ATL_INT N, ! ATL_INT lda, TYPE *beta, ! int FAa, int MAa, int FAx, int MAx, int FAy, int MAy) { double *times; + int i; + times = malloc(ntim * sizeof(double)); assert(times); ! #ifdef TREAL ! fprintf(stdout, ! "GEMV: M=%d, N=%d, lda=%d, AF=[%d,%d,%d], AM=[%d,%d,%d], beta=%e:\n", ! M, N, lda, FAa, FAx, FAy, MAa, MAx, MAy, *beta); ! #else ! fprintf(stdout, ! "GEMV: M=%d, N=%d, lda=%d, AF=[%d,%d,%d], AM=[%d,%d,%d], beta=[%e,%e]:\n", ! M, N, lda, FAa, FAx, FAy, MAa, MAx, MAy, *beta, beta[1]); ! #endif for (i=0; i < ntim; i++) ! times[i] = mvtime_OC(nrep, flshelts, pgelts, TA, M, N, lda, beta, xu, yu ! FAa, MAa, FAx, MAx, FAy, MAy); SortDoubles(ntim, times); fprintf(fpout, "NSAMPLES=%d, MAX=%.2f, MIN=%.2f, AVG=%.2f, MED=%.2f\n", *************** *** 10572,10575 **** --- 10673,10681 ---- fprintf(stderr, " -# <#> : report # timings (each interval may have multiple calls)\n"); + fprintf(stderr, + " -F[x,y,a] <#> : if(# > 0) -> force op to be aligned to at least # bytes\n"); + fprintf(stderr, + " if(# < 0) -> force op to be aligned to < # bytes.\n"); + fprintf(stderr, " -b <beta> : 2 floats for complex, one for real.\n); exit(i ? i : -1); } *************** *** 10577,10585 **** void GetFlags(int nargs, char **args, ATL_INT *celts, ATL_INT *pgelts, ATL_INT *xu, ATL_INT *yu, ATL_INT *ntim, ATL_INT *nrep, ! ATL_INT *m, ATL_INT *n, ATL_INT *lda) { - int i; - ATL_INT j; double mfF=200.0, flops; #ifdef ATL_PAGESZ --- 10683,10693 ---- void GetFlags(int nargs, char **args, ATL_INT *celts, ATL_INT *pgelts, ATL_INT *xu, ATL_INT *yu, ATL_INT *ntim, ATL_INT *nrep, ! ATL_INT *m, ATL_INT *n, ATL_INT *lda, TYPE *beta, ! int *FAa, int *MAa, int *FAx, int *MAx, *FAy, int *MAy) { double mfF=200.0, flops; + ATL_INT j, h; + int i; + char ch; #ifdef ATL_PAGESZ *************** *** 10594,10597 **** --- 10702,10706 ---- *nrep = *lda = 0; *ntim = 3; + *FAa = *MAa = *FAx = *MAx = *FAy *MAy = 0; for (i=1; i < nargs; i++) *************** *** 10638,10646 **** *lda = atoi(args[i]); break; ! case 'F' : /* set nrep by specifying MFLOPS */ if (++i >= nargs) ! PrintUsage(args[0], "out of flags in -F ", i-1); ! j = atoi(args[i]); ! mfF = j; break; case 'r' : /* set nrep directly as integer */ --- 10747,10795 ---- *lda = atoi(args[i]); break; ! case 'b' : /* set beta */ if (++i >= nargs) ! PrintUsage(args[0], "out of flags in -b ", i-1); ! *beta = atof(args[i]); ! #ifdef TCPLX ! if (++i >= nargs) ! PrintUsage(args[0], "out of flags in -b ", i-1); ! beta[1] = atof(args[i]); ! #endif ! break; ! case 'F' : /* set nrep by specifying MFLOPS, or force alignment */ ! ch = args[i][2]; ! if (!ch) /* specifying MFLOPS */ ! { ! if (++i >= nargs) ! PrintUsage(args[0], "out of flags in -F ", i-1); ! j = atoi(args[i]); ! mfF = j; ! } ! else ! { ! if (ch != 'a' && ch != 'y' && ch != 'x') ! PrintUsage(args[0], args[i], i); ! if (++i >= nargs) ! PrintUsage(args[0], args[i-1], i-1); ! j = atoi(args[i]); ! if (j < 0) ! { ! if (ch == 'a') ! *MAa = -j; ! else if (ch == 'y') ! *MAy = -j; ! else if (ch == 'x') ! *MAx = -j; ! } ! else ! { ! if (ch == 'a') ! *FAa = j; ! else if (ch == 'y') ! *FAy = j; ! else if (ch == 'x') ! *FAx = j; ! } ! } break; case 'r' : /* set nrep directly as integer */ *************** *** 10670,10674 **** { ATL_INT cetls, pgelts, xu, yu, ntim, nrep, m, n, lda; ! GetFlags(nargs, args, &celts, &pgelts, &xu, &yu, &ntim, &nrep, &m, &n, &lda); ! exit(DoTimes(celts, pgelts, ntim, nrep, xu, yu, m, n, lda)); } --- 10819,10832 ---- { ATL_INT cetls, pgelts, xu, yu, ntim, nrep, m, n, lda; ! int FAa, MAa, FAx, MAx, FAy, MAy; /* Force & Max align for ops */ ! #ifdef TREAL ! TYPE beta; ! #else ! TYPE beta[2]; ! #endif ! ! GetFlags(nargs, args, &celts, &pgelts, &xu, &yu, &ntim, &nrep, &m, &n, &lda, ! SADDR beta, &FAa, &MAa, &FAx, &MAx, &FAy, &MAy); ! exit(DoTimes(celts, pgelts, ntim, nrep, xu, yu, m, n, lda, SADDR beta, ! FAa, MAa, FAx, MAx, FAy, MAy)); } Index: atlas-make.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas-make.base,v retrieving revision 1.191 retrieving revision 1.192 diff -C2 -d -r1.191 -r1.192 *** atlas-make.base 5 Apr 2009 17:30:04 -0000 1.191 --- atlas-make.base 17 Apr 2009 17:02:44 -0000 1.192 *************** *** 1350,1353 **** --- 1350,1355 ---- $(ATLRUN) $(MVTdir) x@(pre)mvtst -A T -m $(Mt) -n $(Nt) + @(pre)mvktime.o : $(mySRCdir)/mvktime.c + $(ICC) -c $(ICCFLAGS) -o $@ -D@(typ) $(mySRCdir)/mvktime.c @(pre)mvtest.o : $(mySRCdir)/mvtest.c $(ICC) -c $(ICCFLAGS) -o $@ -D@(typ) $(mySRCdir)/mvtest.c Index: atlas-thr.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas-thr.base,v retrieving revision 1.52 retrieving revision 1.53 diff -C2 -d -r1.52 -r1.53 *** atlas-thr.base 7 Apr 2009 15:57:12 -0000 1.52 --- atlas-thr.base 17 Apr 2009 17:02:44 -0000 1.53 *************** *** 749,752 **** --- 749,755 ---- */ i = ((ptmms[indx].M + 3)>>2)<<2; + if (!(i & (i-1))) + i += 4; + @beginskip for (j=0; j < sizeof(ATL_INT)*8-1; j++) { *************** *** 757,760 **** --- 760,764 ---- } } + @endskip ptmms[indx].ldcw = i; } *************** *** 2036,2039 **** --- 2040,2046 ---- */ ldw = ((N+3)>>2)<<2; + if (!(ldw&(ldw-1))) + ldw += 4; + @beginskip for (i=0; i <= sizeof(ldw)*8; i++) { *************** *** 2044,2047 **** --- 2051,2055 ---- } } + @endskip sz = (ldw*N)<<eltsh; if (sz <= ATL_NTHREADS*ATL_PTMAXMALLOC) *************** *** 2769,2772 **** --- 2777,2783 ---- */ ldcw = ((N+3)>>2)<<2; /* multiple of 4 */ + if (!(ldcw&(ldcw-1))) + ldcw += 4; + @beginskip for (i=0; i < sizeof(ldcw)*8; i++) { *************** *** 2777,2780 **** --- 2788,2792 ---- } } + @endskip if ((ldcw<<eltsh)*N > ATL_PTMAXMALLOC) return(0); |