[Math-atlas-commits] CVS: AtlasBase/Clint atlas-iaux.base, 1.7, 1.8 atlas-make.base, 1.95, 1.96 atlas.base, 1.94, 1.95
Brought to you by:
rwhaley,
tonyc040457
From: R. C. W. <rw...@us...> - 2007-02-24 23:05:03
|
Update of /cvsroot/math-atlas/AtlasBase/Clint In directory sc8-pr-cvs7.sourceforge.net:/tmp/cvs-serv13344/Clint Modified Files: atlas-iaux.base atlas-make.base atlas.base Log Message: Index: atlas-iaux.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas-iaux.base,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** atlas-iaux.base 23 Feb 2007 14:00:03 -0000 1.7 --- atlas-iaux.base 24 Feb 2007 23:04:54 -0000 1.8 *************** *** 795,798 **** --- 795,829 ---- #endif } + @ROUT ATL_gereflect + void ATL_gereflect(const enum ATLAS_UPLO Uplo, const int N, + TYPE *C, const int ldc) + /* + * If Uplo == Lower, reflect lower triangle into upper, + * If Uplo == Upper, reflect upper triangle into lower. + { + int j; + #ifdef TCPLX + const int ldc2 = ldc+ldc, incC = ldc2+2; + #else + const int incC = ldc+1; + #define ldc2 ldc + #endif + TYPE *pC; + if (Uplo == AtlasLower) + { + for (j=0; j < N-1; j++, C += incC) + Mjoin(PATL,copy)(N-j-1, C+(1 SHIFT), 1, C+ldc2, ldc); + } + else + { + pC = (N*ldc+N-1)SHIFT; + C += ldc2*(N-1); + for (j=0; j < N-1; j++, C -= ldc2, pC -= incC) + Mjoin(PATL,copy)(N-j-1, C, 1, C+ldc2, -ldc); + } + } + #ifdef ldc2 + #undef ldc2 + #endif @ROUT ATL_trscal @extract -b @(topd)/gen.inc what=cw @(ap99) Index: atlas-make.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas-make.base,v retrieving revision 1.95 retrieving revision 1.96 diff -C2 -d -r1.95 -r1.96 *** atlas-make.base 22 Feb 2007 18:27:03 -0000 1.95 --- atlas-make.base 24 Feb 2007 23:04:54 -0000 1.96 *************** *** 3784,3788 **** ATL_@(pre)AgemmNN.o ATL_@(pre)AgemmNT.o ATL_@(pre)AgemmTN.o ATL_@(pre)AgemmTT.o ! ATL_@(pre)mmJIK.o ATL_@(pre)mmIJK.o ATL_@(pre)mmJKI.o ATL_@(pre)mmBPP.o @ptyp d s @whiledef al a1 aX --- 3784,3789 ---- ATL_@(pre)AgemmNN.o ATL_@(pre)AgemmNT.o ATL_@(pre)AgemmTN.o ATL_@(pre)AgemmTT.o ! 
ATL_@(pre)mmJIK.o ATL_@(pre)mmIJK.o ATL_@(pre)mmJKI.o ! ATL_@(pre)mmK.o ATL_@(pre)mmBPP.o ATL_@(pre)mmMNK.o @ptyp d s @whiledef al a1 aX *************** *** 3856,3860 **** @endwhile ! @whiledef rout mmIJK mmJIK mmJKI mmBPP IBJBmm MBJBmm IBNBmm ATL_@(pre)@(rout).o : $(mySRCdir)/ATL_@(rout).c $(@(pre)INCdep) $(ICC) -o $@ -c $(ICCFLAGS) -D@(typ) $(mySRCdir)/ATL_@(rout).c --- 3857,3861 ---- @endwhile ! @whiledef rout mmIJK mmJIK mmJKI mmMNK mmBPP mmK IBJBmm MBJBmm IBNBmm ATL_@(pre)@(rout).o : $(mySRCdir)/ATL_@(rout).c $(@(pre)INCdep) $(ICC) -o $@ -c $(ICCFLAGS) -D@(typ) $(mySRCdir)/ATL_@(rout).c *************** *** 3898,3902 **** @endwhile ! @whiledef rout mmJIK mmIJK mmJKI mmBPP ATL_@(pre)@(rout).o : $(mySRCdir)/ATL_c@(rout).c $(@(pre)INCdep) $(ICC) -o $@ -c $(ICCFLAGS) -D@(typ) $(mySRCdir)/ATL_c@(rout).c --- 3899,3903 ---- @endwhile ! @whiledef rout mmJIK mmIJK mmJKI mmBPP mmK mmMNK ATL_@(pre)@(rout).o : $(mySRCdir)/ATL_c@(rout).c $(@(pre)INCdep) $(ICC) -o $@ -c $(ICCFLAGS) -D@(typ) $(mySRCdir)/ATL_c@(rout).c Index: atlas.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas.base,v retrieving revision 1.94 retrieving revision 1.95 diff -C2 -d -r1.94 -r1.95 *** atlas.base 23 Feb 2007 14:00:03 -0000 1.94 --- atlas.base 24 Feb 2007 23:04:54 -0000 1.95 *************** *** 6674,6678 **** } ! @ROUT ATL_mmBPP @extract -b @(topd)/gen.inc what=cw @(cw07) #include "atlas_misc.h" --- 6674,6678 ---- } ! @ROUT ATL_mmK @extract -b @(topd)/gen.inc what=cw @(cw07) #include "atlas_misc.h" *************** *** 6680,6719 **** #include <stdlib.h> ! @beginskip ! static void mat2blk(int M, int N, const TYPE *A, int lda, TYPE *C, int ldc) /* ! * C(0:M-1,0:N-1) = A(0:M-1,0:N-1) */ { ! int i, j; ! /* ! * Run loop backwards so first cols remain in cache ! */ ! A += lda*(N-1); ! C += ldc*(N-1); ! for (j=N; j; j--, A -= lda, C -= ldc) { ! for (i=M-1; i >= 0; i--) ! C[i] = A[i]; } } ! 
static void mat2blkT(int N, int M, const TYPE *A, int lda, TYPE *C, int ldc) /* ! * Transpose NxM matrix A to MxN C */ { ! int i, j, incC; ! lda -= M; ! incC = 1 - ldc*M; ! for (j=N; j; j--, A += lda, C += incC) { ! for (i=M; i; i--, C += ldc) ! *C = *A++; } } - @endskip int Mjoin(PATL,mmBPP)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, const int M, const int N, const int K, --- 6680,6914 ---- #include <stdlib.h> ! void Mjoin(PATL,mmK)(int M, /* true # of rows in row-panel, M <= MB */ ! int m, /* # of rows to operate on, m >= M */ ! int N, /* true # of cols in col-panel, N < = NB */ ! int n, /* # of cols to operate on, n >= N */ ! int nblk, /* # of blocks in K dimension */ ! int kr, /* kr = K - nKb*KB; */ ! int KR, /* 0 : do not do full KB-call to avoid cleanup */ ! const SCALAR alphaA, /* alpha to apply during A copy */ ! const SCALAR alphaB, /* alpha to apply during B copy */ ! const SCALAR beta, /* beta to apply to C */ ! const TYPE *A, /* array to copy from, NULL if already cp */ ! const int lda, /* leading dimension of A */ ! const int incA, /* inc to next blk in A */ ! TYPE *pA, /* wrkspace to copy A to */ ! const int incAW, /* 0 : keep using same KBxMB space */ ! const TYPE *B, /* array to copy from, NULL if already cp */ ! const int ldb, /* leading dimension of B */ ! const int incB, /* inc to next blk in B */ ! TYPE *pB, /* wrkspace to copy B to */ ! const int incBW, /* 0 : keep using same KBxNB space */ ! TYPE *C, /* output matrix */ ! const int ldc, ! MAT2BLK2 A2blk, /* rout to copy A */ ! MAT2BLK2 B2blk, /* rout to copy B */ ! NBMM0 NBmm0, /* rout to do first mul (applies beta) */ ! NBMM0 NBmm1) /* rout to do later muls (beta=1) */ /* ! * Performs a K-inner-loop matmul, while copying A & B if necessary. ! * If M > m, we are doing extra flops so we don't call cleanup (same for N) */ { ! int k; ! if (nblk) { ! if (B) { B2blk(KB, N, alphaB, B, ldb, pB, KB); B += incB; } ! if (A) { A2blk(KB, M, alphaA, A, lda, pA, KB); A += incA; } ! 
NBmm0(m, n, KB, ATL_rone, pA, KB, pB, KB, beta, C, ldc); ! pA += incAW; pB += incBW; ! for (k = nblk-1; k; k--) ! { ! if (B) { B2blk(KB, N, alphaB, B, ldb, pB, KB); B += incB; } ! if (A) { A2blk(KB, M, alphaA, A, lda, pA, KB); A += incA; } ! NBmm1(m, n, KB, ATL_rone, pA, KB, pB, KB, ATL_rone, C, ldc); ! pA += incAW; pB += incBW; ! } ! } ! if (kr) /* need to cleanup K loop */ ! { ! if (KR) ! { ! if (B) ! { ! B2blk(kr, N, alphaB, B, ldb, pB, KB); ! Mjoin(PATL,gezero)(KB-kr, n, pB+kr, KB); ! } ! if (A) ! { ! A2blk(kr, M, alphaA, A, lda, pA, KB); ! Mjoin(PATL,gezero)(KB-kr, m, pA+kr, KB); ! } ! if (nblk) ! NBmm1(m, n, KB, ATL_rone, pA, KB, pB, KB, ATL_rone, C, ldc); ! else ! NBmm0(m, n, KB, ATL_rone, pA, KB, pB, KB, beta, C, ldc); ! } ! else ! { ! if (B) B2blk(kr, N, alphaB, B, ldb, pB, kr); ! if (A) A2blk(kr, M, alphaA, A, lda, pA, kr); ! Mjoin(PATL,pKBmm)(M, N, kr, ATL_rone, pA, kr, pB, kr, ! nblk ? ATL_rone : beta, C, ldc); ! } } } ! @ROUT ATL_mmMNK ! @extract -b @(topd)/gen.inc what=cw @(cw07) ! #include "atlas_misc.h" ! #include "atlas_lvl3.h" ! #include <stdlib.h> ! ! int Mjoin(PATL,mmMNK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, ! const int M, const int N, const int K, ! const SCALAR alpha, const TYPE *A, const int lda, ! const TYPE *B, const int ldb, const SCALAR beta, ! TYPE *C, const int ldc) /* ! * Copy matmul algorithm, copies A and B on-the-fly */ { ! void *v=NULL; ! const TYPE *a=A; ! TYPE *pA, *pB, *pB0; ! MAT2BLK2 A2blk, B2blk; ! NBMM0 NBmm0, NBmm1; ! int nkblks, nmblks, nnblks, mr, nr, kr, KR, bigK, h, i, j, ZEROC; ! int incAk, incBk, incAm, incBn, incAW, incAWp, incBW, incBWp, incW; ! /* ! * If these workspace intcrements are 0, we do JIT NBxNB copies instead of ! * copying entire array/panel. Don't copy mat if you can't reuse it. ! */ ! incAW = (N > NB) ? KB*MB : 0; ! incBW = (M > NB) ? KB*NB : 0; ! nmblks = M/MB; ! nnblks = N/NB; ! nkblks = K/KB; ! mr = M - nmblks*MB; ! nr = N - nnblks*NB; ! kr = K - nkblks*KB; ! /* ! 
* K-loop is special, in that we don't call user cleanup, must explicitly zero, ! * and K-cleanup is typically slower even for generated kernels. Therefore, ! * allow extra leaway for doing extra flops. Note error is unaffected by ! * any of these extra flops: K-loop has elts zeroed, and multiplying zeros ! * and adding in zeros doesn't add to error ! */ ! KR = (kr && kr+4 >= KB) ? KB : kr; ! bigK = nkblks*KB+KR; ! incAWp = incAW ? KR*mr : 0; ! incBWp = incBW ? KR*nr : 0; ! incW = incBW ? bigK*NB : 0; ! i = (incAW) ? MB*bigK : MB*KB; ! i += (incBW) ? N*bigK : NB*KB; ! i *= sizeof(TYPE); ! if (i <= ATL_MaxMalloc || N <= NB) ! v = malloc(ATL_Cachelen+i); ! if (!v) return(-1); ! pA = v; ! pB0 = pA + (incAW ? bigK*MB : KB*MB); ! if (TA == AtlasNoTrans) { ! A2blk = Mjoin(PATL,gemoveT); ! incAk = lda*KB; ! incAm = MB; } + else + { + A2blk = Mjoin(PATL,gemove); + incAk = KB; + incAm = MB*lda; + } + if (TB == AtlasNoTrans) + { + B2blk = Mjoin(PATL,gemove); + incBk = KB; + incBn = NB*ldb; + } + else + { + B2blk = Mjoin(PATL,gemoveT); + incBk = ldb*KB; + incBn = NB; + } + /* + * See what kernel we're calling + */ + if ( SCALAR_IS_ONE(beta) ) NBmm0 = NBmm_b1; + else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = NBmm_b0; + else NBmm0 = NBmm_bX; + KR = (KR == KB) ? KB : 0; + ZEROC = !(nnblks | nkblks | KR) && SCALAR_IS_ZERO(beta); + + for (i=0; i < nmblks; i++) + { + a = A+i*incAm; + pB = pB0; /* foreach row-panel of A, start at B's copy space */ + for (j=nnblks; j; j--) + { + Mjoin(PATL,mmK)(MB, MB, NB, NB, nkblks, kr, KR, ATL_rone, alpha, beta, + a, lda, incAk, pA, incAW, B, ldb, incBk, pB, incBW, + C, ldc, A2blk, B2blk, NBmm0, NBmm_b1); + B += incBn; /* copy next col panel of B */ + pB += incW; /* to next col panel of pB */ + a = (incAW ? 
NULL : a); /* reuse row-panel of A if copied */ + C += ldc*NB; + } + if (nr) + { + if (ZEROC) + Mjoin(PATL,gezero)(MB, nr, C, ldc); + Mjoin(PATL,mmK)(MB, MB, nr, nr, nkblks, kr, KR, ATL_rone, alpha, beta, + a, lda, incAk, pA, incAW, B, ldb, incBk, pB, incBWp, + C, ldc, A2blk, B2blk, NBmm0, NBmm_b1); + } + C += MB - nnblks*ldc*NB; + if (incBW) + { + B = NULL; /* finished copying B */ + incBn = 0; + } + else + B -= nnblks*incBn; + } + if (mr) + { + a = A + nmblks*incAm; + pB = pB0; + if ( SCALAR_IS_ONE(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b1); + else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b0); + else NBmm0 = Mjoin(PATL,pMBmm_bX); + for (j=nnblks; j; j--) + { + Mjoin(PATL,mmK)(mr, mr, NB, NB, nkblks, kr, KR, ATL_rone, alpha, beta, + a, lda, incAk, pA, incAWp, B, ldb, incBk, pB, incBW, + C, ldc, A2blk, B2blk, NBmm0, Mjoin(PATL,pMBmm_b1)); + B += incBn; /* copy next col panel of B */ + pB += incW; /* to next col panel of pB */ + a = (incAW ? NULL : a); /* reuse row-panel of A if copied */ + C += ldc*NB; + } + if (nr) + { + if ( SCALAR_IS_ZERO(beta) ) + Mjoin(PATL,gezero)(mr, nr, C, ldc); + Mjoin(PATL,mmK)(mr, mr, nr, nr, nkblks, kr, (incAW | incBW) ? KR:0, + ATL_rone, alpha, beta, + a, lda, incAk, pA, incAWp, B, ldb, incBk, pB, incBWp, + C, ldc, A2blk, B2blk, + Mjoin(PATL,pKBmm), Mjoin(PATL,pKBmm)); + } + } + free(v); + return(0); } + @ROUT ATL_mmBPP + @extract -b @(topd)/gen.inc what=cw @(cw07) + #include "atlas_misc.h" + #include "atlas_lvl3.h" + #include <stdlib.h> int Mjoin(PATL,mmBPP)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, const int M, const int N, const int K, *************** *** 6808,6846 **** nblk = K / KB; kr = K - nblk*KB; ! if (nblk) ! { ! B2blk(KB, N, ATL_rone, B, ldb, pB, KB); B += incB; ! A2blk(KB, M, ATL_rone, A, lda, pA, KB); A += incA; ! NBmm0(m, n, KB, ATL_rone, pA, KB, pB, KB, ATL_rzero, pC, ldc); ! for (k = nblk-1; k; k--) ! { ! B2blk(KB, N, ATL_rone, B, ldb, pB, KB); B += incB; ! 
A2blk(KB, M, ATL_rone, A, lda, pA, KB); A += incA; ! NBmm1(m, n, KB, ATL_rone, pA, KB, pB, KB, ATL_rone, pC, ldc); ! } ! } ! if (kr) /* need to cleanup K loop */ ! { ! if (kr+4 >= KB) /* pad with zeros and use full-KB kernel */ ! { ! Mjoin(PATL,gezero)(KB-kr, n, pB+kr, KB); ! Mjoin(PATL,gezero)(KB-kr, m, pA+kr, KB); ! B2blk(kr, N, ATL_rone, B, ldb, pB, KB); ! A2blk(kr, M, ATL_rone, A, lda, pA, KB); ! if (nblk) ! NBmm1(m, n, KB, ATL_rone, pA, KB, pB, KB, ATL_rone, pC, ldc); ! else ! NBmm0(m, n, KB, ATL_rone, pA, KB, pB, KB, ATL_rone, pC, ldc); ! } ! else ! { ! if (!nblk) ! Mjoin(PATL,zero)(ldc*n, pC, 1); ! B2blk(kr, N, ATL_rone, B, ldb, pB, kr); ! A2blk(kr, M, ATL_rone, A, lda, pA, kr); ! Mjoin(PATL,pKBmm)(M, N, kr, ATL_rone, pA, kr, pB, kr, ! nblk ? ATL_rone : ATL_rzero, pC, ldc); ! } ! } Mjoin(PATL,geadd)(M, N, alpha, pC, ldc, beta, C, ldc0); free(vC); --- 7003,7011 ---- nblk = K / KB; kr = K - nblk*KB; ! if (!nblk && kr) ! Mjoin(PATL,zero)(ldc*n, pC, 1); ! Mjoin(PATL,mmK)(M, m, N, n, nblk, kr, (kr && kr+4 >= KB) ? KB : 0, ! ATL_rone, ATL_rone, ATL_rzero, A, lda, incA, pA, 0, ! 
B, ldb, incB, pB, 0, pC, ldc, A2blk, B2blk, NBmm0, NBmm1); Mjoin(PATL,geadd)(M, N, alpha, pC, ldc, beta, C, ldc0); free(vC); *************** *** 17425,17428 **** --- 17590,17595 ---- typedef void (*MAT2BLK)(int, int, const TYPE*, int, TYPE*, const SCALAR); + typedef void (*MAT2BLK2)(const int, const int, const SCALAR, const TYPE*, + const int, TYPE*, const int); typedef void (*MATSCAL)(const int, const int, const SCALAR, TYPE*, const int); typedef void (*PUTBLK)(int, int, TYPE*, TYPE*, int, const SCALAR); *************** *** 17585,17588 **** --- 17752,17760 ---- @endwhile + int Mjoin(PATL,mmMNK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, + const SCALAR alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const SCALAR beta, + TYPE *C, const int ldc); int Mjoin(PATL,mmJIK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, const int M, const int N, const int K, *************** *** 17601,17604 **** --- 17773,17789 ---- TYPE *C, const int ldc); + void Mjoin(PATL,mmK) + (int M, int m, int N, int n, int nblk, int kr, int KR, const SCALAR alphaA, + const SCALAR alphaB, const SCALAR beta, const TYPE *A, const int lda, + const int incA, TYPE *pA, const int incAW, const TYPE *B, const int ldb, + const int incB, TYPE *pB, const int incBW, TYPE *C, const int ldc, + MAT2BLK2 A2blk, MAT2BLK2 B2blk, NBMM0 NBmm0, NBMM0 NBmm1); + + int Mjoin(PATL,mmBPP)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, + const SCALAR alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const SCALAR beta, + TYPE *C, const int ldc); + @whiledef tatb NN NT TN TT |