[Math-atlas-commits] CVS: AtlasBase/Clint atlas-l2k.base, 1.34, 1.35 atlas-lvl2.base, 1.168, 1.169
Brought to you by:
rwhaley,
tonyc040457
From: R. C. W. <rw...@us...> - 2010-07-26 17:04:06
|
Update of /cvsroot/math-atlas/AtlasBase/Clint In directory sfp-cvsdas-4.v30.ch3.sourceforge.com:/tmp/cvs-serv29321/Clint Modified Files: atlas-l2k.base atlas-lvl2.base atlas-make.base Log Message: Index: atlas-l2k.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas-l2k.base,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** atlas-l2k.base 29 Jun 2010 22:38:04 -0000 1.34 --- atlas-l2k.base 26 Jul 2010 17:03:57 -0000 1.35 *************** *** 1007,1011 **** add lda, pA9 @ROUT ATL_zgerk_1x4_sse3 zgemv_1x4_sse2 ! @extract -b @(topd)/cw.inc lang=C -def cwdate 2009 #include <xmmintrin.h> --- 1007,1011 ---- add lda, pA9 @ROUT ATL_zgerk_1x4_sse3 zgemv_1x4_sse2 ! @extract -b @(topd)/cw.inc lang=C -def cwdate 2009 -def cwdate 2010 #include <xmmintrin.h> *************** *** 1027,1040 **** @whiledef sf u static void gerk_sse3@(sf) ! (ATL_CINT M0, ATL_CINT N, const TYPE *alpha, const TYPE *X, ATL_CINT incX, ! const TYPE *Y0, ATL_CINT incY0, TYPE *A0, ATL_CINT lda0) /* * N must be a multiple of 4 */ { ! ATL_CINT lda=lda0+lda0, lda4 = lda<<2, incY = incY0+incY0, M = M0+M0; ! ATL_CINT incY4 = incY<<2; TYPE *A1=A0+lda, *A2=A1+lda, *A3=A2+lda; - const TYPE *Y1 = Y0+incY, *Y2 = Y1+incY, *Y3 = Y2+incY; register __m128d y0a, y0b, y1a, y1b, y2a, y2b, y3a, y3b, x0a, x0b, xn; register __m128d a00, a01, a02, a03; --- 1027,1038 ---- @whiledef sf u static void gerk_sse3@(sf) ! (ATL_CINT M0, ATL_CINT N, const TYPE *X, const TYPE *Y0, ! TYPE *A0, ATL_CINT lda0) /* * N must be a multiple of 4 */ { ! ATL_CINT lda=lda0+lda0, lda4 = lda<<2, M = M0+M0; TYPE *A1=A0+lda, *A2=A1+lda, *A3=A2+lda; register __m128d y0a, y0b, y1a, y1b, y2a, y2b, y3a, y3b, x0a, x0b, xn; register __m128d a00, a01, a02, a03; *************** *** 1043,1047 **** for (j=0; j < N; j += 4, A0 += lda4, A1 += lda4, A2 += lda4, A3 += lda4, ! Y0 += incY4, Y1 += incY4, Y2 += incY4, Y3 += incY4) { a00 = _mm_load@(sf)_pd(A0); --- 1041,1045 ---- for (j=0; j < N; j += 4, A0 += lda4, A1 += lda4, A2 += lda4, A3 += lda4, ! Y0 += 8) { a00 = _mm_load@(sf)_pd(A0); *************** *** 1056,1067 **** y0b = _mm_load1_pd(Y0+1); /* y0b = {Yi, Yi} */ y0b = _mm_mul_sd(y0b, vnone); /* y0b = {Yi,-Yi} */ ! y1a = _mm_load1_pd(Y1); /* y1a = {Yr, Yr} */ ! y1b = _mm_load1_pd(Y1+1); /* y1b = {Yi, Yi} */ y1b = _mm_mul_sd(y1b, vnone); /* y1b = {Yi,-Yi} */ ! y2a = _mm_load1_pd(Y2); /* y2a = {Yr, Yr} */ ! y2b = _mm_load1_pd(Y2+1); /* y2b = {Yi, Yi} */ y2b = _mm_mul_sd(y2b, vnone); /* y2b = {Yi,-Yi} */ ! y3a = _mm_load1_pd(Y3); /* y3a = {Yr, Yr} */ ! y3b = _mm_load1_pd(Y3+1); /* y3b = {Yi, Yi} */ y3b = _mm_mul_sd(y3b, vnone); /* y3b = {Yi,-Yi} */ for (i=2; i < M; i += 2) --- 1054,1065 ---- y0b = _mm_load1_pd(Y0+1); /* y0b = {Yi, Yi} */ y0b = _mm_mul_sd(y0b, vnone); /* y0b = {Yi,-Yi} */ ! y1a = _mm_load1_pd(Y0+2); /* y1a = {Yr, Yr} */ ! y1b = _mm_load1_pd(Y0+3); /* y1b = {Yi, Yi} */ y1b = _mm_mul_sd(y1b, vnone); /* y1b = {Yi,-Yi} */ ! y2a = _mm_load1_pd(Y0+4); /* y2a = {Yr, Yr} */ ! y2b = _mm_load1_pd(Y0+5); /* y2b = {Yi, Yi} */ y2b = _mm_mul_sd(y2b, vnone); /* y2b = {Yi,-Yi} */ ! y3a = _mm_load1_pd(Y0+6); /* y3a = {Yr, Yr} */ ! y3b = _mm_load1_pd(Y0+7); /* y3b = {Yi, Yi} */ y3b = _mm_mul_sd(y3b, vnone); /* y3b = {Yi,-Yi} */ for (i=2; i < M; i += 2) *************** *** 1163,1181 **** @endwhile void ATL_UGERK ! (ATL_CINT M, ATL_CINT N, const TYPE *alpha, const TYPE *X, ATL_CINT incX, ! const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda) { size_t ia = (size_t) A, ix = (size_t) X; ATL_CINT N4 = (N>>2)<<2; if (N4) { if ((ia>>4)<<4 == ia && (ix>>4)<<4 == ix) ! gerk_sse3(M, N4, alpha, X, incX, Y, incY, A, lda); else ! gerk_sse3u(M, N4, alpha, X, incX, Y, incY, A, lda); } if (N4 != N) ! Mjoin(PATL,gerk_axpy)(M, N-N4, alpha, X, 1, Y+incY*(N4 SHIFT), incY, A+N4*(lda SHIFT), lda); --- 1161,1179 ---- @endwhile void ATL_UGERK ! (ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y, TYPE *A, ATL_CINT lda) { size_t ia = (size_t) A, ix = (size_t) X; ATL_CINT N4 = (N>>2)<<2; + const TYPE one = {ATL_rone, ATL_rzero}; if (N4) { if ((ia>>4)<<4 == ia && (ix>>4)<<4 == ix) ! gerk_sse3(M, N4, X, Y, A, lda); else ! gerk_sse3u(M, N4, X, Y, A, lda); } if (N4 != N) ! Mjoin(PATL,gerk_axpy)(M, N-N4, one, X, 1, Y+(N4 SHIFT), 1, A+N4*(lda SHIFT), lda); *************** *** 1337,1341 **** } @ROUT ATL_dgerk_4x8_sse ! @extract -punymacs -b @(topd)/cw.inc lang=C -def cwdate 2009 #include <xmmintrin.h> #include <stdio.h> --- 1335,1339 ---- } @ROUT ATL_dgerk_4x8_sse ! @extract -punymacs -b @(topd)/cw.inc lang=C -def cwdate 2009 -def cwdate 2010 #include <xmmintrin.h> #include <stdio.h> *************** *** 1343,1349 **** void ATL_UGERK ! (ATL_CINT M, ATL_CINT N, const SCALAR alpha, ! const TYPE *X, ATL_CINT incX, const TYPE *Y, ! ATL_CINT incY1, TYPE *A, ATL_CINT lda1) {/* BEGIN GER: nMU=1, MU=8, NU=4 */ ATL_INT i, j; --- 1341,1346 ---- void ATL_UGERK ! (ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y, ! TYPE *A, ATL_CINT lda1) {/* BEGIN GER: nMU=1, MU=8, NU=4 */ ATL_INT i, j; *************** *** 1351,1363 **** ATL_CINT MAp = ( ((size_t)A)&(15) ) / sizeof(TYPE); ATL_CINT MA=M-MAp; ! ATL_CINT M8=((MA/8)*8)+MAp, M2=((MA>>1)<<1)+MAp, N4=((N/4)*4), lda2=lda1+lda1, incY2=incY1+incY1, lda3=lda2+lda1, incY3=incY2+incY1, lda4=lda3+lda1, incY4=incY3+incY1; __m128d x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, a0_0, m0_0, a1_0, m1_0, a2_0, m2_0, a3_0, m3_0, a4_0, m4_0, a5_0, m5_0, a6_0, m6_0, a7_0, m7_0, a0_1, m0_1, a1_1, m1_1, a2_1, m2_1, a3_1, m3_1, a4_1, m4_1, a5_1, m5_1, a6_1, m6_1, a7_1, m7_1, a0_2, m0_2, a1_2, m1_2, a2_2, m2_2, a3_2, m3_2, a4_2, m4_2, a5_2, m5_2, a6_2, m6_2, a7_2, m7_2, a0_3, m0_3, a1_3, m1_3, a2_3, m2_3, a3_3, m3_3, a4_3, m4_3, a5_3, m5_3, a6_3, m6_3, a7_3, m7_3; ! for (j=0; j < N4; j += 4, A += lda4, Y += incY4) {/* BEGIN N-LOOP UR=4 */ y0 = _mm_load1_pd(Y); ! y1 = _mm_load1_pd(Y+incY1); ! y2 = _mm_load1_pd(Y+incY2); ! y3 = _mm_load1_pd(Y+incY3); if (MAp) {/* peel to force X/A alignment */ --- 1348,1360 ---- ATL_CINT MAp = ( ((size_t)A)&(15) ) / sizeof(TYPE); ATL_CINT MA=M-MAp; ! ATL_CINT M8=((MA/8)*8)+MAp, M2=((MA>>1)<<1)+MAp, N4=((N/4)*4), lda2=lda1+lda1, lda3=lda2+lda1, lda4=lda3+lda1; __m128d x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, a0_0, m0_0, a1_0, m1_0, a2_0, m2_0, a3_0, m3_0, a4_0, m4_0, a5_0, m5_0, a6_0, m6_0, a7_0, m7_0, a0_1, m0_1, a1_1, m1_1, a2_1, m2_1, a3_1, m3_1, a4_1, m4_1, a5_1, m5_1, a6_1, m6_1, a7_1, m7_1, a0_2, m0_2, a1_2, m1_2, a2_2, m2_2, a3_2, m3_2, a4_2, m4_2, a5_2, m5_2, a6_2, m6_2, a7_2, m7_2, a0_3, m0_3, a1_3, m1_3, a2_3, m2_3, a3_3, m3_3, a4_3, m4_3, a5_3, m5_3, a6_3, m6_3, a7_3, m7_3; ! for (j=0; j < N4; j += 4, A += lda4, Y += 4) {/* BEGIN N-LOOP UR=4 */ y0 = _mm_load1_pd(Y); ! y1 = _mm_load1_pd(Y+1); ! y2 = _mm_load1_pd(Y+2); ! y3 = _mm_load1_pd(Y+3); if (MAp) {/* peel to force X/A alignment */ *************** *** 1503,1507 **** }/* END N-LOOP UR=4 */ ! for (j=N4; j < N; j += 1, A += lda1, Y += incY1) {/* BEGIN N-LOOP UR=1 */ y0 = _mm_load1_pd(Y); --- 1500,1504 ---- }/* END N-LOOP UR=4 */ ! for (j=N4; j < N; j += 1, A += lda1, Y++) {/* BEGIN N-LOOP UR=1 */ y0 = _mm_load1_pd(Y); *************** *** 1572,1576 **** #endif @ROUT ATL_cgerk_2x1p ! @extract -punymacs -b @(topd)/cw.inc lang=C -def cwdate 1999 -def cwdate 2009 #include "atlas_misc.h" #include "atlas_lvl2.h" --- 1569,1574 ---- #endif @ROUT ATL_cgerk_2x1p ! @multidef cwdate 1999 2009 2010 ! @extract -punymacs -b @(topd)/cw.inc lang=C #include "atlas_misc.h" #include "atlas_lvl2.h" *************** *** 1578,1587 **** void ATL_UGERK ! (ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *X, ATL_CINT incX, ! const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda) { ATL_CINT mr = M - ((M>>1)<<1); ! ATL_CINT incA = ((lda-M+mr)<<1), incy = incY+incY, lda2=lda+lda; ! const TYPE *stY = Y + N*incy, *stX = X + ((M>>1)<<2), *x; register TYPE ry, iy, rx, ix, ra, ia; --- 1576,1584 ---- void ATL_UGERK ! (ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y, TYPE *A, ATL_CINT lda) { ATL_CINT mr = M - ((M>>1)<<1); ! ATL_CINT incA = ((lda-M+mr)<<1), lda2=lda+lda; ! const TYPE *stY = Y + N+N, *stX = X + ((M>>1)<<2), *x; register TYPE ry, iy, rx, ix, ra, ia; *************** *** 1632,1636 **** A[1] = ia; } ! L1: Y += incy; A += incA; } --- 1629,1633 ---- A[1] = ia; } ! L1: Y += 2; A += incA; } *************** *** 1651,1660 **** *A = ra; A[1] = ia; ! Y += incy; A += lda2; } while (Y != stY); #else ! Mjoin(PATL,axpy)(N, X, Y, incY, A, lda); #endif } --- 1648,1657 ---- *A = ra; A[1] = ia; ! Y += 2; A += lda2; } while (Y != stY); #else ! Mjoin(PATL,axpy)(N, X, Y, 1, A, lda); #endif } *************** *** 1662,1675 **** } @ROUT ATL_cgerk_axpy ! @extract -punymacs -b @(topd)/cw.inc lang=C -def cwdate 1999 -def cwdate 2009 #include "atlas_misc.h" #include "atlas_lvl2.h" ! void ATL_UGERK(ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *X, ! ATL_CINT incX, const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda) { ! ATL_CINT incy = incY<<1, lda2 = lda<<1; ! const TYPE *stY = Y + N*incy; TYPE y[2]; do --- 1659,1672 ---- } @ROUT ATL_cgerk_axpy ! @multidef cwdate 1999 2009 2010 ! @extract -punymacs -b @(topd)/cw.inc lang=C #include "atlas_misc.h" #include "atlas_lvl2.h" ! void ATL_UGERK(ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y, TYPE *A, ATL_CINT lda) { ! ATL_CINT lda2 = lda<<1; ! const TYPE *stY = Y + N+N; TYPE y[2]; do *************** *** 1682,1686 **** #endif Mjoin(PATL,axpy)(M, y, X, 1, A, 1); ! Y += incy; A += lda2; } --- 1679,1683 ---- #endif Mjoin(PATL,axpy)(M, y, X, 1, A, 1); ! Y += 2; A += lda2; } *************** *** 1694,1699 **** void ATL_UGERK ! (ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *X, ATL_CINT incX, ! const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda) { @ROUT ATL_gerk_1x4_0 --- 1691,1695 ---- void ATL_UGERK ! (ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y, TYPE *A, ATL_CINT lda) { @ROUT ATL_gerk_1x4_0 *************** *** 1701,1706 **** const TYPE *x; TYPE *A0 = A, *A1 = A + lda, *A2 = A1 + lda, *A3 = A2 + lda; ! const TYPE *Y1 = Y+incY, *Y2 = Y1+incY, *Y3 = Y2+incY; ! ATL_CINT N4 = (N>>2)<<2, incAn = (lda<<2) - M + 1, incY4 = incY<<2; register TYPE m0, m1, m2, m3, x0, y0, y1, y2, y3; --- 1697,1701 ---- const TYPE *x; TYPE *A0 = A, *A1 = A + lda, *A2 = A1 + lda, *A3 = A2 + lda; ! ATL_CINT N4 = (N>>2)<<2, incAn = (lda<<2) - M + 1; register TYPE m0, m1, m2, m3, x0, y0, y1, y2, y3; *************** *** 1709,1716 **** for (j=N4; j; j -= 4) { ! y0 = *Y; Y += incY4; ! y1 = *Y1; Y1 += incY4; ! y2 = *Y2; Y2 += incY4; ! y3 = *Y3; Y3 += incY4; x = X; x0 = *X; x = X + 1; --- 1704,1712 ---- for (j=N4; j; j -= 4) { ! y0 = *Y; ! y1 = Y[1]; ! y2 = Y[2]; ! y3 = Y[3]; ! Y += 4; x = X; x0 = *X; x = X + 1; *************** *** 1733,1744 **** } if (N-N4) ! Mjoin(PATL,gerk_axpy)(M, N-N4, alpha, X, incX, Y, incY, A0, lda); } else ! Mjoin(PATL,gerk_Mlt16)(M, N, alpha, X, incX, Y, incY, A, lda); @ROUT ATL_gerk_8x4_0 int i, j; - ATL_CINT incy = incY<<2; - const TYPE *Y1= Y + incY, *Y2 = Y1 + incY, *Y3 = Y2 + incY; const TYPE *x; TYPE *A0 = A, *A1 = A + lda, *A2 = A1 + lda, *A3 = A2 + lda; --- 1729,1738 ---- } if (N-N4) ! Mjoin(PATL,gerk_axpy)(M, N-N4, ATL_rone, X, 1, Y, 1, A0, lda); } else ! Mjoin(PATL,gerk_Mlt16)(M, N, ATL_rone, X, 1, Y, 1, A, lda); @ROUT ATL_gerk_8x4_0 int i, j; const TYPE *x; TYPE *A0 = A, *A1 = A + lda, *A2 = A1 + lda, *A3 = A2 + lda; *************** *** 1751,1760 **** for (j=N4; j; j -= 4) { ! y0 = *Y; y1 = *Y1; y2 = *Y2; y3 = *Y3; x0 = *X; x = X + 1; ! m0 = y0 * x0; Y += incy; ! m1 = y1 * x0; Y1 += incy; ! m2 = y2 * x0; Y2 += incy; ! m3 = y3 * x0; Y3 += incy; for (i=M8; i; i -= 8) { --- 1745,1754 ---- for (j=N4; j; j -= 4) { ! y0 = *Y; y1 = Y[1]; y2 = Y[2]; y3 = Y[3]; x0 = *X; x = X + 1; ! m0 = y0 * x0; Y += 4; ! m1 = y1 * x0; ! m2 = y2 * x0; ! m3 = y3 * x0; for (i=M8; i; i -= 8) { *************** *** 1817,1828 **** } if (N-N4) ! Mjoin(PATL,gerk_axpy)(M, N-N4, alpha, X, incX, Y, incY, A0, lda); } else ! Mjoin(PATL,gerk_Mlt16)(M, N, alpha, X, incX, Y, incY, A, lda); @ROUT ATL_gerk_4x4_1 ATL_INT i, j; - ATL_CINT incy = incY<<2; - const TYPE *Y1= Y + incY, *Y2 = Y1 + incY, *Y3 = Y2 + incY; const TYPE *x; TYPE *A0 = A, *A1 = A + lda, *A2 = A1 + lda, *A3 = A2 + lda; --- 1811,1820 ---- } if (N-N4) ! Mjoin(PATL,gerk_axpy)(M, N-N4, ATL_rone, X, 1, Y, 1, A0, lda); } else ! Mjoin(PATL,gerk_Mlt16)(M, N, ATL_rone, X, 1, Y, 1, A, lda); @ROUT ATL_gerk_4x4_1 ATL_INT i, j; const TYPE *x; TYPE *A0 = A, *A1 = A + lda, *A2 = A1 + lda, *A3 = A2 + lda; *************** *** 1834,1841 **** for (j=N4; j; j -= 4) { ! y0 = *Y; Y += incy; ! y1 = *Y1; Y1 += incy; ! y2 = *Y2; Y2 += incy; ! y3 = *Y3; Y3 += incy; x = X; for (i=M4; i; i -= 4) --- 1826,1833 ---- for (j=N4; j; j -= 4) { ! y0 = *Y; ! y1 = Y[1]; ! y2 = Y[2]; ! y3 = Y[3]; Y += 4; x = X; for (i=M4; i; i -= 4) *************** *** 1902,1911 **** } if (N-N4) ! Mjoin(PATL,gerk_axpy)(M, N-N4, alpha, X, incX, Y, incY, A0, lda); } else ! Mjoin(PATL,gerk_Mlt16)(M, N, alpha, X, incX, Y, incY, A, lda); @ROUT ATL_gerk_axpy ! const TYPE *stY = Y + N*incY; @beginskip #ifdef ATL_AltiVec --- 1894,1903 ---- } if (N-N4) ! Mjoin(PATL,gerk_axpy)(M, N-N4, ATL_rone, X, 1, Y, 1, A0, lda); } else ! Mjoin(PATL,gerk_Mlt16)(M, N, ATL_rone, X, 1, Y, 1, A, lda); @ROUT ATL_gerk_axpy ! const TYPE *stY = Y + N; @beginskip #ifdef ATL_AltiVec *************** *** 1925,1929 **** @endskip Mjoin(PATL,axpy)(M, *Y, X, 1, A, 1); ! Y += incY; A += lda; } --- 1917,1921 ---- @endskip Mjoin(PATL,axpy)(M, *Y, X, 1, A, 1); ! Y++; A += lda; } *************** *** 1931,1935 **** } else ! Mjoin(PATL,gerk_Mlt16)(M, N, alpha, X, incX, Y, incY, A, lda); @ROUT ATL_gerk_axpy ATL_gerk_4x4_1 ATL_gerk_8x4_0 ATL_gerk_1x4_0 } --- 1923,1927 ---- } else ! Mjoin(PATL,gerk_Mlt16)(M, N, ATL_rone, X, 1, Y, 1, A, lda); @ROUT ATL_gerk_axpy ATL_gerk_4x4_1 ATL_gerk_8x4_0 ATL_gerk_1x4_0 } *************** *** 1938,1945 **** void ATL_UGER2K ! (ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *X0, ! ATL_CINT incX0, const TYPE *Y0, ATL_CINT incY0, const SCALAR alpha1, ! const TYPE *X1, ATL_CINT incX1, const TYPE *Y1, ATL_CINT incY1, ! TYPE *A, ATL_CINT lda) { register TYPE y0, y1; --- 1930,1935 ---- void ATL_UGER2K ! (ATL_CINT M, ATL_CINT N, const TYPE *X0, const TYPE *Y0, ! const TYPE *X1, const TYPE *Y1, TYPE *A, ATL_CINT lda) { register TYPE y0, y1; *************** *** 1948,1953 **** for (j=0; j < N; j++) { ! y0 = Y0[incY0*j]; ! y1 = Y1[incY1*j]; for (i=0; i < M; i++) A[i+j*lda] += X0[i] * y0 + X1[i] * y1; --- 1938,1943 ---- for (j=0; j < N; j++) { ! y0 = Y0[j]; ! y1 = Y1[j]; for (i=0; i < M; i++) A[i+j*lda] += X0[i] * y0 + X1[i] * y1; *************** *** 1958,1970 **** void ATL_UGER2K ! (ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *X0, ! ATL_CINT incX0, const TYPE *Y0, ATL_CINT incY0, const SCALAR alpha1, ! const TYPE *X1, ATL_CINT incX1, const TYPE *Y1, ATL_CINT incY1, ! TYPE *A, ATL_CINT lda) { const TYPE *x0, *x1; register TYPE y0r, y0i, y1r, y1i, x0r, x0i, x1r, x1i; register ATL_INT i, j; - const ATL_INT incY02 = incY0+incY0, incY12 = incY1+incY1; const ATL_INT incA = (lda-M)<<1; --- 1948,1957 ---- void ATL_UGER2K ! (ATL_CINT M, ATL_CINT N, const TYPE *X0, const TYPE *Y0, ! const TYPE *X1, const TYPE *Y1, TYPE *A, ATL_CINT lda) { const TYPE *x0, *x1; register TYPE y0r, y0i, y1r, y1i, x0r, x0i, x1r, x1i; register ATL_INT i, j; const ATL_INT incA = (lda-M)<<1; *************** *** 1973,1980 **** y0r = *Y0; y0i = Y0[1]; ! Y0 += incY02; y1r = *Y1; y1i = Y1[1]; ! Y1 += incY12; x0 = X0; x1 = X1; --- 1960,1967 ---- y0r = *Y0; y0i = Y0[1]; ! Y0 += 2; y1r = *Y1; y1i = Y1[1]; ! Y1 += 2; x0 = X0; x1 = X1; *************** *** 2030,2036 **** void ATL_UGER2K ! (ATL_CINT M, ATL_CINT N, const TYPE alpha, const TYPE *X, ATL_CINT incX, ! const TYPE *Y, ATL_CINT incY, const TYPE beta, const TYPE *W, ! ATL_CINT incW, const TYPE *Z, ATL_CINT incZ, TYPE *A, ATL_CINT lda) /* * A += xy + wz --- 2017,2022 ---- void ATL_UGER2K ! (ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y, const TYPE *W, ! const TYPE *Z, TYPE *A, ATL_CINT lda) /* * A += xy + wz *************** *** 2100,2112 **** #define N %rsi /* already in */ #define II %rax /* loaded in loop */ ! #define pX %rcx /* already in */ ! #define pA0 %rdx /* 56(%rsp) */ #define pA1 %rbx /* computed */ ! #define pY %r9 /* already in */ ! #define incY %r8 /* 8(%rsp) */ ! #define pZ %rbp /* 40(%rsp) */ ! #define lda %r10 /* 64(%rsp) */ ! #define pW %r11 /* 24(%rsp) */ ! #define incZ %r12 /* 48(%rsp) */ #define a0 %xmm0 --- 2086,2096 ---- #define N %rsi /* already in */ #define II %rax /* loaded in loop */ ! #define pX %rdx /* already in */ ! #define pA0 %r11 /* 56(%rsp) */ #define pA1 %rbx /* computed */ ! #define pY %rcx /* already in */ ! #define pZ %r9 /* already in */ ! #define lda %r10 /* 16(%rsp) */ ! #define pW %r8 /* already in */ #define a0 %xmm0 *************** *** 2128,2139 **** /* void ATL_UGER2K ! %rdi %rsi %rdx %rcx ! (ATL_CINT M, ATL_CINT N, const TYPE *alpha0, const TYPE *X, ! %r8 %r9 8(%rsp) 16(%rsp) ! ATL_CINT incX, const TYPE *Y, ATL_CINT incY, const TYPE *beta, ! 24(%rsp) 32(%rsp) 40(%rsp) 48(%rsp) ! const TYPE *W, ATL_CINT incW, const TYPE *Z, ATL_CINT incZ, ! 56(%rsp) 64(%rsp) ! TYPE *A, ATL_CINT lda) */ .text --- 2112,2119 ---- /* void ATL_UGER2K ! %rdi %rsi %rdx %rcx ! (ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y, ! %r8 %r9 8(%rsp) 16(%rsp) ! const TYPE *W, const TYPE *Z, TYPE *A, ATL_CINT lda); */ .text *************** *** 2151,2177 **** movapd -24(%rsp), vposneg /* vposneg = {1.0, -1.0} */ /* ! * Save callee-saved iregs */ ! movq %rbp, -8(%rsp) ! movq %rbx, -16(%rsp) ! movq %r12, -24(%rsp) ! #if 0 ! movq %r13, -32(%rsp) ! movq %r14, -40(%rsp) ! movq %r15, -48(%rsp) ! #endif /* * Load & compute all integer variables */ ! movslq 64(%rsp), lda shl $4, lda /* lda *= sizeof */ ! movq 56(%rsp), pA0 lea (pA0, lda), pA1 /* pA1 = pA0 + lda */ - movslq 8(%rsp), incY - shl $4, incY /* incY *= sizeof */ - movq 40(%rsp), pZ - movq 24(%rsp), pW - movslq 48(%rsp), incZ - shl $4, incZ /* incZ *= sizeof */ lea -2(M,M), M /* M = 2(M-1) */ --- 2131,2144 ---- movapd -24(%rsp), vposneg /* vposneg = {1.0, -1.0} */ /* ! * Save callee-saved regs */ ! movq %rbx, -8(%rsp) /* * Load & compute all integer variables */ ! movslq 16(%rsp), lda shl $4, lda /* lda *= sizeof */ ! movq 8(%rsp), pA0 lea (pA0, lda), pA1 /* pA1 = pA0 + lda */ lea -2(M,M), M /* M = 2(M-1) */ *************** *** 2189,2205 **** movddup 8(pY), y0i /* y0i = {y0i, y0i} */ mulpd vposneg, y0i /* y0i = {y0i,-y0i} */ ! movddup (pY,incY), y1r /* y1r = {y1r, y1r} */ ! movddup 8(pY,incY), y1i /* y1i = {y1i, y1i} */ mulpd vposneg, y1i /* y0i = {y1i,-y1i} */ movapd (pX,M,8), x0 /* x0 = {x0i, x0r} */ ! lea (pY,incY,2), pY /* pY += incY*2; */ # movddup (pZ), z0r /* z0r = {z0r, z0r} */ movddup 8(pZ), z0i /* z0i = {z0i, z0i} */ mulpd vposneg, z0i /* z0i = {z0i,-z0i} */ ! movddup (pZ,incZ), z1r /* z1r = {z1r, z1r} */ ! movddup 8(pZ,incZ), z1i /* z1i = {z1i, z1i} */ mulpd vposneg, z1i /* z0i = {z1i,-z1i} */ ! lea (pZ,incZ,2), pZ /* pZ += incZ*2; */ pshufd $0x4E, x0, revx0 /* revx0 = {x0r, x0i} */ mov M, II --- 2156,2172 ---- movddup 8(pY), y0i /* y0i = {y0i, y0i} */ mulpd vposneg, y0i /* y0i = {y0i,-y0i} */ ! movddup 16(pY), y1r /* y1r = {y1r, y1r} */ ! movddup 24(pY), y1i /* y1i = {y1i, y1i} */ mulpd vposneg, y1i /* y0i = {y1i,-y1i} */ movapd (pX,M,8), x0 /* x0 = {x0i, x0r} */ ! add $32, pY # movddup (pZ), z0r /* z0r = {z0r, z0r} */ movddup 8(pZ), z0i /* z0i = {z0i, z0i} */ mulpd vposneg, z0i /* z0i = {z0i,-z0i} */ ! movddup 16(pZ), z1r /* z1r = {z1r, z1r} */ ! movddup 24(pZ), z1i /* z1i = {z1i, z1i} */ mulpd vposneg, z1i /* z0i = {z1i,-z1i} */ ! add $32, pZ pshufd $0x4E, x0, revx0 /* revx0 = {x0r, x0i} */ mov M, II *************** *** 2302,2308 **** * EPILOGUE: restore registers and return */ ! movq -8(%rsp), %rbp ! movq -16(%rsp), %rbx ! movq -24(%rsp), %r12 #if 0 movq %r13, -32(%rsp), %r13 --- 2269,2273 ---- * EPILOGUE: restore registers and return */ ! movq -8(%rsp), %rbx #if 0 movq %r13, -32(%rsp), %r13 Index: atlas-lvl2.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas-lvl2.base,v retrieving revision 1.168 retrieving revision 1.169 diff -C2 -d -r1.168 -r1.169 *** atlas-lvl2.base 24 Jul 2010 20:54:13 -0000 1.168 --- atlas-lvl2.base 26 Jul 2010 17:03:57 -0000 1.169 *************** *** 3,7 **** @endifdef @extract -b @(topd)/gen.inc what=crsetup ! @ROUT mvtktest mvnktest mvttest mvntest @extract -b @(topd)/cw.inc lang=c -define cwdate 2010 @ROUT r1ktest --- 3,7 ---- @endifdef @extract -b @(topd)/gen.inc what=crsetup ! @ROUT mvtktest mvnktest mvttest mvntest ATL_ger ATL_ger2 @extract -b @(topd)/cw.inc lang=c -define cwdate 2010 @ROUT r1ktest *************** *** 9,13 **** @ROUT r1test mvtest @extract -b @(topd)/gen.inc what=cw @(cw00) ! @ROUT ATL_symv_old atlas_r1.h ATL_ger ATL_gemv_old @\ ATL_cgemvN_4x2_1 ATL_cgemvN_2x2_0 ATL_cgemvN_mm @\ ATL_cgemvN_1x1_1 ATL_cgemvN_1x1_1a ATL_gemvN_4x4_1 ATL_gemvN_4x2_0 @\ --- 9,13 ---- @ROUT r1test mvtest @extract -b @(topd)/gen.inc what=cw @(cw00) ! @ROUT ATL_symv_old atlas_r1.h ATL_gemv_old @\ ATL_cgemvN_4x2_1 ATL_cgemvN_2x2_0 ATL_cgemvN_mm @\ ATL_cgemvN_1x1_1 ATL_cgemvN_1x1_1a ATL_gemvN_4x4_1 ATL_gemvN_4x2_0 @\ *************** *** 308,319 **** TYPE *A, ATL_CINT lda); @ROUT r1ktest ! void ATL_UGERK(ATL_CINT M, ATL_CINT N, const SCALAR alpha, ! const TYPE *X, ATL_CINT incX, const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda); @ROUT r2ktest ! void ATL_UGER2K(ATL_CINT M, ATL_CINT N, const SCALAR alpha0, ! const TYPE *X0, ATL_CINT incX0, const TYPE *Y0, ATL_CINT incY0, ! const SCALAR alpha1, const TYPE *X1, ATL_CINT incX1, ! const TYPE *Y1, ATL_CINT incY1, TYPE *A, ATL_CINT lda); @ROUT mvnktest mvtktest void ATL_UGEMV(ATL_CINT M, ATL_CINT N, const TYPE *A, ATL_CINT lda, --- 308,316 ---- TYPE *A, ATL_CINT lda); @ROUT r1ktest ! void ATL_UGERK(ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y, TYPE *A, ATL_CINT lda); @ROUT r2ktest ! void ATL_UGER2K(ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y, ! const TYPE *W, const TYPE *Z, TYPE *A, ATL_CINT lda); @ROUT mvnktest mvtktest void ATL_UGEMV(ATL_CINT M, ATL_CINT N, const TYPE *A, ATL_CINT lda, *************** *** 476,479 **** --- 473,477 ---- @endskip @rout r1ktest + @beginskip #ifdef TCPLS if (CONJ) *************** *** 481,487 **** else #endif ! ATL_UGERK(M, N, one, X, 1, Y, incY, A, lda); @rout r2ktest ! ATL_UGER2K(M, N, one, X, 1, Y, incY, one, X1, 1, Y1, incY, A, lda); dumb_ger2(CONJ, M, N, one, X, 1, Y, incY, one, X1, 1, Y1, incY, A0, lda); if (incY < 0) Y1 -= (N-1) * (aincY SHIFT); --- 479,486 ---- else #endif ! @endskip ! ATL_UGERK(M, N, X, Y, A, lda); @rout r2ktest ! ATL_UGER2K(M, N, X, Y, X1, Y1, A, lda); dumb_ger2(CONJ, M, N, one, X, 1, Y, incY, one, X1, 1, Y1, incY, A0, lda); if (incY < 0) Y1 -= (N-1) * (aincY SHIFT); *************** *** 755,759 **** *incXs = GetIntList1(1); if (*incYs == NULL) ! *incYs = GetIntList2(1, -1); @rout mvntest mvttest if (*ALPHAs == NULL) --- 754,758 ---- *incXs = GetIntList1(1); if (*incYs == NULL) ! *incYs = GetIntList1(1); @rout mvntest mvttest if (*ALPHAs == NULL) *************** *** 1544,1549 **** size_t t1, t2; ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1; ! ATL_INT incz=1; ! int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, minM, minN, INCYIS1; int COPYX, COPYY, COPYW, COPYZ, APPLYALPHAX, APPLYBETAW; #ifdef TREAL --- 1543,1547 ---- size_t t1, t2; ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1; ! int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, minM, minN; int COPYX, COPYY, COPYW, COPYZ, APPLYALPHAX, APPLYBETAW; #ifdef TREAL *************** *** 1610,1614 **** */ ger2k = ATL_GetR2Kern(M, N, A, lda, &mu, &nu, &minM, &minN, &alignX, ! &ALIGNX2A, &alignY, &INCYIS1, &ForceNU, &CacheElts); if (CacheElts) { --- 1608,1612 ---- */ ger2k = ATL_GetR2Kern(M, N, A, lda, &mu, &nu, &minM, &minN, &alignX, ! &ALIGNX2A, &alignY, &ForceNU, &CacheElts); if (CacheElts) { *************** *** 1788,1792 **** * Call optimized kernel (can be restricted or general) */ ! ger2k(imb, Nm, one, x, 1, y, incy, one, w, 1, z, incz, A, lda); /* * Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy --- 1786,1790 ---- * Call optimized kernel (can be restricted or general) */ ! ger2k(imb, Nm, x, y, w, z, A, lda); /* * Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy *************** *** 1805,1809 **** free(vp); } - @ROUT ATL_GER @ROUT ATL_ger #include "atlas_misc.h" --- 1803,1806 ---- *************** *** 1857,1861 **** ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1; int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX; ! int minM, minN, INCYIS1; #ifdef TREAL #define one ATL_rone --- 1854,1858 ---- ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1; int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX; ! int minM, minN; #ifdef TREAL #define one ATL_rone *************** *** 1900,1904 **** */ gerk = ATL_GetR1Kern(M, N, A, lda, &mu, &nu, &minM, &minM, &alignX, ! &ALIGNX2A, &alignY, &INCYIS1, &ForceNU, &CacheElts); if (CacheElts) { --- 1897,1901 ---- */ gerk = ATL_GetR1Kern(M, N, A, lda, &mu, &nu, &minM, &minM, &alignX, ! &ALIGNX2A, &alignY, &ForceNU, &CacheElts); if (CacheElts) { *************** *** 2022,2026 **** * Call optimized kernel (can be restricted or general) */ ! gerk(imb, Nm, one, x, 1, y, incy, A, lda); /* * Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy --- 2019,2026 ---- * Call optimized kernel (can be restricted or general) */ ! if (imb > minM) ! gerk(imb, Nm, x, y, A, lda); ! else ! Mjoin(PATL,gerk_Mlt16)(imb, Nm, one, x, 1, y, 1, A, lda); /* * Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy *************** *** 9196,9203 **** @ROUT r2ktime #ifdef TIME_KERNEL ! void ATL_UGER2K(ATL_CINT M, ATL_CINT N, const SCALAR alpha, ! const TYPE *X, ATL_CINT incX, const TYPE *Y, ATL_CINT incY, ! const SCALAR alpha1, const TYPE *X1, ATL_CINT incX1, ! const TYPE *Y1, ATL_CINT incY1, TYPE *A, ATL_CINT lda); #else void Mjoin(PATL,ger2)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, --- 9196,9201 ---- @ROUT r2ktime #ifdef TIME_KERNEL ! void ATL_UGER2K(ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y, ! const TYPE *X1, const TYPE *Y1, TYPE *A, ATL_CINT lda); #else void Mjoin(PATL,ger2)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, *************** *** 9219,9224 **** @ROUT r1ktime #ifdef TIME_KERNEL ! void ATL_UGERK(ATL_CINT M, ATL_CINT N, const SCALAR alpha, ! const TYPE *X, ATL_CINT incX, const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda); #else --- 9217,9221 ---- @ROUT r1ktime #ifdef TIME_KERNEL ! void ATL_UGERK(ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y, TYPE *A, ATL_CINT lda); #else *************** *** 9374,9379 **** Mjoin(PATL,gegen)(M, N, A, lda, N*M+513*7+90); /* ! * NOTE: if nreps too high this could lead to under/overflow, since changing ! * alpha will only help for alpha=X kernels! */ t0 = time00(); --- 9371,9375 ---- Mjoin(PATL,gegen)(M, N, A, lda, N*M+513*7+90); /* ! * NOTE: if nreps too high this could lead to under/overflow */ t0 = time00(); *************** *** 9381,9389 **** { @ROUT r2ktime ! ATL_UGER2K(M, N, SVVAL alpha, X, incX, Y, incY, ! SVVAL negalpha, X1, incX, Y1, incY, A, lda); @ROUT r1ktime ! ATL_UGERK(M, N, SVVAL alpha, X, incX, Y, incY, A, lda); ! *alpha = -(*alpha); @ROUT mvktime ATL_UGEMV(M, N, A, lda, X, Y); --- 9377,9383 ---- { @ROUT r2ktime ! ATL_UGER2K(M, N, X, Y, X1, Y1, A, lda); @ROUT r1ktime ! ATL_UGERK(M, N, X, Y, A, lda); @ROUT mvktime ATL_UGEMV(M, N, A, lda, X, Y); *************** *** 12616,12619 **** --- 12610,12614 ---- @ROUT r1hgen r2hgen + @beginskip void PrintPrototype(FILE *fpout, char pre, char *rout, char *type, char *styp) { *************** *** 12621,12624 **** --- 12616,12620 ---- rout, styp, type, type, type); } + @endskip @ROUT mvthgen mvnhgen void PrintPrototype(FILE *fpout, char pre, char *rout, char *type, char *styp) *************** *** 12890,12894 **** " int *mu, int *nu, int *minM, int *minN, int *alignX, int *ALIGNX2A,\n"); fprintf(fpout, ! " int *alignY, int *INCYIS1, int *FNU, ATL_INT *CacheElts) \n{\n"); spc -= 3; --- 12886,12890 ---- " int *mu, int *nu, int *minM, int *minN, int *alignX, int *ALIGNX2A,\n"); fprintf(fpout, ! " int *alignY, int *FNU, ATL_INT *CacheElts) \n{\n"); spc -= 3; *************** *** 12930,12935 **** FLAG_IS_SET(kp->flag, @up@(ru)F_ALIGNX2A)); fprintf(fpout, "%s*FNU = %d;\n", spc, FLAG_IS_SET(kp->flag, @up@(ru)F_FNU)); - fprintf(fpout, "%s*INCYIS1 = %d;\n", spc, - FLAG_IS_SET(kp->flag, @up@(ru)F_INCYISONE)); fprintf(fpout, "%s*CacheElts = %d;\n", spc, kp->CacheElts); @ROUT mvthgen mvnhgen --- 12926,12929 ---- *************** *** 12970,12977 **** { ATL_@(ru)node_t *kp; ! char *type, *styp; type = Pre2Type(pre); - styp = Pre2ScalarType(pre); for (kp=kb; kp; kp = kp->next) { --- 12964,12970 ---- { ATL_@(ru)node_t *kp; ! char *type; type = Pre2Type(pre); for (kp=kb; kp; kp = kp->next) { *************** *** 12987,13000 **** @ROUT r1hgen fprintf(fpout, ! " (ATL_CINT, ATL_CINT, const %s, const %s*, ATL_CINT, const %s*,\n", ! styp, type, type); ! fprintf(fpout, " ATL_CINT, %s*, ATL_CINT);\n", type); @ROUT r2hgen fprintf(fpout, ! " (ATL_CINT, ATL_CINT, const %s, const %s*, ATL_CINT, const %s*,\n", ! styp, type, type); ! fprintf(fpout, " ATL_CINT, const %s, const %s*, ATL_CINT, const %s*,\n", ! styp, type, type); ! fprintf(fpout, " ATL_CINT, %s*, ATL_CINT);\n", type); @ROUT r1hgen r2hgen mvthgen mvnhgen } --- 12980,12990 ---- @ROUT r1hgen fprintf(fpout, ! " (ATL_CINT, ATL_CINT, const %s*, const %s*, %s*, ATL_CINT);\n", ! type, type, type); @ROUT r2hgen fprintf(fpout, ! " (ATL_CINT, ATL_CINT, const %s*, const %s*, const %s*,\n", ! type, type, type); ! fprintf(fpout, " const %s*, %s*, ATL_CINT);\n", type, type); @ROUT r1hgen r2hgen mvthgen mvnhgen } *************** *** 13041,13054 **** @ROUT r1hgen fprintf(fpout, "typedef void (*ATL_r1kern_t)\n"); ! fprintf(fpout, " (ATL_CINT, ATL_CINT, const %s, const %s*, ATL_CINT,\n", ! styp, type)/ ! fprintf(fpout, " const %s*, ATL_CINT, %s*, ATL_CINT);\n\n", type, type); @ROUT r2hgen fprintf(fpout, "typedef void (*ATL_r2kern_t)\n"); ! fprintf(fpout, " (ATL_CINT, ATL_CINT, const %s, const %s*, ATL_CINT,\n", ! styp, type)/ ! fprintf(fpout, " const %s*, ATL_CINT, const %s, const %s*, ATL_CINT,\n", ! type, styp, type); ! fprintf(fpout, "const %s*, ATL_CINT, %s*, ATL_CINT);\n\n", type, type); @ROUT mvthgen mvnhgen fprintf(fpout, "#ifndef ATL_MVKERN_DEF\n"); --- 13031,13043 ---- @ROUT r1hgen fprintf(fpout, "typedef void (*ATL_r1kern_t)\n"); ! fprintf(fpout, ! " (ATL_CINT, ATL_CINT, const %s*, const %s*, %s*, ATL_CINT);\n", ! type, type, type); @ROUT r2hgen fprintf(fpout, "typedef void (*ATL_r2kern_t)\n"); ! fprintf(fpout, ! " (ATL_CINT, ATL_CINT, const %s*, const %s*, const %s*,\n", ! type, type, type); ! fprintf(fpout, " const %s*, %s*, ATL_CINT);\n", type, type); @ROUT mvthgen mvnhgen fprintf(fpout, "#ifndef ATL_MVKERN_DEF\n"); *************** *** 13780,13785 **** @ROUT ATL_gerk_Mlt16 ATL_ger2k_Mlt16 #include "atlas_misc.h" ! @ROUT ATL_ger2k_Mlt16 typedef void (*ger2k_t) (ATL_CINT M, ATL_CINT N, const SCALAR alp_a, const TYPE *Xa, ATL_CINT incXa, --- 13769,13775 ---- @ROUT ATL_gerk_Mlt16 ATL_ger2k_Mlt16 #include "atlas_misc.h" ! #include "atlas_lvl2.h" @ROUT ATL_ger2k_Mlt16 + typedef void (*ger2k_t) (ATL_CINT M, ATL_CINT N, const SCALAR alp_a, const TYPE *Xa, ATL_CINT incXa, *************** *** 13787,13790 **** --- 13777,13784 ---- ATL_CINT incXb, const TYPE *Yb, ATL_CINT incYb, TYPE *A, const int lda); @ROUT ATL_gerk_Mlt16 + #ifdef TCPLX + #include "atlas_reflevel2.h" + #endif + typedef void (*gerk_t) (const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX, *************** *** 13933,13937 **** @undef i @enddeclare ! ATL_assert(M < 16); @ROUT ATL_ger2k_Mlt16 ger2ks[M-1](M, N, alp_a, Xa, incXa, Ya, incYa, --- 13927,13946 ---- @undef i @enddeclare ! #ifdef ATL_GAS_x8664 ! if (M > 14) ! #elif defined(ATL_GAS_x8632) ! if (M > 6) ! #else ! if (M > 15) ! #endif ! { ! @ROUT ATL_gerk_Mlt16 ! Mjoin(PATL,gerk_axpy)(M, N, alpha, X, incX, Y, incY, A, lda); ! @ROUT ATL_ger2k_Mlt16 ! Mjoin(PATL,ger2k_Nlt8)(M, N, alp_a, Xa, incXa, Ya, incYa, alp_b, ! Xb, incXb, Yb, incYb, A, lda); ! @ROUT ATL_gerk_Mlt16 ATL_ger2k_Mlt16 ! return; ! } @ROUT ATL_ger2k_Mlt16 ger2ks[M-1](M, N, alp_a, Xa, incXa, Ya, incYa, *************** *** 13949,13953 **** void Mjoin(PATL,ger2k_Mlt16) #endif ! (ATL_CINT M, ATL_CINT N, const SCALAR alp_a, const TYPE *Xa, ATL_CINT incXa, const TYPE *Ya, ATL_CINT incYa, SCALAR alp_b, const TYPE *Xb, ATL_CINT incXb, const TYPE *Yb, ATL_CINT incYb, TYPE *A, ATL_CINT lda) { --- 13958,13962 ---- void Mjoin(PATL,ger2k_Mlt16) #endif ! (ATL_CINT M, ATL_CINT N, const SCALAR alp_a, const TYPE *Xa, ATL_CINT incXa, const TYPE *Ya, ATL_CINT incYa, const SCALAR alp_b, const TYPE *Xb, ATL_CINT incXb, const TYPE *Yb, ATL_CINT incYb, TYPE *A, ATL_CINT lda) { *************** *** 13973,13982 **** { /* ! * For now, complex simply calls axpy-based routine. Need to fix later */ #ifdef Conj_ ! Mjoin(PATL,gerck_axpy)(M, N, alpha, X, incX, Y, incY, A, lda); #else ! Mjoin(PATL,gerk_axpy)(M, N, alpha, X, incX, Y, incY, A, lda); #endif } --- 13982,14001 ---- { /* ! * For now, complex simply calls refblas for short M, axpy-based for large. ! * Probably not worth additional instruction load for complex to unroll. */ + #ifndef TUNING + if (M < 8) #ifdef Conj_ ! Mjoin(PATL,refgerc)(M, N, alpha, X, incX, Y, incY, A, lda); #else ! Mjoin(PATL,refgeru)(M, N, alpha, X, incX, Y, incY, A, lda); ! #endif ! else ! #endif ! #ifdef Conj_ ! Mjoin(PATL,gerck_axpy)(M, N, alpha, X, incX, Y, incY, A, lda); ! #else ! Mjoin(PATL,gerk_axpy)(M, N, alpha, X, incX, Y, incY, A, lda); #endif } *************** *** 14230,14234 **** @ROUT ATL_syr ` ATL_INT MB, NB, mb, nb, Nmb, i, n, incx=incX, CacheElts;` @ROUT ATL_her ` ATL_INT MB, NB, mb, nb, Nmb, n, i, CacheElts;` ! int mu, nu, minM, minN, alignX, alignXt, INCYIS1, FNU; int COPYX=0, COPYXt=0, ALIGNX2A=0; const int ALPHA_IS_ONE=(alpha == ATL_rone); --- 14249,14253 ---- @ROUT ATL_syr ` ATL_INT MB, NB, mb, nb, Nmb, i, n, incx=incX, CacheElts;` @ROUT ATL_her ` ATL_INT MB, NB, mb, nb, Nmb, n, i, CacheElts;` ! int mu, nu, minM, minN, alignX, alignXt, FNU; int COPYX=0, COPYXt=0, ALIGNX2A=0; const int ALPHA_IS_ONE=(alpha == ATL_rone); *************** *** 14260,14264 **** } gerk = ATL_GetR1Kern(MB, NB, A, lda, &mu, &nu, &minM, &minN, &alignX, ! &ALIGNX2A, &alignXt, &INCYIS1, &FNU, &CacheElts); /* * Determine if we need to copy the vectors --- 14279,14283 ---- } gerk = ATL_GetR1Kern(MB, NB, A, lda, &mu, &nu, &minM, &minN, &alignX, ! &ALIGNX2A, &alignXt, &FNU, &CacheElts); /* * Determine if we need to copy the vectors *************** *** 14286,14290 **** } @ROUT ATL_syr ! COPYXt = (INCYIS1 && incX != 1); if (!COPYXt && alignXt) /* alignment might still force a copy */ { --- 14305,14309 ---- } @ROUT ATL_syr ! COPYXt = (incX != 1); if (!COPYXt && alignXt) /* alignment might still force a copy */ { *************** *** 14633,14637 **** void *vp=NULL; ATL_r2kern_t gerk, gerk0; ! int MB, NB, mb, nb, mu, nu, minM, minN, alignX, alignXt, INCYIS1, FNU; int COPYX=0, COPYY, ALIGNX2A=0; ATL_INT CacheElts, i, n; --- 14652,14656 ---- void *vp=NULL; ATL_r2kern_t gerk, gerk0; ! int MB, NB, mb, nb, mu, nu, minM, minN, alignX, alignXt, FNU; int COPYX=0, COPYY, ALIGNX2A=0; ATL_INT CacheElts, i, n; *************** *** 14664,14668 **** } gerk = ATL_GetR2Kern(MB, NB, A, lda, &mu, &nu, &minM, &minN, &alignX, ! &ALIGNX2A, &alignXt, &INCYIS1, &FNU, &CacheElts); @ROUT ATL_syr2 /* --- 14683,14687 ---- } gerk = ATL_GetR2Kern(MB, NB, A, lda, &mu, &nu, &minM, &minN, &alignX, ! &ALIGNX2A, &alignXt, &FNU, &CacheElts); @ROUT ATL_syr2 /* *************** *** 15818,15822 **** ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1; int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX; ! int minM, minN, INCYIS1; #ifdef TREAL #define one ATL_rone --- 15837,15841 ---- ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1; int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX; ! int minM, minN; #ifdef TREAL #define one ATL_rone *************** *** 15860,15864 **** mvtk_b1 = ATL_GetMVTKern(M, N, A, lda, &mvtk_b0, &mu, &nu, &minM, &minN, &alignX, &ALIGNX2A, &alignY, ! &INCYIS1, &ForceNU, &CacheElts); /* * Set up to handle case where kernel requres N to be a multiple if NU --- 15879,15883 ---- mvtk_b1 = ATL_GetMVTKern(M, N, A, lda, &mvtk_b0, &mu, &nu, &minM, &minN, &alignX, &ALIGNX2A, &alignY, ! &ForceNU, &CacheElts); /* * Set up to handle case where kernel requres N to be a multiple if NU *************** *** 16125,16129 **** ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1; int mu, nu, alignX, alignY, ALIGNY2A, ForceNU, COPYX, COPYY, APPLYALPHAX; ! int minM, minN, INCYIS1, DOTBASED; #ifdef TREAL #define one ATL_rone --- 16144,16148 ---- ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1; int mu, nu, alignX, alignY, ALIGNY2A, ForceNU, COPYX, COPYY, APPLYALPHAX; ! int minM, minN, DOTBASED; #ifdef TREAL #define one ATL_rone *************** *** 16168,16172 **** mvnk_b1 = ATL_GetMVNKern(M, N, A, lda, &mvnk_b0, &DOTBASED, &mu, &nu, &minM, &minN, &alignY, &ALIGNY2A, &alignX, ! &INCYIS1, &ForceNU, &CacheElts); /* * Set up to handle case where kernel requires N to be a multiple if NU --- 16187,16191 ---- mvnk_b1 = ATL_GetMVNKern(M, N, A, lda, &mvnk_b0, &DOTBASED, &mu, &nu, &minM, &minN, &alignY, &ALIGNY2A, &alignX, ! &ForceNU, &CacheElts); /* * Set up to handle case where kernel requires N to be a multiple if NU Index: atlas-make.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/Clint/atlas-make.base,v retrieving revision 1.309 retrieving revision 1.310 diff -C2 -d -r1.309 -r1.310 *** atlas-make.base 24 Jul 2010 20:54:13 -0000 1.309 --- atlas-make.base 26 Jul 2010 17:03:57 -0000 1.310 *************** *** 1240,1249 **** @whiledef rt r r2 ATL_@(pre)ge@(rt)k_@(suf).o : $(r1SRCdir)/ATL_ge@(rt)k_@(suf).c ! $(@up@(upr)KC) -c $(@up@(upr)KCFLAGS) $(CDEFS) -I$(R1Tdir) \ -o $@ -D@(typ) $(r1SRCdir)/ATL_ge@(rt)k_@(suf).c @PTYP c z ATL_@(pre)ge@(rt)ck_@(suf).o : $(r1SRCdir)/ATL_ge@(rt)k_@(suf).c $(@up@(upr)KC) -c $(@up@(upr)KCFLAGS) $(CDEFS) -DConj_ -I$(R1Tdir) \ ! -o $@ -D@(typ) $(r1SRCdir)/ATL_ge@(rt)k_@(suf).c @PTYP ! @endwhile --- 1240,1249 ---- @whiledef rt r r2 ATL_@(pre)ge@(rt)k_@(suf).o : $(r1SRCdir)/ATL_ge@(rt)k_@(suf).c ! $(@up@(upr)KC) -c $(@up@(upr)KCFLAGS) $(CDEFS) -I$(R1Tdir) -DTUNING=1 \ -o $@ -D@(typ) $(r1SRCdir)/ATL_ge@(rt)k_@(suf).c @PTYP c z ATL_@(pre)ge@(rt)ck_@(suf).o : $(r1SRCdir)/ATL_ge@(rt)k_@(suf).c $(@up@(upr)KC) -c $(@up@(upr)KCFLAGS) $(CDEFS) -DConj_ -I$(R1Tdir) \ ! -DTUNING -o $@ -D@(typ) $(r1SRCdir)/ATL_ge@(rt)k_@(suf).c @PTYP ! @endwhile |