[Math-atlas-commits] CVS: AtlasBase/kernel/ClintWhaley ATL_dmm2x1x40_x87.c,NONE,1.1 ATL_dmm4x1x44_4_
Brought to you by:
rwhaley,
tonyc040457
Update of /cvsroot/math-atlas/AtlasBase/kernel/ClintWhaley In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv19198 Modified Files: ATL_dmm2x1x40_5pABC.c ATL_dmm6x1x72_sse2.c clint.base Added Files: ATL_dmm2x1x40_x87.c ATL_dmm4x1x44_4_sse2.c ATL_smm4x1x60_4_sse2.c Log Message: eff kernels --- NEW FILE: ATL_dmm2x1x40_x87.c --- #ifndef Mjoin #define Mjoin(pre, nam) my_join(pre, nam) #define my_join(pre, nam) pre ## nam #endif #if defined(ATL_OS_WinNT) || defined(ATL_OS_Win9x) #define ATL_AUSERMM Mjoin(_,ATL_USERMM) #else #define ATL_AUSERMM ATL_USERMM #endif #ifndef ATL_GAS_x8632 #error "This kernel requires gas x86-32 assembler!" #endif #if !defined(KB) || (KB == 0) #error "KB must be a compile-time constant!" #endif #if !defined(NB) #define NB 0 #endif #if !defined(MB) #define MB 0 #endif #if (MB/2)*2 != MB #error "MB must be multiple of 2!" #endif #ifdef DCPLX #define OFF 16 #define CMUL(i_) (2*(i_)) #else #define OFF 8 #define CMUL(i_) i_ #endif # # Integer register usage shown by these defines # #define pC %esi #define pA %ecx #define pB %edi #define incCn %eax #define stM %bl #define stN %bh #define ldab %edx /* #define pA3 %ebp */ #define pA0 pA #define pB0 pB #define pfA incCn #define NBso (KB*8) #define NB2so (NBso+NBso) #define NB3so (NBso+NBso+NBso) #define NB4so (NBso+NBso+NBso+NBso) #define NB5so (NBso+NBso+NBso+NBso+NBso) #define NB6so (NBso+NBso+NBso+NBso+NBso+NBso) #define NB7so (NB6so+NBso) #define NB8so (NB6so+NB2so) #define NB9so (NB6so+NB3so) #define NB10so (NB6so+NB4so) #define NB11so (NB6so+NB5so) #if MB != 0 #define MBKBso (MB*KB*8) #endif # # Prefetch defines # #if 1 #define pref2(mem) prefetcht1 mem #define prefB(mem) prefetcht0 mem #define prefC(mem) prefetcht0 mem #else #define pref2(mem) #define prefB(mem) #define prefC(mem) #endif # offset 4 8 12 16 # void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, # offset 24 28 32 36 # const TYPE *A, const int lda, const TYPE *B, const int ldb, # offset 40 48 52 # const TYPE 
beta, TYPE *C, const int ldc) # .text .global ATL_AUSERMM ATL_AUSERMM: # # Save callee-saved iregs; Save old stack pointer in eax, # so we can adjust for BETA alignment # #define FSIZE 28 #define BETAOFF FSIZE+40 #define COFF 16 subl $FSIZE, %esp movl %ebp, 12(%esp) movl %ebx, 8(%esp) movl %esi, 4(%esp) movl %edi, (%esp) # # Initialize pA = A; pB = B; pC = C; # #if MB == 0 movl FSIZE+4(%esp), %ebx movl %ebx, COFF+4(%esp) imul $NBso, %ebx subl $NB2so, %ebx movl %ebx, COFF+8(%esp) #endif movl FSIZE+24(%esp), pA movl FSIZE+32(%esp), pB movl FSIZE+48(%esp), pC #if NB == 0 movb FSIZE+8(%esp), stN #else movb $NB, stN #endif # # Set incCn = (ldc - MB)*sizeof # movl FSIZE+52(%esp), incCn #if MB == 0 subl COFF+4(%esp), incCn addl $2, incCn #else subl $MB-2, incCn #endif #ifdef DCPLX shl $4, incCn #else shl $3, incCn #endif movl incCn, COFF(%esp) movl pA0, pfA #if MB == 0 addl $NB2so, pfA addl COFF+8(%esp), pfA #else addl $MBKBso, pfA #endif NLOOP: #if MB == 0 movb COFF+4(%esp), stM subb $2, stM jz MLOOPCU #else movb $MB-2, stM #endif #if MB != 6 MLOOP: fldl (pB) fldl (pA) fmul %st(1),%st fldl 320(pA) fmulp %st,%st(2) fldl 8(pB) fldl 8(pA) fmul %st(1),%st fldl 328(pA) fmulp %st,%st(2) fldl 16(pB) fldl 16(pA) fmul %st(1),%st #if defined(BETA0) || defined (BETAX) fldz #else fldl (pC) #endif faddp %st,%st(5) fldl 336(pA) fmulp %st,%st(2) fldl 24(pB) #if defined(BETA0) || defined (BETAX) fldz #else fldl OFF(pC) #endif faddp %st,%st(7) fldl 24(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 344(pA) fmulp %st,%st(1) fldl 32(pB) fxch %st(7) faddp %st,%st(5) fldl 32(pA) fmul %st(7),%st fxch %st(4) faddp %st,%st(2) fldl 352(pA) fmulp %st,%st(7) fldl 40(pB) fxch %st(5) faddp %st,%st(3) fldl 40(pA) fmul %st(5),%st fxch %st(2) faddp %st,%st(6) fldl 360(pA) fmulp %st,%st(5) fldl 48(pB) fxch %st(3) faddp %st,%st(1) fldl 48(pA) fmul %st(3),%st fxch %st(6) faddp %st,%st(4) fldl 368(pA) fmulp %st,%st(3) fldl 56(pB) fxch %st(1) faddp %st,%st(7) fldl 56(pA) fmul %st(1),%st fxch %st(4) 
faddp %st,%st(2) fldl 376(pA) fmulp %st,%st(1) fldl 64(pB) fxch %st(7) faddp %st,%st(5) fldl 64(pA) fmul %st(7),%st fxch %st(2) faddp %st,%st(6) fldl 384(pA) fmulp %st,%st(7) pref2((pfA)) addl $16, pfA fldl 72(pB) fxch %st(5) faddp %st,%st(3) fldl 72(pA) fmul %st(5),%st fxch %st(6) faddp %st,%st(4) fldl 392(pA) fmulp %st,%st(5) fldl 80(pB) fxch %st(3) faddp %st,%st(1) fldl 80(pA) fmul %st(3),%st fxch %st(4) faddp %st,%st(2) fldl 400(pA) fmulp %st,%st(3) fldl 88(pB) fxch %st(1) faddp %st,%st(7) fldl 88(pA) fmul %st(1),%st fxch %st(2) faddp %st,%st(6) fldl 408(pA) fmulp %st,%st(1) fldl 96(pB) fxch %st(7) faddp %st,%st(5) fldl 96(pA) fmul %st(7),%st fxch %st(6) faddp %st,%st(4) fldl 416(pA) fmulp %st,%st(7) fldl 104(pB) fxch %st(5) faddp %st,%st(3) fldl 104(pA) fmul %st(5),%st fxch %st(4) faddp %st,%st(2) fldl 424(pA) fmulp %st,%st(5) fldl 112(pB) fxch %st(3) faddp %st,%st(1) fldl 112(pA) fmul %st(3),%st fxch %st(2) faddp %st,%st(6) fldl 432(pA) fmulp %st,%st(3) fldl 120(pB) fxch %st(1) faddp %st,%st(7) fldl 120(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 440(pA) fmulp %st,%st(1) fldl 128(pB) fxch %st(7) faddp %st,%st(5) fldl 128(pA) fmul %st(7),%st fxch %st(4) faddp %st,%st(2) fldl 448(pA) fmulp %st,%st(7) fldl 136(pB) fxch %st(5) faddp %st,%st(3) fldl 136(pA) fmul %st(5),%st fxch %st(2) faddp %st,%st(6) fldl 456(pA) fmulp %st,%st(5) fldl 144(pB) fxch %st(3) faddp %st,%st(1) fldl 144(pA) fmul %st(3),%st fxch %st(6) faddp %st,%st(4) fldl 464(pA) fmulp %st,%st(3) fldl 152(pB) fxch %st(1) faddp %st,%st(7) fldl 152(pA) fmul %st(1),%st fxch %st(4) faddp %st,%st(2) fldl 472(pA) fmulp %st,%st(1) fldl 160(pB) fxch %st(7) faddp %st,%st(5) fldl 160(pA) fmul %st(7),%st fxch %st(2) faddp %st,%st(6) fldl 480(pA) fmulp %st,%st(7) fldl 168(pB) fxch %st(5) faddp %st,%st(3) fldl 168(pA) fmul %st(5),%st fxch %st(6) faddp %st,%st(4) fldl 488(pA) fmulp %st,%st(5) fldl 176(pB) fxch %st(3) faddp %st,%st(1) fldl 176(pA) fmul %st(3),%st fxch %st(4) faddp %st,%st(2) fldl 496(pA) 
fmulp %st,%st(3) fldl 184(pB) fxch %st(1) faddp %st,%st(7) fldl 184(pA) fmul %st(1),%st fxch %st(2) faddp %st,%st(6) fldl 504(pA) fmulp %st,%st(1) fldl 192(pB) fxch %st(7) faddp %st,%st(5) fldl 192(pA) fmul %st(7),%st fxch %st(6) faddp %st,%st(4) fldl 512(pA) fmulp %st,%st(7) fldl 200(pB) fxch %st(5) faddp %st,%st(3) fldl 200(pA) fmul %st(5),%st fxch %st(4) faddp %st,%st(2) fldl 520(pA) fmulp %st,%st(5) fldl 208(pB) fxch %st(3) faddp %st,%st(1) fldl 208(pA) fmul %st(3),%st fxch %st(2) faddp %st,%st(6) fldl 528(pA) fmulp %st,%st(3) fldl 216(pB) fxch %st(1) faddp %st,%st(7) fldl 216(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 536(pA) fmulp %st,%st(1) fldl 224(pB) fxch %st(7) faddp %st,%st(5) fldl 224(pA) fmul %st(7),%st fxch %st(4) faddp %st,%st(2) fldl 544(pA) fmulp %st,%st(7) fldl 232(pB) fxch %st(5) faddp %st,%st(3) fldl 232(pA) fmul %st(5),%st fxch %st(2) faddp %st,%st(6) fldl 552(pA) fmulp %st,%st(5) fldl 240(pB) fxch %st(3) faddp %st,%st(1) fldl 240(pA) fmul %st(3),%st fxch %st(6) faddp %st,%st(4) fldl 560(pA) fmulp %st,%st(3) fldl 248(pB) fxch %st(1) faddp %st,%st(7) fldl 248(pA) fmul %st(1),%st fxch %st(4) faddp %st,%st(2) fldl 568(pA) fmulp %st,%st(1) fldl 256(pB) fxch %st(7) faddp %st,%st(5) fldl 256(pA) fmul %st(7),%st fxch %st(2) faddp %st,%st(6) fldl 576(pA) fmulp %st,%st(7) fldl 264(pB) fxch %st(5) faddp %st,%st(3) fldl 264(pA) fmul %st(5),%st fxch %st(6) faddp %st,%st(4) fldl 584(pA) fmulp %st,%st(5) fldl 272(pB) fxch %st(3) faddp %st,%st(1) fldl 272(pA) fmul %st(3),%st fxch %st(4) faddp %st,%st(2) fldl 592(pA) fmulp %st,%st(3) fldl 280(pB) fxch %st(1) faddp %st,%st(7) fldl 280(pA) fmul %st(1),%st fxch %st(2) faddp %st,%st(6) fldl 600(pA) fmulp %st,%st(1) fldl 288(pB) fxch %st(7) faddp %st,%st(5) fldl 288(pA) fmul %st(7),%st fxch %st(6) faddp %st,%st(4) fldl 608(pA) fmulp %st,%st(7) fldl 296(pB) fxch %st(5) faddp %st,%st(3) fldl 296(pA) fmul %st(5),%st fxch %st(4) faddp %st,%st(2) fldl 616(pA) fmulp %st,%st(5) fldl 304(pB) fxch %st(3) faddp 
%st,%st(1) fldl 304(pA) fmul %st(3),%st fxch %st(2) faddp %st,%st(6) fldl 624(pA) fmulp %st,%st(3) fldl 312(pB) fxch %st(1) faddp %st,%st(7) fldl 312(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 632(pA) fmulp %st,%st(1) fxch %st(6) faddp %st,%st(4) faddp %st,%st(2) faddp %st,%st(2) faddp %st,%st(2) faddp %st,%st(2) # # While (pB != stK); # # cmp pB, stK # jne KLOOP # # Write results back to C # #ifdef BETAX fldl (pC) fldl OFF(pC) fldl BETAOFF(%esp) fmul %st, %st(1) fmulp %st, %st(2) faddp %st, %st(3) faddp %st, %st(1) #endif fstpl (pC) fstpl OFF(pC) # pC += 2; pA += 2*NB # addl $CMUL(16), pC addl $NB2so, pA # # while (pA != stM); # subb $2, stM jnz MLOOP #endif # # Last iteration of MLOOP unrolled for prefetch of next col of B # #if MB == 0 MLOOPCU: #endif fldl (pB) fldl (pA) fmul %st(1),%st fldl 320(pA) fmulp %st,%st(2) fldl 8(pB) fldl 8(pA) fmul %st(1),%st fldl 328(pA) fmulp %st,%st(2) fldl 16(pB) fldl 16(pA) fmul %st(1),%st #if defined(BETA0) || defined (BETAX) fldz #else fldl (pC) #endif faddp %st,%st(5) fldl 336(pA) fmulp %st,%st(2) fldl 24(pB) #if defined(BETA0) || defined (BETAX) fldz #else fldl OFF(pC) #endif faddp %st,%st(7) fldl 24(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 344(pA) fmulp %st,%st(1) fldl 32(pB) fxch %st(7) faddp %st,%st(5) fldl 32(pA) fmul %st(7),%st fxch %st(4) faddp %st,%st(2) fldl 352(pA) fmulp %st,%st(7) fldl 40(pB) fxch %st(5) faddp %st,%st(3) fldl 40(pA) fmul %st(5),%st fxch %st(2) faddp %st,%st(6) fldl 360(pA) fmulp %st,%st(5) fldl 48(pB) fxch %st(3) faddp %st,%st(1) fldl 48(pA) fmul %st(3),%st fxch %st(6) faddp %st,%st(4) fldl 368(pA) fmulp %st,%st(3) fldl 56(pB) fxch %st(1) faddp %st,%st(7) fldl 56(pA) fmul %st(1),%st fxch %st(4) faddp %st,%st(2) fldl 376(pA) fmulp %st,%st(1) fldl 64(pB) fxch %st(7) faddp %st,%st(5) fldl 64(pA) fmul %st(7),%st fxch %st(2) faddp %st,%st(6) fldl 384(pA) fmulp %st,%st(7) prefB(NBso(pB0)) prefB(32+NBso(pB0)) fldl 72(pB) fxch %st(5) faddp %st,%st(3) fldl 72(pA) fmul %st(5),%st fxch 
%st(6) faddp %st,%st(4) fldl 392(pA) fmulp %st,%st(5) fldl 80(pB) fxch %st(3) faddp %st,%st(1) fldl 80(pA) fmul %st(3),%st fxch %st(4) faddp %st,%st(2) fldl 400(pA) fmulp %st,%st(3) fldl 88(pB) fxch %st(1) faddp %st,%st(7) fldl 88(pA) fmul %st(1),%st fxch %st(2) faddp %st,%st(6) fldl 408(pA) fmulp %st,%st(1) prefB(64+NBso(pB0)) prefB(96+NBso(pB0)) fldl 96(pB) fxch %st(7) faddp %st,%st(5) fldl 96(pA) fmul %st(7),%st fxch %st(6) faddp %st,%st(4) fldl 416(pA) fmulp %st,%st(7) fldl 104(pB) fxch %st(5) faddp %st,%st(3) fldl 104(pA) fmul %st(5),%st fxch %st(4) faddp %st,%st(2) fldl 424(pA) fmulp %st,%st(5) fldl 112(pB) fxch %st(3) faddp %st,%st(1) fldl 112(pA) fmul %st(3),%st fxch %st(2) faddp %st,%st(6) fldl 432(pA) fmulp %st,%st(3) fldl 120(pB) fxch %st(1) faddp %st,%st(7) fldl 120(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 440(pA) fmulp %st,%st(1) prefB(128+NBso(pB0)) prefB(160+NBso(pB0)) fldl 128(pB) fxch %st(7) faddp %st,%st(5) fldl 128(pA) fmul %st(7),%st fxch %st(4) faddp %st,%st(2) fldl 448(pA) fmulp %st,%st(7) fldl 136(pB) fxch %st(5) faddp %st,%st(3) fldl 136(pA) fmul %st(5),%st fxch %st(2) faddp %st,%st(6) fldl 456(pA) fmulp %st,%st(5) fldl 144(pB) fxch %st(3) faddp %st,%st(1) fldl 144(pA) fmul %st(3),%st fxch %st(6) faddp %st,%st(4) fldl 464(pA) fmulp %st,%st(3) prefB(192+NBso(pB0)) prefB(256+NBso(pB0)) fldl 152(pB) fxch %st(1) faddp %st,%st(7) fldl 152(pA) fmul %st(1),%st fxch %st(4) faddp %st,%st(2) fldl 472(pA) fmulp %st,%st(1) fldl 160(pB) fxch %st(7) faddp %st,%st(5) fldl 160(pA) fmul %st(7),%st fxch %st(2) faddp %st,%st(6) fldl 480(pA) fmulp %st,%st(7) fldl 168(pB) fxch %st(5) faddp %st,%st(3) fldl 168(pA) fmul %st(5),%st fxch %st(6) faddp %st,%st(4) fldl 488(pA) fmulp %st,%st(5) prefB(288+NBso(pB0)) fldl 176(pB) fxch %st(3) faddp %st,%st(1) fldl 176(pA) fmul %st(3),%st fxch %st(4) faddp %st,%st(2) fldl 496(pA) fmulp %st,%st(3) fldl 184(pB) fxch %st(1) faddp %st,%st(7) fldl 184(pA) fmul %st(1),%st fxch %st(2) faddp %st,%st(6) fldl 504(pA) 
fmulp %st,%st(1) fldl 192(pB) fxch %st(7) faddp %st,%st(5) fldl 192(pA) fmul %st(7),%st fxch %st(6) faddp %st,%st(4) fldl 512(pA) fmulp %st,%st(7) fldl 200(pB) fxch %st(5) faddp %st,%st(3) fldl 200(pA) fmul %st(5),%st fxch %st(4) faddp %st,%st(2) fldl 520(pA) fmulp %st,%st(5) fldl 208(pB) fxch %st(3) faddp %st,%st(1) fldl 208(pA) fmul %st(3),%st fxch %st(2) faddp %st,%st(6) fldl 528(pA) fmulp %st,%st(3) fldl 216(pB) fxch %st(1) faddp %st,%st(7) fldl 216(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 536(pA) fmulp %st,%st(1) fldl 224(pB) fxch %st(7) faddp %st,%st(5) fldl 224(pA) fmul %st(7),%st fxch %st(4) faddp %st,%st(2) fldl 544(pA) fmulp %st,%st(7) fldl 232(pB) fxch %st(5) faddp %st,%st(3) fldl 232(pA) fmul %st(5),%st fxch %st(2) faddp %st,%st(6) fldl 552(pA) fmulp %st,%st(5) fldl 240(pB) fxch %st(3) faddp %st,%st(1) fldl 240(pA) fmul %st(3),%st fxch %st(6) faddp %st,%st(4) fldl 560(pA) fmulp %st,%st(3) fldl 248(pB) fxch %st(1) faddp %st,%st(7) fldl 248(pA) fmul %st(1),%st fxch %st(4) faddp %st,%st(2) fldl 568(pA) fmulp %st,%st(1) fldl 256(pB) fxch %st(7) faddp %st,%st(5) fldl 256(pA) fmul %st(7),%st fxch %st(2) faddp %st,%st(6) fldl 576(pA) fmulp %st,%st(7) fldl 264(pB) fxch %st(5) faddp %st,%st(3) fldl 264(pA) fmul %st(5),%st fxch %st(6) faddp %st,%st(4) fldl 584(pA) fmulp %st,%st(5) fldl 272(pB) fxch %st(3) faddp %st,%st(1) fldl 272(pA) fmul %st(3),%st fxch %st(4) faddp %st,%st(2) fldl 592(pA) fmulp %st,%st(3) fldl 280(pB) fxch %st(1) faddp %st,%st(7) fldl 280(pA) fmul %st(1),%st fxch %st(2) faddp %st,%st(6) fldl 600(pA) fmulp %st,%st(1) fldl 288(pB) fxch %st(7) faddp %st,%st(5) fldl 288(pA) fmul %st(7),%st fxch %st(6) faddp %st,%st(4) fldl 608(pA) fmulp %st,%st(7) fldl 296(pB) fxch %st(5) faddp %st,%st(3) fldl 296(pA) fmul %st(5),%st fxch %st(4) faddp %st,%st(2) fldl 616(pA) fmulp %st,%st(5) fldl 304(pB) fxch %st(3) faddp %st,%st(1) fldl 304(pA) fmul %st(3),%st fxch %st(2) faddp %st,%st(6) fldl 624(pA) fmulp %st,%st(3) fldl 312(pB) fxch %st(1) faddp 
%st,%st(7) fldl 312(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 632(pA) fmulp %st,%st(1) fxch %st(6) faddp %st,%st(4) faddp %st,%st(2) faddp %st,%st(2) faddp %st,%st(2) faddp %st,%st(2) # # While (pB != stK); # # cmp pB, stK # jne KLOOP # # Write results back to C # #ifdef BETAX fldl (pC) fldl OFF(pC) fldl BETAOFF(%esp) fmul %st, %st(1) fmulp %st, %st(2) faddp %st, %st(3) faddp %st, %st(1) #endif fstpl (pC) fstpl OFF(pC) # pC += 2; pA += 2*NB # # addl $CMUL(16), pC # addl $NB2so, pA # # while (pA != stM); # # pC += incCn; pA -= NBNB; pB += NB; # addl COFF(%esp), pC #if MB == 0 subl COFF+8(%esp), pA #else subl $MBKBso-NB2so, pA #endif addl $NBso, pB # # while (pB != stN); # sub $1, stN jnz NLOOP # # Restore callee-saved iregs # movl 12(%esp), %ebp movl 8(%esp), %ebx movl 4(%esp), %esi movl (%esp), %edi addl $FSIZE, %esp ret --- NEW FILE: ATL_dmm4x1x44_4_sse2.c --- # # Efficeon-optimized 4x1x44 DGEMM. Pipelined to 4 (4 accumulators). # Prefetches the next col of B, and a col from the next block of A in the M-loop # Purposely kept small so it is retained in cache, and easy to translate when # not # #ifndef ATL_GAS_x8632 #error "This kernel requires gas x86-32 assembler!" #endif #if KB != 44 #error "KB must be 44!" #endif #if !defined(KB) || (KB == 0) #error "KB must be a compile-time constant!" #endif #if !defined(NB) #define NB 0 #endif #if !defined(MB) #define MB 0 #endif #if (MB/4)*4 != MB #error "MB must be multiple of 4!" 
#endif #ifdef DCPLX #define OFF 16 #define CMUL(i_) (2*(i_)) #else #define OFF 8 #define CMUL(i_) i_ #endif # # Integer register usage shown by these defines # #define pC %esi #define pA %ecx #define pB %edi #define incCn %eax #define stM %bl #define stN %bh #define pfB %edx #define pfA %ebp #define pA0 pA #define pB0 pB #define m0 %xmm0 #define m1 %xmm1 #define m2 %xmm2 #define m3 %xmm3 #define rC0 %xmm4 #define rC1 %xmm5 #define rC2 %xmm6 #define rC3 %xmm7 #define NB0so 0 #define NBso (KB*8) #define NB1so (KB*8) #define NB2so (NBso+NBso) #define NB3so (NBso+NBso+NBso) #define NB4so (NBso+NBso+NBso+NBso) #define NB5so (NBso+NBso+NBso+NBso+NBso) #define NB6so (NBso+NBso+NBso+NBso+NBso+NBso) #define NB7so (NB6so+NBso) #define NB8so (NB6so+NB2so) #define NB9so (NB6so+NB3so) #define NB10so (NB6so+NB4so) #define NB11so (NB6so+NB5so) #if MB != 0 #define MBKBso (MB*KB*8) #endif # # Prefetch defines # #if 1 #define pref2(mem) prefetcht0 mem #define prefB(mem) prefetcht0 mem #define prefC(mem) prefetcht0 mem #else #define pref2(mem) #define prefB(mem) #define prefC(mem) #endif # offset 4 8 12 16 # void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, # offset 24 28 32 36 # const TYPE *A, const int lda, const TYPE *B, const int ldb, # offset 40 48 52 # const TYPE beta, TYPE *C, const int ldc) # .text .global ATL_AUSERMM ATL_AUSERMM: # # Save callee-saved iregs; Save old stack pointer in eax, # so we can adjust for BETA alignment # #define FSIZE 28 #define BETAOFF FSIZE+40 #define COFF 16 subl $FSIZE, %esp movl %ebp, 12(%esp) movl %ebx, 8(%esp) movl %esi, 4(%esp) movl %edi, (%esp) # # Initialize pA = A; pB = B; pC = C; # #if MB == 0 movl FSIZE+4(%esp), %ebx movl %ebx, COFF+4(%esp) imul $NBso, %ebx movl %ebx, COFF+8(%esp) #endif movl FSIZE+24(%esp), pA movl FSIZE+32(%esp), pB movl FSIZE+48(%esp), pC #if NB == 0 movb FSIZE+8(%esp), stN #else movb $NB, stN #endif # # Set incCn = (ldc - MB)*sizeof # movl FSIZE+52(%esp), incCn #if MB == 0 subl COFF+4(%esp), 
incCn #else subl $MB, incCn #endif #ifdef DCPLX shl $4, incCn #else shl $3, incCn #endif # movl incCn, COFF(%esp) movl pA0, pfA #if MB == 0 addl COFF+8(%esp), pfA #else addl $MBKBso, pfA #endif addl $120, pA0 addl $120, pB0 NLOOP: #if MB == 0 movb COFF+4(%esp), stM #else movb $MB, stM #endif lea 120+NBso(pB0), pfB #if MB != -5 MLOOP: #ifdef BETA0 xorpd rC0, rC0 xorpd rC1, rC1 xorpd rC2, rC2 xorpd rC3, rC3 #else movsd (pC), rC0 movsd CMUL(8)(pC), rC1 movsd CMUL(16)(pC), rC2 movsd CMUL(24)(pC), rC3 #ifdef BETAX movlpd BETAOFF(%esp), m0 mulsd m0, rC0 mulsd m0, rC1 mulsd m0, rC2 mulsd m0, rC3 #endif #endif .align 4 movapd 0-120(pB0), m3 movapd 0-120(pA0), m0 movapd NBso+0-120(pA0), m1 movapd NB2so+0-120(pA0), m2 mulpd m3, m0 mulpd m3, m1 mulpd m3, m2 # # Unrolled & pipelined K-loop # mulpd NB3so+0-120(pA0), m3 addpd m0, rC0 movapd 16-120(pB0), m0 addpd m1, rC1 movapd NB0so+16-120(pA0), m1 mulpd m0, m1 addpd m2, rC2 movapd NB1so+16-120(pA0), m2 mulpd m0, m2 addpd m3, rC3 movapd NB2so+16-120(pA0), m3 mulpd m0, m3 mulpd NB3so+16-120(pA0), m0 addpd m1, rC0 movapd 32-120(pB0), m1 addpd m2, rC1 movapd NB0so+32-120(pA0), m2 mulpd m1, m2 addpd m3, rC2 movapd NB1so+32-120(pA0), m3 mulpd m1, m3 addpd m0, rC3 movapd NB2so+32-120(pA0), m0 mulpd m1, m0 mulpd NB3so+32-120(pA0), m1 addpd m2, rC0 movapd 48-120(pB0), m2 addpd m3, rC1 movapd NB0so+48-120(pA0), m3 mulpd m2, m3 addpd m0, rC2 movapd NB1so+48-120(pA0), m0 mulpd m2, m0 addpd m1, rC3 movapd NB2so+48-120(pA0), m1 mulpd m2, m1 mulpd NB3so+48-120(pA0), m2 addpd m3, rC0 movapd 64-120(pB0), m3 addpd m0, rC1 movapd NB0so+64-120(pA0), m0 mulpd m3, m0 addpd m1, rC2 movapd NB1so+64-120(pA0), m1 mulpd m3, m1 addpd m2, rC3 movapd NB2so+64-120(pA0), m2 mulpd m3, m2 mulpd NB3so+64-120(pA0), m3 addpd m0, rC0 movapd 80-120(pB0), m0 addpd m1, rC1 movapd NB0so+80-120(pA0), m1 mulpd m0, m1 addpd m2, rC2 movapd NB1so+80-120(pA0), m2 mulpd m0, m2 addpd m3, rC3 movapd NB2so+80-120(pA0), m3 mulpd m0, m3 mulpd NB3so+80-120(pA0), m0 addpd m1, rC0 
movapd 96-120(pB0), m1 addpd m2, rC1 movapd NB0so+96-120(pA0), m2 mulpd m1, m2 addpd m3, rC2 movapd NB1so+96-120(pA0), m3 mulpd m1, m3 addpd m0, rC3 movapd NB2so+96-120(pA0), m0 mulpd m1, m0 mulpd NB3so+96-120(pA0), m1 addpd m2, rC0 movapd 112-120(pB0), m2 addpd m3, rC1 movapd NB0so+112-120(pA0), m3 mulpd m2, m3 addpd m0, rC2 movapd NB1so+112-120(pA0), m0 mulpd m2, m0 addpd m1, rC3 movapd NB2so+112-120(pA0), m1 mulpd m2, m1 mulpd NB3so+112-120(pA0), m2 addpd m3, rC0 movapd 128-120(pB0), m3 addpd m0, rC1 movapd NB0so+128-120(pA0), m0 mulpd m3, m0 addpd m1, rC2 movapd NB1so+128-120(pA0), m1 mulpd m3, m1 addpd m2, rC3 movapd NB2so+128-120(pA0), m2 mulpd m3, m2 mulpd NB3so+128-120(pA0), m3 addpd m0, rC0 movapd 144-120(pB0), m0 addpd m1, rC1 movapd NB0so+144-120(pA0), m1 mulpd m0, m1 addpd m2, rC2 movapd NB1so+144-120(pA0), m2 mulpd m0, m2 addpd m3, rC3 movapd NB2so+144-120(pA0), m3 mulpd m0, m3 mulpd NB3so+144-120(pA0), m0 addpd m1, rC0 movapd 160-120(pB0), m1 addpd m2, rC1 movapd NB0so+160-120(pA0), m2 mulpd m1, m2 addpd m3, rC2 movapd NB1so+160-120(pA0), m3 mulpd m1, m3 addpd m0, rC3 movapd NB2so+160-120(pA0), m0 mulpd m1, m0 mulpd NB3so+160-120(pA0), m1 addpd m2, rC0 movapd 176-120(pB0), m2 addpd m3, rC1 movapd NB0so+176-120(pA0), m3 mulpd m2, m3 addpd m0, rC2 movapd NB1so+176-120(pA0), m0 mulpd m2, m0 addpd m1, rC3 movapd NB2so+176-120(pA0), m1 mulpd m2, m1 mulpd NB3so+176-120(pA0), m2 addpd m3, rC0 movapd 192-120(pB0), m3 addpd m0, rC1 movapd NB0so+192-120(pA0), m0 mulpd m3, m0 addpd m1, rC2 movapd NB1so+192-120(pA0), m1 mulpd m3, m1 addpd m2, rC3 movapd NB2so+192-120(pA0), m2 mulpd m3, m2 mulpd NB3so+192-120(pA0), m3 addpd m0, rC0 movapd 208-120(pB0), m0 addpd m1, rC1 movapd NB0so+208-120(pA0), m1 mulpd m0, m1 addpd m2, rC2 movapd NB1so+208-120(pA0), m2 mulpd m0, m2 addpd m3, rC3 movapd NB2so+208-120(pA0), m3 mulpd m0, m3 mulpd NB3so+208-120(pA0), m0 addpd m1, rC0 movapd 224-120(pB0), m1 addpd m2, rC1 movapd NB0so+224-120(pA0), m2 mulpd m1, m2 addpd m3, rC2 
movapd NB1so+224-120(pA0), m3 mulpd m1, m3 addpd m0, rC3 movapd NB2so+224-120(pA0), m0 mulpd m1, m0 mulpd NB3so+224-120(pA0), m1 addpd m2, rC0 movapd 240-120(pB0), m2 addpd m3, rC1 movapd NB0so+240-120(pA0), m3 mulpd m2, m3 addpd m0, rC2 movapd NB1so+240-120(pA0), m0 mulpd m2, m0 addpd m1, rC3 movapd NB2so+240-120(pA0), m1 mulpd m2, m1 mulpd NB3so+240-120(pA0), m2 addpd m3, rC0 movapd 256-120(pB0), m3 addpd m0, rC1 movapd NB0so+256-120(pA0), m0 mulpd m3, m0 addpd m1, rC2 movapd NB1so+256-120(pA0), m1 mulpd m3, m1 addpd m2, rC3 movapd NB2so+256-120(pA0), m2 mulpd m3, m2 mulpd NB3so+256-120(pA0), m3 addpd m0, rC0 movapd 272-120(pB0), m0 addpd m1, rC1 movapd NB0so+272-120(pA0), m1 mulpd m0, m1 addpd m2, rC2 movapd NB1so+272-120(pA0), m2 mulpd m0, m2 addpd m3, rC3 movapd NB2so+272-120(pA0), m3 mulpd m0, m3 mulpd NB3so+272-120(pA0), m0 addpd m1, rC0 movapd 288-120(pB0), m1 addpd m2, rC1 movapd NB0so+288-120(pA0), m2 mulpd m1, m2 addpd m3, rC2 movapd NB1so+288-120(pA0), m3 mulpd m1, m3 addpd m0, rC3 movapd NB2so+288-120(pA0), m0 mulpd m1, m0 mulpd NB3so+288-120(pA0), m1 addpd m2, rC0 movapd 304-120(pB0), m2 addpd m3, rC1 movapd NB0so+304-120(pA0), m3 mulpd m2, m3 addpd m0, rC2 movapd NB1so+304-120(pA0), m0 mulpd m2, m0 addpd m1, rC3 movapd NB2so+304-120(pA0), m1 mulpd m2, m1 mulpd NB3so+304-120(pA0), m2 addpd m3, rC0 movapd 320-120(pB0), m3 addpd m0, rC1 movapd NB0so+320-120(pA0), m0 mulpd m3, m0 addpd m1, rC2 movapd NB1so+320-120(pA0), m1 mulpd m3, m1 addpd m2, rC3 movapd NB2so+320-120(pA0), m2 mulpd m3, m2 mulpd NB3so+320-120(pA0), m3 addpd m0, rC0 movapd 336-120(pB0), m0 addpd m1, rC1 movapd NB0so+336-120(pA0), m1 mulpd m0, m1 addpd m2, rC2 movapd NB1so+336-120(pA0), m2 mulpd m0, m2 addpd m3, rC3 movapd NB2so+336-120(pA0), m3 mulpd m0, m3 mulpd NB3so+336-120(pA0), m0 addpd m1, rC0 addpd m2, rC1 addpd m3, rC2 addpd m0, rC3 # # Get these bastard things summed up # # rC0 = c0a c0b # rC1 = c1a c1b # rC2 = c2a c2b # rC3 = c3a c3b # movapd rC0, m0 unpcklpd rC1, rC0 # rC0 = 
c0a c1a prefB((pfB)) unpckhpd rC1, m0 # m0 = c0b c1b addl $32, pfB addpd m0, rC0 # rC0 = c0ab c1ab movapd rC2, m0 pref2((pfA)) unpcklpd rC3, rC2 # rC2 = c2a c3a unpckhpd rC3, m0 # m0 = c2b c3b addl $32, pfA addl $NB4so, pA0 addpd m0, rC2 # rC2 = c2ab c3ab # # Write results back to C # #ifdef DCPLX movlpd rC0, (pC) movhpd rC0, 16(pC) movlpd rC2, 32(pC) movhpd rC2, 48(pC) #else movupd rC0, (pC) movupd rC2, 16(pC) #endif # # pC += 6; pA += 2*NB # addl $CMUL(32), pC # # while (pA != stM); # subb $4, stM jnz MLOOP #endif # # pC += incCn; pA -= NBNB; pB += NB; # addl incCn, pC # addl COFF(%esp), pC #if MB == 0 subl COFF+8(%esp), pA0 #else subl $MBKBso, pA0 #endif addl $NBso, pB # # while (pB != stN); # sub $1, stN jnz NLOOP # # Restore callee-saved iregs # movl 12(%esp), %ebp movl 8(%esp), %ebx movl 4(%esp), %esi movl (%esp), %edi addl $FSIZE, %esp ret --- NEW FILE: ATL_smm4x1x60_4_sse2.c --- # # Efficeon-optimized 4x1x60 SGEMM. Pipelined to 4 (4 accumulators). # Prefetches the next col of B, and a col from the next block of A in the M-loop # Purposely kept small so it is retained in cache, and easy to translate when # not # #ifndef ATL_GAS_x8632 #error "This kernel requires gas x86-32 assembler!" #endif /* #if KB != 60 #error "KB must be 60!" #endif */ #if !defined(KB) || (KB == 0) #error "KB must be a compile-time constant!" #endif #if !defined(NB) #define NB 0 #endif #if !defined(MB) #define MB 0 #endif #if (MB/4)*4 != MB #error "MB must be multiple of 4!" 
#endif #ifdef SCPLX #define OFF 16 #define CMUL(i_) (2*(i_)) #else #define OFF 8 #define CMUL(i_) i_ #endif # # Integer register usage shown by these defines # #define pC %esi #define pA %ecx #define pB %edi #define incCn %eax #define stM %bl #define stN %bh #define pfB %edx #define pfA %ebp #define pA0 pA #define pB0 pB #define m0 %xmm0 #define m1 %xmm1 #define m2 %xmm2 #define m3 %xmm3 #define rC0 %xmm4 #define rC1 %xmm5 #define rC2 %xmm6 #define rC3 %xmm7 #define NB0so 0 #define NBso (KB*4) #define NB1so (KB*4) #define NB2so (NBso+NBso) #define NB3so (NBso+NBso+NBso) #define NB4so (NBso+NBso+NBso+NBso) #define NB5so (NBso+NBso+NBso+NBso+NBso) #define NB6so (NBso+NBso+NBso+NBso+NBso+NBso) #define NB7so (NB6so+NBso) #define NB8so (NB6so+NB2so) #define NB9so (NB6so+NB3so) #define NB10so (NB6so+NB4so) #define NB11so (NB6so+NB5so) #if MB != 0 #define MBKBso (MB*KB*4) #endif # # Prefetch defines # #if 1 #define pref2(mem) prefetcht0 mem #define prefB(mem) prefetcht0 mem #define prefC(mem) prefetcht0 mem #else #define pref2(mem) #define prefB(mem) #define prefC(mem) #endif # offset 4 8 12 16 # void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, # offset 20 24 28 32 # const TYPE *A, const int lda, const TYPE *B, const int ldb, # offset 36 40 44 # const TYPE beta, TYPE *C, const int ldc) # .text .global ATL_AUSERMM ATL_AUSERMM: # # Save callee-saved iregs; Save old stack pointer in eax, # so we can adjust for BETA alignment # #define FSIZE 28 #define BETAOFF FSIZE+36 #define COFF 16 subl $FSIZE, %esp movl %ebp, 12(%esp) movl %ebx, 8(%esp) movl %esi, 4(%esp) movl %edi, (%esp) # # Initialize pA = A; pB = B; pC = C; # #if MB == 0 movl FSIZE+4(%esp), %ebx movl %ebx, COFF+4(%esp) imul $NBso, %ebx movl %ebx, COFF+8(%esp) #endif movl FSIZE+20(%esp), pA movl FSIZE+28(%esp), pB movl FSIZE+40(%esp), pC #if NB == 0 movb FSIZE+8(%esp), stN #else movb $NB, stN #endif # # Set incCn = (ldc - MB)*sizeof # movl FSIZE+44(%esp), incCn #if MB == 0 subl COFF+4(%esp), 
incCn #else subl $MB, incCn #endif #ifdef SCPLX shl $3, incCn #else shl $2, incCn #endif # movl incCn, COFF(%esp) movl pA0, pfA #if MB == 0 addl COFF+8(%esp), pfA #else addl $MBKBso, pfA #endif addl $120, pA0 addl $120, pB0 NLOOP: #if MB == 0 movb COFF+4(%esp), stM #else movb $MB, stM #endif lea 120+NBso(pB0), pfB MLOOP: #ifdef BETA0 xorps rC0, rC0 xorps rC1, rC1 xorps rC2, rC2 xorps rC3, rC3 #else movss (pC), rC0 movss CMUL(4)(pC), rC1 movss CMUL(8)(pC), rC2 movss CMUL(12)(pC), rC3 #ifdef BETAX movss BETAOFF(%esp), m0 mulss m0, rC0 mulss m0, rC1 mulss m0, rC2 mulss m0, rC3 #endif #endif movaps 0-120(pB0), m3 movaps 0-120(pA0), m0 movaps NBso+0-120(pA0), m1 movaps NB2so+0-120(pA0), m2 mulps m3, m0 mulps m3, m1 mulps m3, m2 # # Unrolled & pipelined K-loop # mulps NB3so+0-120(pA0), m3 addps m0, rC0 movaps 16-120(pB0), m0 addps m1, rC1 movaps NB0so+16-120(pA0), m1 mulps m0, m1 addps m2, rC2 movaps NB1so+16-120(pA0), m2 mulps m0, m2 addps m3, rC3 movaps NB2so+16-120(pA0), m3 mulps m0, m3 mulps NB3so+16-120(pA0), m0 addps m1, rC0 movaps 32-120(pB0), m1 addps m2, rC1 movaps NB0so+32-120(pA0), m2 mulps m1, m2 addps m3, rC2 movaps NB1so+32-120(pA0), m3 mulps m1, m3 addps m0, rC3 movaps NB2so+32-120(pA0), m0 mulps m1, m0 mulps NB3so+32-120(pA0), m1 addps m2, rC0 movaps 48-120(pB0), m2 addps m3, rC1 movaps NB0so+48-120(pA0), m3 mulps m2, m3 addps m0, rC2 movaps NB1so+48-120(pA0), m0 mulps m2, m0 addps m1, rC3 movaps NB2so+48-120(pA0), m1 mulps m2, m1 mulps NB3so+48-120(pA0), m2 addps m3, rC0 movaps 64-120(pB0), m3 addps m0, rC1 movaps NB0so+64-120(pA0), m0 mulps m3, m0 addps m1, rC2 movaps NB1so+64-120(pA0), m1 mulps m3, m1 addps m2, rC3 movaps NB2so+64-120(pA0), m2 mulps m3, m2 mulps NB3so+64-120(pA0), m3 addps m0, rC0 movaps 80-120(pB0), m0 addps m1, rC1 movaps NB0so+80-120(pA0), m1 mulps m0, m1 addps m2, rC2 movaps NB1so+80-120(pA0), m2 mulps m0, m2 addps m3, rC3 movaps NB2so+80-120(pA0), m3 mulps m0, m3 mulps NB3so+80-120(pA0), m0 addps m1, rC0 movaps 96-120(pB0), m1 
addps m2, rC1 movaps NB0so+96-120(pA0), m2 mulps m1, m2 addps m3, rC2 movaps NB1so+96-120(pA0), m3 mulps m1, m3 addps m0, rC3 movaps NB2so+96-120(pA0), m0 mulps m1, m0 mulps NB3so+96-120(pA0), m1 addps m2, rC0 movaps 112-120(pB0), m2 addps m3, rC1 movaps NB0so+112-120(pA0), m3 mulps m2, m3 addps m0, rC2 movaps NB1so+112-120(pA0), m0 mulps m2, m0 addps m1, rC3 movaps NB2so+112-120(pA0), m1 mulps m2, m1 mulps NB3so+112-120(pA0), m2 addps m3, rC0 movaps 128-120(pB0), m3 addps m0, rC1 movaps NB0so+128-120(pA0), m0 mulps m3, m0 addps m1, rC2 movaps NB1so+128-120(pA0), m1 mulps m3, m1 addps m2, rC3 movaps NB2so+128-120(pA0), m2 mulps m3, m2 mulps NB3so+128-120(pA0), m3 addps m0, rC0 movaps 144-120(pB0), m0 addps m1, rC1 movaps NB0so+144-120(pA0), m1 mulps m0, m1 addps m2, rC2 movaps NB1so+144-120(pA0), m2 mulps m0, m2 addps m3, rC3 movaps NB2so+144-120(pA0), m3 mulps m0, m3 mulps NB3so+144-120(pA0), m0 addps m1, rC0 movaps 160-120(pB0), m1 addps m2, rC1 movaps NB0so+160-120(pA0), m2 mulps m1, m2 addps m3, rC2 movaps NB1so+160-120(pA0), m3 mulps m1, m3 addps m0, rC3 movaps NB2so+160-120(pA0), m0 mulps m1, m0 mulps NB3so+160-120(pA0), m1 addps m2, rC0 movaps 176-120(pB0), m2 addps m3, rC1 movaps NB0so+176-120(pA0), m3 mulps m2, m3 addps m0, rC2 movaps NB1so+176-120(pA0), m0 mulps m2, m0 addps m1, rC3 movaps NB2so+176-120(pA0), m1 mulps m2, m1 mulps NB3so+176-120(pA0), m2 addps m3, rC0 movaps 192-120(pB0), m3 addps m0, rC1 movaps NB0so+192-120(pA0), m0 mulps m3, m0 addps m1, rC2 movaps NB1so+192-120(pA0), m1 mulps m3, m1 addps m2, rC3 movaps NB2so+192-120(pA0), m2 mulps m3, m2 mulps NB3so+192-120(pA0), m3 addps m0, rC0 movaps 208-120(pB0), m0 addps m1, rC1 movaps NB0so+208-120(pA0), m1 mulps m0, m1 addps m2, rC2 movaps NB1so+208-120(pA0), m2 mulps m0, m2 addps m3, rC3 movaps NB2so+208-120(pA0), m3 mulps m0, m3 mulps NB3so+208-120(pA0), m0 addps m1, rC0 movaps 224-120(pB0), m1 addps m2, rC1 movaps NB0so+224-120(pA0), m2 mulps m1, m2 addps m3, rC2 movaps NB1so+224-120(pA0), 
m3 mulps m1, m3 addps m0, rC3 movaps NB2so+224-120(pA0), m0 mulps m1, m0 mulps NB3so+224-120(pA0), m1 addps m2, rC0 addps m3, rC1 addps m0, rC2 addps m1, rC3 # # Get these bastard things summed up correctly # Note this summation is Camm's, as his sequence was faster # than the piece of crap I came up with # movaps rC0, m0 # m0 = c0d c0c c0b c0a unpcklps rC1, rC0 # rC0 = c1b c0b c1a c0a movaps rC2, m1 # m1 = c2d c2c c2b c2a unpckhps rC1, m0 # m0 = c1d c0d c1c c0c prefB((pfB)) unpcklps rC3, rC2 # rC2 = c3b c2b c3a c2a addl $16, pfB addps m0, rC0 # rC0 = c1bd c0bd c1ac c0ac unpckhps rC3, m1 # m1 = c3d c2d c3c c2c addl $NB4so, pA0 movaps rC0, m0 # m0 = c1bd c0bd c1ac c0ac addps m1, rC2 # rC2 = c3bd c2bd c3ac c2ac shufps $0x44,rC2,rC0 # rC0 = c3ac c2ac c1ac c0ac pref2((pfA)) shufps $0xEE,rC2,m0 # m0 = c3bd c2bd c1bd c0bd addl $16, pfA addps m0, rC0 # rC0 = c3abcd c2abcd c1abcd c0abcd # rC1 = c1a c1b # rC2 = c2a c2b # # # Write results back to C # #ifdef SCPLX # rC0 = c3 c2 c1 c0 pshufd $0xB1, rC0, rC1 # rC1 = c2 c3 c0 c1 movhlps rC0, rC2 # rC2 = X X c3 c2 movhlps rC1, rC3 # rC3 = X X c2 c3 movss rC0, (pC) movss rC1, 8(pC) movss rC2, 16(pC) movss rC3, 24(pC) #else movups rC0, (pC) #endif # # pC += 6; pA += 2*NB # addl $CMUL(16), pC # # while (pA != stM); # subb $4, stM jnz MLOOP # # pC += incCn; pA -= NBNB; pB += NB; # addl incCn, pC # addl COFF(%esp), pC #if MB == 0 subl COFF+8(%esp), pA0 #else subl $MBKBso, pA0 #endif addl $NBso, pB # # while (pB != stN); # sub $1, stN jnz NLOOP # # Restore callee-saved iregs # movl 12(%esp), %ebp movl 8(%esp), %ebx movl 4(%esp), %esi movl (%esp), %edi addl $FSIZE, %esp ret Index: ATL_dmm2x1x40_5pABC.c =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/kernel/ClintWhaley/ATL_dmm2x1x40_5pABC.c,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** ATL_dmm2x1x40_5pABC.c 23 Jun 2003 02:57:21 -0000 1.3 --- ATL_dmm2x1x40_5pABC.c 18 Jul 2004 00:00:03 -0000 1.4 
*************** *** 67,74 **** # Prefetch defines # ! #define pref2(mem) prefetcht1 mem ! #define prefA(mem) prefetcht0 mem ! #define prefB(mem) prefetcht0 mem ! #define prefC(mem) prefetcht0 mem # byte: 28 32 36 40 # void ATL_AUSERMM(const int M, const int N, const int K, const TYPE alpha, --- 67,81 ---- # Prefetch defines # ! #ifdef ATL_SSE1 ! #define pref2(mem) prefetcht1 mem ! #define prefA(mem) prefetcht0 mem ! #define prefB(mem) prefetcht0 mem ! #define prefC(mem) prefetcht0 mem ! #else ! #define pref2(mem) ! #define prefA(mem) ! #define prefB(mem) ! #define prefC(mem) ! #endif # byte: 28 32 36 40 # void ATL_AUSERMM(const int M, const int N, const int K, const TYPE alpha, Index: ATL_dmm6x1x72_sse2.c =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/kernel/ClintWhaley/ATL_dmm6x1x72_sse2.c,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** ATL_dmm6x1x72_sse2.c 11 Jul 2004 16:41:50 -0000 1.13 --- ATL_dmm6x1x72_sse2.c 18 Jul 2004 00:00:04 -0000 1.14 *************** *** 63,74 **** # Prefetch defines # ! #if 1 ! #define pref2(mem) prefetcht1 mem ! #define prefB(mem) prefetcht1 mem ! #define prefC(mem) prefetcht0 mem #else ! #define pref2(mem) ! #define prefB(mem) ! #define prefC(mem) #endif # offset 4 8 12 16 --- 63,78 ---- # Prefetch defines # ! #if defined(ATL_ARCH_Efficeon) && KB <= 44 ! #define pref2(mem) prefetcht0 mem ! #define prefB(mem) prefetcht0 mem ! #define prefC(mem) prefetcht0 mem ! #elif 1 ! #define pref2(mem) prefetcht1 mem ! #define prefB(mem) prefetcht1 mem ! #define prefC(mem) prefetcht0 mem #else ! #define pref2(mem) ! #define prefB(mem) ! 
#define prefC(mem) #endif # offset 4 8 12 16 Index: clint.base =================================================================== RCS file: /cvsroot/math-atlas/AtlasBase/kernel/ClintWhaley/clint.base,v retrieving revision 1.21 retrieving revision 1.22 diff -C2 -d -r1.21 -r1.22 *** clint.base 11 Jul 2004 16:41:50 -0000 1.21 --- clint.base 18 Jul 2004 00:00:04 -0000 1.22 *************** *** 7,11 **** @endifdef @extract -b @(topd)/gen.inc what=crsetup ! @ROUT ATL_dmm6x1x60_sse2_32 ATL_smm6x1x120_sse ATL_smm6x1x80_sse @extract -b @(topd)/gen.inc @(cw04) what=cw @ROUT ATL_dmm6x1x30_x87 ATL_smm6x1x60_x87 --- 7,12 ---- @endifdef @extract -b @(topd)/gen.inc what=crsetup ! @ROUT ATL_dmm6x1x60_sse2_32 ATL_smm6x1x120_sse ATL_smm6x1x80_sse @\ ! ATL_dmm4x1x44_4_sse2 ATL_smm4x1x60_4_sse2 @extract -b @(topd)/gen.inc @(cw04) what=cw @ROUT ATL_dmm6x1x30_x87 ATL_smm6x1x60_x87 *************** *** 43,47 **** ATL_dmm14x1x56_sse2pABC_MN ATL_smm14x1x84_sse ATL_smm14x1x84_sseCU @\ ATL_dmm1x14x56_sse2pABC ATL_smm6x1x60_sse ATL_dmm6x1x60_sse2_32 @\ ! ATL_smm6x1x120_sse ATL_smm6x1x80_sse #ifndef Mjoin #define Mjoin(pre, nam) my_join(pre, nam) --- 44,49 ---- ATL_dmm14x1x56_sse2pABC_MN ATL_smm14x1x84_sse ATL_smm14x1x84_sseCU @\ ATL_dmm1x14x56_sse2pABC ATL_smm6x1x60_sse ATL_dmm6x1x60_sse2_32 @\ ! ATL_smm6x1x120_sse ATL_smm6x1x80_sse ATL_dmm4x1x44_4_sse2 @\ ! ATL_smm4x1x60_4_sse2 #ifndef Mjoin #define Mjoin(pre, nam) my_join(pre, nam) |