[Math-atlas-commits] CVS: AtlasBase/kernel/JulianRuhe ATL_dJIK30x30x30TN30x30x0_a1_b0.asm,NONE,1.1 A
Brought to you by:
rwhaley,
tonyc040457
Update of /cvsroot/math-atlas/AtlasBase/kernel/JulianRuhe
In directory usw-pr-cvs1:/tmp/cvs-serv23277/kernel/JulianRuhe

Added Files:
	ATL_dJIK30x30x30TN30x30x0_a1_b0.asm
	ATL_dJIK30x30x30TN30x30x0_a1_b0.cfg
	ATL_dJIK30x30x30TN30x30x0_a1_b1.asm
	ATL_dJIK30x30x30TN30x30x0_a1_b1.cfg
	ATL_dJIK30x30x30TN30x30x0_a1_bX.asm
	ATL_dJIK30x30x30TN30x30x0_a1_bX.cfg
	julian.base
	julian2_b0.o
	julian2_b1.o
	julian2_bX.o
	julian2_win_b0.o
	julian2_win_b1.o
	julian2_win_bX.o
Log Message:
changes for 3.3.8

--- NEW FILE: ATL_dJIK30x30x30TN30x30x0_a1_b0.asm ---
;
; ATL_dJIK30x30x30TN30x30x0_a1_b0.asm
;
; ATLAS "Speed of Light" DGEMM() kernel for AMD Athlon
; Code author: Julian Ruhe (ruh...@li... | Jul...@t-...)
;
; void ATL_dJIK30x30x30TN30x30x0_a1_b0(const int M, const int N, const int K, const double alpha,
;                                      const double *A, const int lda, const double *B, const int ldb,
;                                      const double beta, double *C, const int ldc)
;
; Compile with "nasmw -f win32 -DWIN32 ATL_dJIK30x30x30TN30x30x0_a1_b0.asm" (Windows)
; Compile with "nasm -f elf -DELF ATL_dJIK30x30x30TN30x30x0_a1_b0.asm" (LINUX)
;
; See config file (ATL_dJIK30x30x30TN30x30x0_a1_b0.cfg) for important macro definitions
;
;-----------------------------------------------------------------------
; Syntax: NASM (Intel operand order), 32-bit code, stack-based C arguments.
;
; Arguments actually read (relative to ebp after the prologue):
;   [ebp+28] A     [ebp+36] B     [ebp+52] C     [ebp+56] ldc
; M, N, K, alpha, lda, ldb and beta are never read by this variant; C is
; only written (fstp), never loaded.
;
; Stack frame while the loop runs (ebp itself is reused as an offset):
;   [esp+ 0] saved frame pointer (ebp)
;   [esp+ 4] &A + 5*NB*8 + 15*8          value eax is reset to per column
;   [esp+ 8] loop counter, starts at NB
;   [esp+12] 8*ldc                       byte stride between columns of C
;   [esp+16] (&A + NB*NB*8) - &B         added to ebx for the A+1 prefetch
;   [esp+20] t1                          scratch slot used to spill edx
;
; Registers inside the loop:
;   eax  pointer into A, advanced by edx after every group of six results
;   ebx  pointer into the current column of B (biased by 15*8)
;   ecx  pointer into the current column of C (biased by 15*8)
;   edx  6*NB*8
;   edi  -1*NB*8, esi -3*NB*8, ebp -5*NB*8
;        (presumably consumed by the DOTPn definitions in the .mcr file)
;
; NB, ELMn, DOTPn and OPERATION come from ATL_dJIK30x30x30TN30x30x0_a1.mcr.
; NOTE(review): the rep prefixes, nop, fnop, "mov edx,edx" and
; "mov eax,eax" look like deliberate padding for instruction alignment;
; do not remove or reorder them without re-timing the kernel.
;-----------------------------------------------------------------------
%include "ATL_dJIK30x30x30TN30x30x0_a1_b0.cfg"
%include "ATL_dJIK30x30x30TN30x30x0_a1.mcr"

%ifdef WIN32
global _ATL_dJIK30x30x30TN30x30x0_a1_b0
section .text
_ATL_dJIK30x30x30TN30x30x0_a1_b0:
%endif

%ifdef ELF
global ATL_dJIK30x30x30TN30x30x0_a1_b0
section .text
ATL_dJIK30x30x30TN30x30x0_a1_b0:
%endif

        push ebp
        mov ebp,esp
        push ebx
        push esi
        push edi
        femms

        mov eax,0                       ;temporary variable t1
        push eax                        ;t1->stack
        mov eax,[ebp+28]                ;&A->eax
        add eax,NB*NB*8                 ;&A+1->eax
        mov ebx,[ebp+36]                ;&B->ebx
        sub eax,ebx                     ;calculate offset
        push eax                        ;&A+1+offset->stack
        mov eax,[ebp+56]                ;ldc->eax
        lea eax,[8*eax]
        push eax                        ;8*ldc->stack
        mov eax,NB
        push eax                        ;loop counter->stack
        mov eax,[ebp+28]                ;&A->eax
        mov ebx,[ebp+36]                ;&B->ebx
        mov ecx,[ebp+52]                ;&C->ecx
        add ecx,byte 15*8               ;calculate offsets
        add ebx,byte 15*8
        add eax,5*NB*8+15*8
        push eax                        ;&A+offset->stack
        push ebp                        ;ebp->stack
        mov edi,-1*NB*8                 ;calculate offsets for dot products
        mov esi,-3*NB*8
        mov ebp,-5*NB*8
        mov edx,6*NB*8                  ;offset for the next 6 dot products

        align 16
loopj_:
; ---- group 1: results stored to C elements ELM1..ELM6 ----
        fld qword [eax+DOTP1+ELM1]      ;01+1
        fmul qword [ebx+ELM1]
        nop
        fld qword [eax+DOTP2+ELM1]
        fmul qword [ebx+ELM1]
        nop
        fld qword [eax+DOTP3+ELM1]
        fmul qword [ebx+ELM1]
        nop
        fld qword [ebx+ELM2]
        fld qword [eax+DOTP5+ELM1]
        nop
        fmul qword [ebx+ELM1]
        fld qword [eax+DOTP6+ELM1]
        mov edx,edx
        fmul qword [ebx+ELM1]
        fld qword [eax+DOTP4+ELM1]
        nop
        rep fmul qword [ebx+ELM1]
        fxch st3
        mov edx,edx

        OPERATION 2,3                   ;02+1
        OPERATION 3,4                   ;03+1
        OPERATION 4,5                   ;04+1
        OPERATION 5,6                   ;05+1
        OPERATION 6,7                   ;06+1
        OPERATION 7,8                   ;07+1
        OPERATION 8,9                   ;08+1
        OPERATION 9,10                  ;09+1
        OPERATION 10,11                 ;10+1
        OPERATION 11,12                 ;11+1
        OPERATION 12,13                 ;12+1
        OPERATION 13,14                 ;13+1
        OPERATION 14,15                 ;14+1
        OPERATION 15,16                 ;15+1
        OPERATION 16,17                 ;16+1
        OPERATION 17,18                 ;17+1
        OPERATION 18,19                 ;18+1
        OPERATION 19,20                 ;19+1
        OPERATION 20,21                 ;20+1
        OPERATION 21,22                 ;21+1
        OPERATION 22,23                 ;22+1
        OPERATION 23,24                 ;23+1
        OPERATION 24,25                 ;24+1
        OPERATION 25,26                 ;25+1
        OPERATION 26,27                 ;26+1
        OPERATION 27,28                 ;27+1
        OPERATION 28,29                 ;28+1
        OPERATION 29,30                 ;29+1

        fld qword [eax+DOTP1+ELM30]     ;30+1
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
%endif

%ifdef PREB_DST4
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
        prefetch [ebx+30*8-1*64]
        nop
        prefetch [ebx+30*8+0*64]
        nop
        prefetch [ebx+30*8+1*64]
        nop
        prefetch [ebx+30*8+2*64]
        nop
%endif

        fstp qword [ecx+ELM1]
        fxch st3
        fstp qword [ecx+ELM2]
        fxch st1
        fstp qword [ecx+ELM3]
        fstp qword [ecx+ELM4]
        fstp qword [ecx+ELM5]
        fstp qword [ecx+ELM6]

        add eax,edx

; ---- group 2: results stored to C elements ELM7..ELM12 ----
        fld qword [eax+DOTP1+ELM1]      ;01+2
        fmul qword [ebx+ELM1]
        nop
        fld qword [eax+DOTP2+ELM1]
        fmul qword [ebx+ELM1]
        nop
        fld qword [eax+DOTP3+ELM1]
        fmul qword [ebx+ELM1]
        nop
        fld qword [ebx+ELM2]
        fld qword [eax+DOTP5+ELM1]
        nop
        fmul qword [ebx+ELM1]
        fld qword [eax+DOTP6+ELM1]
        mov edx,edx
        fmul qword [ebx+ELM1]
        fld qword [eax+DOTP4+ELM1]
        nop
        rep fmul qword [ebx+ELM1]
        fxch st3
        mov edx,edx

        OPERATION 2,3                   ;02+2
        OPERATION 3,4                   ;03+2
        OPERATION 4,5                   ;04+2
        OPERATION 5,6                   ;05+2
        OPERATION 6,7                   ;06+2
        OPERATION 7,8                   ;07+2
        OPERATION 8,9                   ;08+2
        OPERATION 9,10                  ;09+2
        OPERATION 10,11                 ;10+2
        OPERATION 11,12                 ;11+2
        OPERATION 12,13                 ;12+2
        OPERATION 13,14                 ;13+2
        OPERATION 14,15                 ;14+2
        OPERATION 15,16                 ;15+2
        OPERATION 16,17                 ;16+2
        OPERATION 17,18                 ;17+2
        OPERATION 18,19                 ;18+2
        OPERATION 19,20                 ;19+2
        OPERATION 20,21                 ;20+2
        OPERATION 21,22                 ;21+2
        OPERATION 22,23                 ;22+2
        OPERATION 23,24                 ;23+2
        OPERATION 24,25                 ;24+2
        OPERATION 25,26                 ;25+2
        OPERATION 26,27                 ;26+2
        OPERATION 27,28                 ;27+2
        OPERATION 28,29                 ;28+2
        OPERATION 29,30                 ;29+2

        fld qword [eax+DOTP1+ELM30]     ;30+2
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8-1*64]
        nop
%endif

%ifdef PREB_DST3
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
        prefetch [ebx+30*8-1*64]
        nop
        prefetch [ebx+30*8+0*64]
        nop
        prefetch [ebx+30*8+1*64]
        nop
        prefetch [ebx+30*8+2*64]
        nop
%endif

        fstp qword [ecx+ELM7]
        fxch st3
        fstp qword [ecx+ELM8]
        fxch st1
        fstp qword [ecx+ELM9]
        fstp qword [ecx+ELM10]
        fstp qword [ecx+ELM11]
        fstp qword [ecx+ELM12]

        add eax,edx

; ---- group 3: results stored to C elements ELM13..ELM18 ----
        fld qword [eax+DOTP1+ELM1]      ;01+3
        fmul qword [ebx+ELM1]
        nop
        fld qword [eax+DOTP2+ELM1]
        fmul qword [ebx+ELM1]
        nop
        fld qword [eax+DOTP3+ELM1]
        fmul qword [ebx+ELM1]
        nop
        fld qword [ebx+ELM2]
        fld qword [eax+DOTP5+ELM1]
        nop
        fmul qword [ebx+ELM1]
        fld qword [eax+DOTP6+ELM1]
        mov edx,edx
        fmul qword [ebx+ELM1]
        fld qword [eax+DOTP4+ELM1]
        nop
        rep fmul qword [ebx+ELM1]
        fxch st3
        mov edx,edx

        OPERATION 2,3                   ;02+3
        OPERATION 3,4                   ;03+3
        OPERATION 4,5                   ;04+3
        OPERATION 5,6                   ;05+3
        OPERATION 6,7                   ;06+3
        OPERATION 7,8                   ;07+3
        OPERATION 8,9                   ;08+3
        OPERATION 9,10                  ;09+3
        OPERATION 10,11                 ;10+3
        OPERATION 11,12                 ;11+3
        OPERATION 12,13                 ;12+3
        OPERATION 13,14                 ;13+3
        OPERATION 14,15                 ;14+3
        OPERATION 15,16                 ;15+3
        OPERATION 16,17                 ;16+3
        OPERATION 17,18                 ;17+3
        OPERATION 18,19                 ;18+3
        OPERATION 19,20                 ;19+3
        OPERATION 20,21                 ;20+3
        OPERATION 21,22                 ;21+3
        OPERATION 22,23                 ;22+3
        OPERATION 23,24                 ;23+3
        OPERATION 24,25                 ;24+3
        OPERATION 25,26                 ;25+3
        OPERATION 26,27                 ;26+3
        OPERATION 27,28                 ;27+3
        OPERATION 28,29                 ;28+3
        OPERATION 29,30                 ;29+3

        fld qword [eax+DOTP1+ELM30]     ;30+3
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8+0*64]
        nop
%endif

%ifdef PREB_DST2
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
        prefetch [ebx+30*8-1*64]
        nop
        prefetch [ebx+30*8+0*64]
        nop
        prefetch [ebx+30*8+1*64]
        nop
        prefetch [ebx+30*8+2*64]
        nop
%endif

        fstp qword [ecx+ELM13]
        fxch st3
        fstp qword [ecx+ELM14]
        rep fxch st1
        fstp qword [ecx+ELM15]
        fstp qword [ecx+ELM16]
        fstp qword [ecx+ELM17]
        fstp qword [ecx+ELM18]

        add eax,edx

; ---- group 4: results stored to C elements ELM19..ELM24 ----
        fld qword [eax+DOTP1+ELM1]      ;01+4
        fmul qword [ebx+ELM1]
        nop
        fld qword [eax+DOTP2+ELM1]
        fmul qword [ebx+ELM1]
        nop
        fld qword [eax+DOTP3+ELM1]
        fmul qword [ebx+ELM1]
        nop
        fld qword [ebx+ELM2]
        fld qword [eax+DOTP5+ELM1]
        nop
        fmul qword [ebx+ELM1]
        fld qword [eax+DOTP6+ELM1]
        mov edx,edx
        fmul qword [ebx+ELM1]
        fld qword [eax+DOTP4+ELM1]
        nop
        rep fmul qword [ebx+ELM1]
        fxch st3
        mov edx,edx

        OPERATION 2,3                   ;02+4
        OPERATION 3,4                   ;03+4
        OPERATION 4,5                   ;04+4
        OPERATION 5,6                   ;05+4
        OPERATION 6,7                   ;06+4
        OPERATION 7,8                   ;07+4
        OPERATION 8,9                   ;08+4
        OPERATION 9,10                  ;09+4
        OPERATION 10,11                 ;10+4
        OPERATION 11,12                 ;11+4
        OPERATION 12,13                 ;12+4
        OPERATION 13,14                 ;13+4
        OPERATION 14,15                 ;14+4
        OPERATION 15,16                 ;15+4
        OPERATION 16,17                 ;16+4
        OPERATION 17,18                 ;17+4
        OPERATION 18,19                 ;18+4
        OPERATION 19,20                 ;19+4
        OPERATION 20,21                 ;20+4
        OPERATION 21,22                 ;21+4
        OPERATION 22,23                 ;22+4
        OPERATION 23,24                 ;23+4
        OPERATION 24,25                 ;24+4
        OPERATION 25,26                 ;25+4
        OPERATION 26,27                 ;26+4
        OPERATION 27,28                 ;27+4
        OPERATION 28,29                 ;28+4
        OPERATION 29,30                 ;29+4

        fld qword [eax+DOTP1+ELM30]     ;30+4
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8+1*64]
        nop
%endif

%ifdef PREB_DST1
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
        prefetch [ebx+30*8-1*64]
        nop
        prefetch [ebx+30*8+0*64]
        nop
        prefetch [ebx+30*8+1*64]
        nop
        prefetch [ebx+30*8+2*64]
        nop
%endif

        fstp qword [ecx+ELM19]
        fxch st3
        fstp qword [ecx+ELM20]
        fxch st1
        fstp qword [ecx+ELM21]
        fstp qword [ecx+ELM22]
        fstp qword [ecx+ELM23]
        fstp qword [ecx+ELM24]

        add eax,edx

; ---- group 5: results stored to C elements ELM25..ELM30 ----
        fld qword [eax+DOTP1+ELM1]      ;01+5
        fmul qword [ebx+ELM1]
        nop
        fld qword [eax+DOTP2+ELM1]
        fmul qword [ebx+ELM1]
        nop
        fld qword [eax+DOTP3+ELM1]
        fmul qword [ebx+ELM1]
        nop
        fld qword [ebx+ELM2]
        fld qword [eax+DOTP5+ELM1]
        nop
        fmul qword [ebx+ELM1]
        fld qword [eax+DOTP6+ELM1]
        mov edx,edx
        fmul qword [ebx+ELM1]
        fld qword [eax+DOTP4+ELM1]
        nop
        rep fmul qword [ebx+ELM1]
        fxch st3
        mov edx,edx

        OPERATION 2,3                   ;02+5
        OPERATION 3,4                   ;03+5
        OPERATION 4,5                   ;04+5
        OPERATION 5,6                   ;05+5
        OPERATION 6,7                   ;06+5
        OPERATION 7,8                   ;07+5
        OPERATION 8,9                   ;08+5
        OPERATION 9,10                  ;09+5
        OPERATION 10,11                 ;10+5
        OPERATION 11,12                 ;11+5
        OPERATION 12,13                 ;12+5
        OPERATION 13,14                 ;13+5
        OPERATION 14,15                 ;14+5
        OPERATION 15,16                 ;15+5
        OPERATION 16,17                 ;16+5
        OPERATION 17,18                 ;17+5
        OPERATION 18,19                 ;18+5
        OPERATION 19,20                 ;19+5
        OPERATION 20,21                 ;20+5
        OPERATION 21,22                 ;21+5
        OPERATION 22,23                 ;22+5
        OPERATION 23,24                 ;23+5
        OPERATION 24,25                 ;24+5
        OPERATION 25,26                 ;25+5
        OPERATION 26,27                 ;26+5
        OPERATION 27,28                 ;27+5
        OPERATION 28,29                 ;28+5
        OPERATION 29,30                 ;29+5

        fld qword [eax+DOTP1+ELM30]     ;30+5
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8+2*64]
        nop
%endif

%ifdef PREA_EN
        ; edx is spilled to t1 so it can hold the prefetch address
        mov [esp+20],edx                ;save edx in t1
        mov edx,[esp+16]                ;&A+1->edx
        lea edx,[edx+ebx]
        prefetch [edx-2*64]
        nop
        prefetch [edx-1*64]
        prefetch [edx+0*64]
        nop
        prefetch [edx+1*64]
        prefetch [edx+2*64-8]
        mov edx,[esp+20]                ;restore edx
        mov eax,eax
        fnop
%endif

        fstp qword [ecx+ELM25]
        fxch st3
        fstp qword [ecx+ELM26]
        fxch st1
        fstp qword [ecx+ELM27]
        fstp qword [ecx+ELM28]
        fstp qword [ecx+ELM29]
        fstp qword [ecx+ELM30]

        sub ebx,edi                     ;next column of B
        mov eax,[esp+4]                 ;reset eax
        add ecx,[esp+12]                ;next column of C (+ldc*8)
        dec dword [esp+8]               ;dec counter
        jnz near loopj_

end_:
        femms
        pop ebp
        add esp,byte 5*4                ;remove local variables
        pop edi                         ;restore registers
        pop esi
        pop ebx
        leave                           ;mov esp,ebp / pop ebp
        ret

--- NEW FILE: ATL_dJIK30x30x30TN30x30x0_a1_b0.cfg ---
;
; ATL_dJIK30x30x30TN30x30x0_a1_b0.cfg
;
; ATLAS "Speed of Light" DGEMM() kernel for AMD Athlon
; Code author: Julian Ruhe (ruh...@li... | Jul...@t-...)
;
;define PREB_DIS to disable prefetching of B
;define PREB_DST1 to set B-prefetching distance to 1
;define PREB_DST2 to set B-prefetching distance to 2
;define PREB_DST3 to set B-prefetching distance to 3
;define PREB_DST4 to set B-prefetching distance to 4
;define PREB_SPLIT to use an alternative prefetching strategy
;
;define PREA_EN to enable prefetching of A+1
;define PREA_DIS to disable prefetching of A+1
;
;All these macros can be defined in the NASM command line, if no %defines are active in this file
;
;Defaults: PREA_EN unless PREA_DIS is given; PREB_DST2 unless PREB_DIS,
;PREB_SPLIT or one of PREB_DST1..4 is given. PREB_SPLIT suppresses the
;default so that the alternative strategy replaces the PREB_DST2 prefetch
;block instead of being emitted in addition to it. Define both
;PREB_SPLIT and a PREB_DSTn explicitly to get both.
;
%ifndef PREA_EN
%ifndef PREA_DIS
%define PREA_EN
%endif
%endif

%ifndef PREB_DIS
%ifndef PREB_SPLIT
%ifndef PREB_DST1
%ifndef PREB_DST2
%ifndef PREB_DST3
%ifndef PREB_DST4
%define PREB_DST2
%endif
%endif
%endif
%endif
%endif
%endif

--- NEW FILE: ATL_dJIK30x30x30TN30x30x0_a1_b1.asm ---
;
; ATL_dJIK30x30x30TN30x30x0_a1_b1.asm
;
; ATLAS "Speed of Light" DGEMM() kernel for AMD Athlon
; Code author: Julian Ruhe (ruh...@li... | Jul...@t-...)
;
; void ATL_dJIK30x30x30TN30x30x0_a1_b1(const int M, const int N, const int K, const double alpha,
;                                      const double *A, const int lda, const double *B, const int ldb,
;                                      const double beta, double *C, const int ldc)
;
; Compile with "nasmw -f win32 -DWIN32 ATL_dJIK30x30x30TN30x30x0_a1_b1.asm" (Windows)
; Compile with "nasm -f elf -DELF ATL_dJIK30x30x30TN30x30x0_a1_b1.asm" (LINUX)
;
; See config file (ATL_dJIK30x30x30TN30x30x0_a1_b1.cfg) for important macro definitions
;
;-----------------------------------------------------------------------
; Syntax: NASM (Intel operand order), 32-bit code, stack-based C arguments.
;
; Arguments actually read (relative to ebp after the prologue):
;   [ebp+28] A     [ebp+36] B     [ebp+52] C     [ebp+56] ldc
; M, N, K, alpha, lda, ldb and beta are never read by this variant. The
; existing C values are added to the first products of every group
; (fadd qword [ecx+ELMn]) before the result is stored back.
;
; Stack frame while the loop runs (ebp itself is reused as an offset):
;   [esp+ 0] saved frame pointer (ebp)
;   [esp+ 4] &A + 5*NB*8                 value eax is reset to per column
;   [esp+ 8] loop counter, starts at NB
;   [esp+12] 8*ldc                       byte stride between columns of C
;   [esp+16] (&A + NB*NB*8) - &B         added to ebx for the A+1 prefetch
;   [esp+20] t1                          scratch slot used to spill edx
;
; Registers inside the loop:
;   eax  pointer into A; every group first adds 15*8, then edx
;        (6*NB*8-15*8), i.e. 6*NB*8 in total per group
;   ebx  pointer into the current column of B (biased by 15*8)
;   ecx  pointer into the current column of C (biased by 15*8)
;   edi  -1*NB*8, esi -3*NB*8, ebp -5*NB*8
;        (presumably consumed by the DOTPn definitions in the .mcr file)
;
; NB, ELMn, DOTPn and OPERATION come from ATL_dJIK30x30x30TN30x30x0_a1.mcr.
; NOTE(review): the rep prefixes, nop, fnop, "mov edx,edx" and
; "mov eax,eax" look like deliberate padding for instruction alignment;
; do not remove or reorder them without re-timing the kernel.
;-----------------------------------------------------------------------
%include "ATL_dJIK30x30x30TN30x30x0_a1_b1.cfg"
%include "ATL_dJIK30x30x30TN30x30x0_a1.mcr"

%ifdef WIN32
global _ATL_dJIK30x30x30TN30x30x0_a1_b1
section .text
_ATL_dJIK30x30x30TN30x30x0_a1_b1:
%endif

%ifdef ELF
global ATL_dJIK30x30x30TN30x30x0_a1_b1
section .text
ATL_dJIK30x30x30TN30x30x0_a1_b1:
%endif

        push ebp
        mov ebp,esp
        push ebx
        push esi
        push edi
        femms

        mov eax,0                       ;temporary variable t1
        push eax                        ;t1->stack
        mov eax,[ebp+28]                ;&A->eax
        add eax,NB*NB*8                 ;&A+1->eax
        mov ebx,[ebp+36]                ;&B->ebx
        sub eax,ebx                     ;calculate offset
        push eax                        ;&A+1+offset->stack
        mov eax,[ebp+56]                ;ldc->eax
        lea eax,[8*eax]
        push eax                        ;8*ldc->stack
        mov eax,NB
        push eax                        ;loop counter->stack
        mov eax,[ebp+28]                ;&A->eax
        mov ebx,[ebp+36]                ;&B->ebx
        mov ecx,[ebp+52]                ;&C->ecx
        add ecx,byte 15*8               ;calculate offsets
        add ebx,byte 15*8
        add eax,5*NB*8
        push eax                        ;&A+offset->stack
        push ebp                        ;ebp->stack
        mov edi,-1*NB*8                 ;calculate offsets for dot products
        mov esi,-3*NB*8
        mov ebp,-5*NB*8
        mov edx,6*NB*8-15*8             ;offset for the next 6 dot products

        align 16
loopj_:
; ---- group 1: C elements ELM1..ELM6 are read, accumulated, stored ----
        fld qword [ebx+ELM1]            ;01+1
        fld qword [eax+DOTP2]
        fmul st0,st1
        fadd qword [ecx+ELM2]
        fld qword [eax+DOTP3]
        fmul st0,st2
        fadd qword [ecx+ELM3]
        fld qword [eax+DOTP1]
        fmul st0,st3
        fadd qword [ecx+ELM1]
        fxch st0,st3
        fld qword [eax+DOTP5]
        rep fmul st0,st1
        fadd qword [ecx+ELM5]
        fld qword [eax+DOTP6]
        fmul st0,st2
        fadd qword [ecx+ELM6]
        fld qword [eax+DOTP4]
        rep fmulp st3,st0
        fld qword [ecx+ELM4]
        faddp st3,st0
        fld qword [ebx+ELM2]
        add eax,byte 15*8
        mov edx,edx

        OPERATION 2,3                   ;02+1
        OPERATION 3,4                   ;03+1
        OPERATION 4,5                   ;04+1
        OPERATION 5,6                   ;05+1
        OPERATION 6,7                   ;06+1
        OPERATION 7,8                   ;07+1
        OPERATION 8,9                   ;08+1
        OPERATION 9,10                  ;09+1
        OPERATION 10,11                 ;10+1
        OPERATION 11,12                 ;11+1
        OPERATION 12,13                 ;12+1
        OPERATION 13,14                 ;13+1
        OPERATION 14,15                 ;14+1
        OPERATION 15,16                 ;15+1
        OPERATION 16,17                 ;16+1
        OPERATION 17,18                 ;17+1
        OPERATION 18,19                 ;18+1
        OPERATION 19,20                 ;19+1
        OPERATION 20,21                 ;20+1
        OPERATION 21,22                 ;21+1
        OPERATION 22,23                 ;22+1
        OPERATION 23,24                 ;23+1
        OPERATION 24,25                 ;24+1
        OPERATION 25,26                 ;25+1
        OPERATION 26,27                 ;26+1
        OPERATION 27,28                 ;27+1
        OPERATION 28,29                 ;28+1
        OPERATION 29,30                 ;29+1

        fld qword [eax+DOTP1+ELM30]     ;30+1
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
%endif

%ifdef PREB_DST4
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
        prefetch [ebx+30*8-1*64]
        nop
        prefetch [ebx+30*8+0*64]
        nop
        prefetch [ebx+30*8+1*64]
        nop
        prefetch [ebx+30*8+2*64]
        nop
%endif

        fstp qword [ecx+ELM1]
        fxch st3
        fstp qword [ecx+ELM2]
        fxch st1
        fstp qword [ecx+ELM3]
        fstp qword [ecx+ELM4]
        fstp qword [ecx+ELM5]
        fstp qword [ecx+ELM6]

        add eax,edx

; ---- group 2: C elements ELM7..ELM12 ----
        fld qword [ebx+ELM1]            ;01+2
        fld qword [eax+DOTP2]
        fmul st0,st1
        fadd qword [ecx+ELM8]
        fld qword [eax+DOTP3]
        fmul st0,st2
        fadd qword [ecx+ELM9]
        fld qword [eax+DOTP1]
        fmul st0,st3
        fadd qword [ecx+ELM7]
        fxch st0,st3
        fld qword [eax+DOTP5]
        rep fmul st0,st1
        fadd qword [ecx+ELM11]
        fld qword [eax+DOTP6]
        fmul st0,st2
        fadd qword [ecx+ELM12]
        fld qword [eax+DOTP4]
        rep fmulp st3,st0
        fld qword [ecx+ELM10]
        faddp st3,st0
        fld qword [ebx+ELM2]
        add eax,byte 15*8
        mov edx,edx

        OPERATION 2,3                   ;02+2
        OPERATION 3,4                   ;03+2
        OPERATION 4,5                   ;04+2
        OPERATION 5,6                   ;05+2
        OPERATION 6,7                   ;06+2
        OPERATION 7,8                   ;07+2
        OPERATION 8,9                   ;08+2
        OPERATION 9,10                  ;09+2
        OPERATION 10,11                 ;10+2
        OPERATION 11,12                 ;11+2
        OPERATION 12,13                 ;12+2
        OPERATION 13,14                 ;13+2
        OPERATION 14,15                 ;14+2
        OPERATION 15,16                 ;15+2
        OPERATION 16,17                 ;16+2
        OPERATION 17,18                 ;17+2
        OPERATION 18,19                 ;18+2
        OPERATION 19,20                 ;19+2
        OPERATION 20,21                 ;20+2
        OPERATION 21,22                 ;21+2
        OPERATION 22,23                 ;22+2
        OPERATION 23,24                 ;23+2
        OPERATION 24,25                 ;24+2
        OPERATION 25,26                 ;25+2
        OPERATION 26,27                 ;26+2
        OPERATION 27,28                 ;27+2
        OPERATION 28,29                 ;28+2
        OPERATION 29,30                 ;29+2

        fld qword [eax+DOTP1+ELM30]     ;30+2
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8-1*64]
        nop
%endif

%ifdef PREB_DST3
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
        prefetch [ebx+30*8-1*64]
        nop
        prefetch [ebx+30*8+0*64]
        nop
        prefetch [ebx+30*8+1*64]
        nop
        prefetch [ebx+30*8+2*64]
        nop
%endif

        fstp qword [ecx+ELM7]
        fxch st3
        fstp qword [ecx+ELM8]
        fxch st1
        fstp qword [ecx+ELM9]
        fstp qword [ecx+ELM10]
        fstp qword [ecx+ELM11]
        fstp qword [ecx+ELM12]

        add eax,edx

; ---- group 3: C elements ELM13..ELM18 ----
        fld qword [ebx+ELM1]            ;01+3
        fld qword [eax+DOTP2]
        fmul st0,st1
        fadd qword [ecx+ELM14]
        fld qword [eax+DOTP3]
        fmul st0,st2
        fadd qword [ecx+ELM15]
        fld qword [eax+DOTP1]
        fmul st0,st3
        fadd qword [ecx+ELM13]
        fxch st0,st3
        fld qword [eax+DOTP5]
        rep fmul st0,st1
        fadd qword [ecx+ELM17]
        fld qword [eax+DOTP6]
        fmul st0,st2
        fadd qword [ecx+ELM18]
        fld qword [eax+DOTP4]
        rep fmulp st3,st0
        rep fld qword [ecx+ELM16]
        faddp st3,st0
        fld qword [ebx+ELM2]
        add eax,byte 15*8
        mov edx,edx

        OPERATION 2,3                   ;02+3
        OPERATION 3,4                   ;03+3
        OPERATION 4,5                   ;04+3
        OPERATION 5,6                   ;05+3
        OPERATION 6,7                   ;06+3
        OPERATION 7,8                   ;07+3
        OPERATION 8,9                   ;08+3
        OPERATION 9,10                  ;09+3
        OPERATION 10,11                 ;10+3
        OPERATION 11,12                 ;11+3
        OPERATION 12,13                 ;12+3
        OPERATION 13,14                 ;13+3
        OPERATION 14,15                 ;14+3
        OPERATION 15,16                 ;15+3
        OPERATION 16,17                 ;16+3
        OPERATION 17,18                 ;17+3
        OPERATION 18,19                 ;18+3
        OPERATION 19,20                 ;19+3
        OPERATION 20,21                 ;20+3
        OPERATION 21,22                 ;21+3
        OPERATION 22,23                 ;22+3
        OPERATION 23,24                 ;23+3
        OPERATION 24,25                 ;24+3
        OPERATION 25,26                 ;25+3
        OPERATION 26,27                 ;26+3
        OPERATION 27,28                 ;27+3
        OPERATION 28,29                 ;28+3
        OPERATION 29,30                 ;29+3

        fld qword [eax+DOTP1+ELM30]     ;30+3
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8+0*64]
        nop
%endif

%ifdef PREB_DST2
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
        prefetch [ebx+30*8-1*64]
        nop
        prefetch [ebx+30*8+0*64]
        nop
        prefetch [ebx+30*8+1*64]
        nop
        prefetch [ebx+30*8+2*64]
        nop
%endif

        fstp qword [ecx+ELM13]
        fxch st3
        fstp qword [ecx+ELM14]
        rep fxch st1
        fstp qword [ecx+ELM15]
        fstp qword [ecx+ELM16]
        fstp qword [ecx+ELM17]
        fstp qword [ecx+ELM18]

        add eax,edx

; ---- group 4: C elements ELM19..ELM24 ----
        fld qword [ebx+ELM1]            ;01+4
        fld qword [eax+DOTP2]
        fmul st0,st1
        fadd qword [ecx+ELM20]
        fld qword [eax+DOTP3]
        fmul st0,st2
        fadd qword [ecx+ELM21]
        fld qword [eax+DOTP1]
        fmul st0,st3
        fadd qword [ecx+ELM19]
        fxch st0,st3
        fld qword [eax+DOTP5]
        rep fmul st0,st1
        fadd qword [ecx+ELM23]
        fld qword [eax+DOTP6]
        fmul st0,st2
        fadd qword [ecx+ELM24]
        fld qword [eax+DOTP4]
        rep fmulp st3,st0
        fld qword [ecx+ELM22]
        faddp st3,st0
        fld qword [ebx+ELM2]
        add eax,byte 15*8
        mov edx,edx

        OPERATION 2,3                   ;02+4
        OPERATION 3,4                   ;03+4
        OPERATION 4,5                   ;04+4
        OPERATION 5,6                   ;05+4
        OPERATION 6,7                   ;06+4
        OPERATION 7,8                   ;07+4
        OPERATION 8,9                   ;08+4
        OPERATION 9,10                  ;09+4
        OPERATION 10,11                 ;10+4
        OPERATION 11,12                 ;11+4
        OPERATION 12,13                 ;12+4
        OPERATION 13,14                 ;13+4
        OPERATION 14,15                 ;14+4
        OPERATION 15,16                 ;15+4
        OPERATION 16,17                 ;16+4
        OPERATION 17,18                 ;17+4
        OPERATION 18,19                 ;18+4
        OPERATION 19,20                 ;19+4
        OPERATION 20,21                 ;20+4
        OPERATION 21,22                 ;21+4
        OPERATION 22,23                 ;22+4
        OPERATION 23,24                 ;23+4
        OPERATION 24,25                 ;24+4
        OPERATION 25,26                 ;25+4
        OPERATION 26,27                 ;26+4
        OPERATION 27,28                 ;27+4
        OPERATION 28,29                 ;28+4
        OPERATION 29,30                 ;29+4

        fld qword [eax+DOTP1+ELM30]     ;30+4
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8+1*64]
        nop
%endif

%ifdef PREB_DST1
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
        prefetch [ebx+30*8-1*64]
        nop
        prefetch [ebx+30*8+0*64]
        nop
        prefetch [ebx+30*8+1*64]
        nop
        prefetch [ebx+30*8+2*64]
        nop
%endif

        fstp qword [ecx+ELM19]
        fxch st3
        fstp qword [ecx+ELM20]
        fxch st1
        fstp qword [ecx+ELM21]
        fstp qword [ecx+ELM22]
        fstp qword [ecx+ELM23]
        fstp qword [ecx+ELM24]

        add eax,edx

; ---- group 5: C elements ELM25..ELM30 ----
        fld qword [ebx+ELM1]            ;01+5
        fld qword [eax+DOTP2]
        fmul st0,st1
        fadd qword [ecx+ELM26]
        fld qword [eax+DOTP3]
        fmul st0,st2
        fadd qword [ecx+ELM27]
        fld qword [eax+DOTP1]
        fmul st0,st3
        fadd qword [ecx+ELM25]
        fxch st0,st3
        fld qword [eax+DOTP5]
        rep fmul st0,st1
        fadd qword [ecx+ELM29]
        fld qword [eax+DOTP6]
        fmul st0,st2
        fadd qword [ecx+ELM30]
        fld qword [eax+DOTP4]
        rep fmulp st3,st0
        fld qword [ecx+ELM28]
        faddp st3,st0
        fld qword [ebx+ELM2]
        add eax,byte 15*8
        mov edx,edx

        OPERATION 2,3                   ;02+5
        OPERATION 3,4                   ;03+5
        OPERATION 4,5                   ;04+5
        OPERATION 5,6                   ;05+5
        OPERATION 6,7                   ;06+5
        OPERATION 7,8                   ;07+5
        OPERATION 8,9                   ;08+5
        OPERATION 9,10                  ;09+5
        OPERATION 10,11                 ;10+5
        OPERATION 11,12                 ;11+5
        OPERATION 12,13                 ;12+5
        OPERATION 13,14                 ;13+5
        OPERATION 14,15                 ;14+5
        OPERATION 15,16                 ;15+5
        OPERATION 16,17                 ;16+5
        OPERATION 17,18                 ;17+5
        OPERATION 18,19                 ;18+5
        OPERATION 19,20                 ;19+5
        OPERATION 20,21                 ;20+5
        OPERATION 21,22                 ;21+5
        OPERATION 22,23                 ;22+5
        OPERATION 23,24                 ;23+5
        OPERATION 24,25                 ;24+5
        OPERATION 25,26                 ;25+5
        OPERATION 26,27                 ;26+5
        OPERATION 27,28                 ;27+5
        OPERATION 28,29                 ;28+5
        OPERATION 29,30                 ;29+5

        fld qword [eax+DOTP1+ELM30]     ;30+5
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8+2*64]
        nop
%endif

%ifdef PREA_EN
        ; edx is spilled to t1 so it can hold the prefetch address
        mov [esp+20],edx                ;save edx in t1
        mov edx,[esp+16]                ;&A+1->edx
        lea edx,[edx+ebx]
        prefetch [edx-2*64]
        nop
        prefetch [edx-1*64]
        prefetch [edx+0*64]
        nop
        prefetch [edx+1*64]
        prefetch [edx+2*64-8]
        mov edx,[esp+20]                ;restore edx
        mov eax,eax
        fnop
%endif

        fstp qword [ecx+ELM25]
        fxch st3
        fstp qword [ecx+ELM26]
        fxch st1
        fstp qword [ecx+ELM27]
        fstp qword [ecx+ELM28]
        fstp qword [ecx+ELM29]
        fstp qword [ecx+ELM30]

        sub ebx,edi                     ;next column of B
        mov eax,[esp+4]                 ;reset eax
        add ecx,[esp+12]                ;next column of C (+ldc*8)
        dec dword [esp+8]               ;dec counter
        jnz near loopj_

end_:
        femms
        pop ebp
        add esp,byte 5*4                ;remove local variables
        pop edi                         ;restore registers
        pop esi
        pop ebx
        leave                           ;mov esp,ebp / pop ebp
        ret

--- NEW FILE: ATL_dJIK30x30x30TN30x30x0_a1_b1.cfg ---
;
; ATL_dJIK30x30x30TN30x30x0_a1_b1.cfg
;
; ATLAS "Speed of Light" DGEMM() kernel for AMD Athlon
; Code author: Julian Ruhe (ruh...@li... | Jul...@t-...)
;
;define PREB_DIS to disable prefetching of B
;define PREB_DST1 to set B-prefetching distance to 1
;define PREB_DST2 to set B-prefetching distance to 2
;define PREB_DST3 to set B-prefetching distance to 3
;define PREB_DST4 to set B-prefetching distance to 4
;define PREB_SPLIT to use an alternative prefetching strategy
;
;define PREA_EN to enable prefetching of A+1
;define PREA_DIS to disable prefetching of A+1
;
;All these macros can be defined in the NASM command line, if no %defines are active in this file
;
;Defaults: PREA_EN unless PREA_DIS is given; PREB_DST2 unless PREB_DIS,
;PREB_SPLIT or one of PREB_DST1..4 is given. PREB_SPLIT suppresses the
;default so that the alternative strategy replaces the PREB_DST2 prefetch
;block instead of being emitted in addition to it. Define both
;PREB_SPLIT and a PREB_DSTn explicitly to get both.
;
%ifndef PREA_EN
%ifndef PREA_DIS
%define PREA_EN
%endif
%endif

%ifndef PREB_DIS
%ifndef PREB_SPLIT
%ifndef PREB_DST1
%ifndef PREB_DST2
%ifndef PREB_DST3
%ifndef PREB_DST4
%define PREB_DST2
%endif
%endif
%endif
%endif
%endif
%endif

--- NEW FILE: ATL_dJIK30x30x30TN30x30x0_a1_bX.asm ---
;
; ATL_dJIK30x30x30TN30x30x0_a1_bX.asm
;
; ATLAS "Speed of Light" DGEMM() kernel for AMD Athlon
; Code author: Julian Ruhe (ruh...@li... | Jul...@t-...)
;
; void ATL_dJIK30x30x30TN30x30x0_a1_bX(const int M, const int N, const int K, const double alpha,
;                                      const double *A, const int lda, const double *B, const int ldb,
;                                      const double beta, double *C, const int ldc)
;
; Compile with "nasmw -f win32 -DWIN32 ATL_dJIK30x30x30TN30x30x0_a1_bX.asm" (Windows)
; Compile with "nasm -f elf -DELF ATL_dJIK30x30x30TN30x30x0_a1_bX.asm" (LINUX)
;
; See config file (ATL_dJIK30x30x30TN30x30x0_a1_bX.cfg) for important macro definitions
;
;-----------------------------------------------------------------------
; Syntax: NASM (Intel operand order), 32-bit code, stack-based C arguments.
;
; Arguments actually read (relative to ebp after the prologue):
;   [ebp+28] A   [ebp+36] B   [ebp+44] beta   [ebp+52] C   [ebp+56] ldc
; M, N, K, alpha, lda and ldb are never read by this variant. Every group
; loads six C values and multiplies them by beta before the OPERATION
; sequence runs; the results are stored back to the same six C elements.
;
; beta is copied into the static variable "beta" in section .data at
; entry, because ebp is reused as an offset register inside the loop and
; the argument can no longer be addressed through it.
; NOTE(review): that static makes the routine non-reentrant (concurrent
; calls with different beta values race on it) and adds an absolute data
; reference to the code. It is left unchanged here because the
; instruction sizes in the loop look hand-tuned.
;
; Stack frame while the loop runs:
;   [esp+ 0] saved frame pointer (ebp)
;   [esp+ 4] &A + 5*NB*8 + 15*8          value eax is reset to per column
;   [esp+ 8] loop counter, starts at NB
;   [esp+12] 8*ldc                       byte stride between columns of C
;   [esp+16] (&A + NB*NB*8) - &B         added to ebx for the A+1 prefetch
;   [esp+20] t1                          scratch slot used to spill edx
;
; Registers inside the loop:
;   eax  pointer into A, advanced by edx after every group of six results
;   ebx  pointer into the current column of B (biased by 15*8)
;   ecx  pointer into the current column of C (biased by 15*8)
;   edx  6*NB*8
;   edi  -1*NB*8, esi -3*NB*8, ebp -5*NB*8
;        (presumably consumed by the DOTPn definitions in the .mcr file)
;
; NB, ELMn, DOTPn and OPERATION come from ATL_dJIK30x30x30TN30x30x0_a1.mcr.
; NOTE(review): the rep prefixes, nop, fnop, "mov edx,edx" and
; "mov eax,eax" look like deliberate padding for instruction alignment;
; do not remove or reorder them without re-timing the kernel.
;-----------------------------------------------------------------------
%include "ATL_dJIK30x30x30TN30x30x0_a1_bX.cfg"
%include "ATL_dJIK30x30x30TN30x30x0_a1.mcr"

%ifdef WIN32
global _ATL_dJIK30x30x30TN30x30x0_a1_bX
section .text
_ATL_dJIK30x30x30TN30x30x0_a1_bX:
%endif

%ifdef ELF
global ATL_dJIK30x30x30TN30x30x0_a1_bX
section .text
ATL_dJIK30x30x30TN30x30x0_a1_bX:
%endif

        push ebp
        mov ebp,esp
        push ebx
        push esi
        push edi
        femms

        ; stash beta while ebp still addresses the arguments
        fld qword [ebp+44]
        fstp qword [beta]

        mov eax,0                       ;temporary variable t1
        push eax                        ;t1->stack
        mov eax,[ebp+28]                ;&A->eax
        add eax,NB*NB*8                 ;&A+1->eax
        mov ebx,[ebp+36]                ;&B->ebx
        sub eax,ebx                     ;calculate offset
        push eax                        ;&A+1+offset->stack
        mov eax,[ebp+56]                ;ldc->eax
        lea eax,[8*eax]
        push eax                        ;8*ldc->stack
        mov eax,NB
        push eax                        ;loop counter->stack
        mov eax,[ebp+28]                ;&A->eax
        mov ebx,[ebp+36]                ;&B->ebx
        mov ecx,[ebp+52]                ;&C->ecx
        add ecx,byte 15*8               ;calculate offsets
        add ebx,byte 15*8
        add eax,5*NB*8+15*8
        push eax                        ;&A+offset->stack
        push ebp                        ;ebp->stack
        mov edi,-1*NB*8                 ;calculate offsets for dot products
        mov esi,-3*NB*8
        mov ebp,-5*NB*8
        mov edx,6*NB*8                  ;offset for the next 6 dot products

        align 16
loopj_:
; ---- group 1: C elements ELM1..ELM6 are scaled by beta, then updated ----
        fld qword [ecx+ELM1]
        fld qword [ecx+ELM2]
        mov edx,edx
        fld qword [beta]
        fmul st2,st0
        fmul st1,st0
        fld qword [ebx+ELM1]
        fld qword [ecx+ELM5]
        rep fmul st0,st2
        fld qword [ecx+ELM6]
        fmul st0,st3
        fld qword [ecx+ELM4]
        fmul st0,st4
        fld qword [ecx+ELM3]
        rep fmulp st5,st0
        rep fxch st0,st3
        mov edx,edx

        OPERATION 1,2                   ;01+1
        OPERATION 2,3                   ;02+1
        OPERATION 3,4                   ;03+1
        OPERATION 4,5                   ;04+1
        OPERATION 5,6                   ;05+1
        OPERATION 6,7                   ;06+1
        OPERATION 7,8                   ;07+1
        OPERATION 8,9                   ;08+1
        OPERATION 9,10                  ;09+1
        OPERATION 10,11                 ;10+1
        OPERATION 11,12                 ;11+1
        OPERATION 12,13                 ;12+1
        OPERATION 13,14                 ;13+1
        OPERATION 14,15                 ;14+1
        OPERATION 15,16                 ;15+1
        OPERATION 16,17                 ;16+1
        OPERATION 17,18                 ;17+1
        OPERATION 18,19                 ;18+1
        OPERATION 19,20                 ;19+1
        OPERATION 20,21                 ;20+1
        OPERATION 21,22                 ;21+1
        OPERATION 22,23                 ;22+1
        OPERATION 23,24                 ;23+1
        OPERATION 24,25                 ;24+1
        OPERATION 25,26                 ;25+1
        OPERATION 26,27                 ;26+1
        OPERATION 27,28                 ;27+1
        OPERATION 28,29                 ;28+1
        OPERATION 29,30                 ;29+1

        fld qword [eax+DOTP1+ELM30]     ;30+1
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
%endif

%ifdef PREB_DST4
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
        prefetch [ebx+30*8-1*64]
        nop
        prefetch [ebx+30*8+0*64]
        nop
        prefetch [ebx+30*8+1*64]
        nop
        prefetch [ebx+30*8+2*64]
        nop
%endif

        fstp qword [ecx+ELM1]
        fxch st3
        fstp qword [ecx+ELM2]
        fxch st1
        fstp qword [ecx+ELM3]
        fstp qword [ecx+ELM4]
        fstp qword [ecx+ELM5]
        fstp qword [ecx+ELM6]

        add eax,edx

; ---- group 2: C elements ELM7..ELM12 ----
        fld qword [ecx+ELM7]
        fld qword [ecx+ELM8]
        mov edx,edx
        fld qword [beta]
        fmul st2,st0
        fmul st1,st0
        fld qword [ebx+ELM1]
        fld qword [ecx+ELM11]
        rep fmul st0,st2
        fld qword [ecx+ELM12]
        fmul st0,st3
        fld qword [ecx+ELM10]
        fmul st0,st4
        fld qword [ecx+ELM9]
        rep fmulp st5,st0
        rep fxch st0,st3
        mov edx,edx

        OPERATION 1,2                   ;01+2
        OPERATION 2,3                   ;02+2
        OPERATION 3,4                   ;03+2
        OPERATION 4,5                   ;04+2
        OPERATION 5,6                   ;05+2
        OPERATION 6,7                   ;06+2
        OPERATION 7,8                   ;07+2
        OPERATION 8,9                   ;08+2
        OPERATION 9,10                  ;09+2
        OPERATION 10,11                 ;10+2
        OPERATION 11,12                 ;11+2
        OPERATION 12,13                 ;12+2
        OPERATION 13,14                 ;13+2
        OPERATION 14,15                 ;14+2
        OPERATION 15,16                 ;15+2
        OPERATION 16,17                 ;16+2
        OPERATION 17,18                 ;17+2
        OPERATION 18,19                 ;18+2
        OPERATION 19,20                 ;19+2
        OPERATION 20,21                 ;20+2
        OPERATION 21,22                 ;21+2
        OPERATION 22,23                 ;22+2
        OPERATION 23,24                 ;23+2
        OPERATION 24,25                 ;24+2
        OPERATION 25,26                 ;25+2
        OPERATION 26,27                 ;26+2
        OPERATION 27,28                 ;27+2
        OPERATION 28,29                 ;28+2
        OPERATION 29,30                 ;29+2

        fld qword [eax+DOTP1+ELM30]     ;30+2
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8-1*64]
        nop
%endif

%ifdef PREB_DST3
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
        prefetch [ebx+30*8-1*64]
        nop
        prefetch [ebx+30*8+0*64]
        nop
        prefetch [ebx+30*8+1*64]
        nop
        prefetch [ebx+30*8+2*64]
        nop
%endif

        fstp qword [ecx+ELM7]
        fxch st3
        fstp qword [ecx+ELM8]
        fxch st1
        fstp qword [ecx+ELM9]
        fstp qword [ecx+ELM10]
        fstp qword [ecx+ELM11]
        fstp qword [ecx+ELM12]

        add eax,edx

; ---- group 3: C elements ELM13..ELM18 ----
        fld qword [ecx+ELM13]
        fld qword [ecx+ELM14]
        mov edx,edx
        fld qword [beta]
        fmul st2,st0
        fmul st1,st0
        fld qword [ebx+ELM1]
        fld qword [ecx+ELM17]
        rep fmul st0,st2
        fld qword [ecx+ELM18]
        fmul st0,st3
        rep fld qword [ecx+ELM16]
        fmul st0,st4
        fld qword [ecx+ELM15]
        rep fmulp st5,st0
        rep fxch st0,st3
        mov edx,edx

        OPERATION 1,2                   ;01+3
        OPERATION 2,3                   ;02+3
        OPERATION 3,4                   ;03+3
        OPERATION 4,5                   ;04+3
        OPERATION 5,6                   ;05+3
        OPERATION 6,7                   ;06+3
        OPERATION 7,8                   ;07+3
        OPERATION 8,9                   ;08+3
        OPERATION 9,10                  ;09+3
        OPERATION 10,11                 ;10+3
        OPERATION 11,12                 ;11+3
        OPERATION 12,13                 ;12+3
        OPERATION 13,14                 ;13+3
        OPERATION 14,15                 ;14+3
        OPERATION 15,16                 ;15+3
        OPERATION 16,17                 ;16+3
        OPERATION 17,18                 ;17+3
        OPERATION 18,19                 ;18+3
        OPERATION 19,20                 ;19+3
        OPERATION 20,21                 ;20+3
        OPERATION 21,22                 ;21+3
        OPERATION 22,23                 ;22+3
        OPERATION 23,24                 ;23+3
        OPERATION 24,25                 ;24+3
        OPERATION 25,26                 ;25+3
        OPERATION 26,27                 ;26+3
        OPERATION 27,28                 ;27+3
        OPERATION 28,29                 ;28+3
        OPERATION 29,30                 ;29+3

        fld qword [eax+DOTP1+ELM30]     ;30+3
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8+0*64]
        nop
%endif

%ifdef PREB_DST2
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
        prefetch [ebx+30*8-1*64]
        nop
        prefetch [ebx+30*8+0*64]
        nop
        prefetch [ebx+30*8+1*64]
        nop
        prefetch [ebx+30*8+2*64]
        nop
%endif

        fstp qword [ecx+ELM13]
        fxch st3
        fstp qword [ecx+ELM14]
        rep fxch st1
        fstp qword [ecx+ELM15]
        fstp qword [ecx+ELM16]
        fstp qword [ecx+ELM17]
        fstp qword [ecx+ELM18]

        add eax,edx

; ---- group 4: C elements ELM19..ELM24 ----
        fld qword [ecx+ELM19]
        fld qword [ecx+ELM20]
        mov edx,edx
        fld qword [beta]
        fmul st2,st0
        fmul st1,st0
        fld qword [ebx+ELM1]
        fld qword [ecx+ELM23]
        rep fmul st0,st2
        fld qword [ecx+ELM24]
        fmul st0,st3
        fld qword [ecx+ELM22]
        fmul st0,st4
        fld qword [ecx+ELM21]
        rep fmulp st5,st0
        rep fxch st0,st3
        mov edx,edx

        OPERATION 1,2                   ;01+4
        OPERATION 2,3                   ;02+4
        OPERATION 3,4                   ;03+4
        OPERATION 4,5                   ;04+4
        OPERATION 5,6                   ;05+4
        OPERATION 6,7                   ;06+4
        OPERATION 7,8                   ;07+4
        OPERATION 8,9                   ;08+4
        OPERATION 9,10                  ;09+4
        OPERATION 10,11                 ;10+4
        OPERATION 11,12                 ;11+4
        OPERATION 12,13                 ;12+4
        OPERATION 13,14                 ;13+4
        OPERATION 14,15                 ;14+4
        OPERATION 15,16                 ;15+4
        OPERATION 16,17                 ;16+4
        OPERATION 17,18                 ;17+4
        OPERATION 18,19                 ;18+4
        OPERATION 19,20                 ;19+4
        OPERATION 20,21                 ;20+4
        OPERATION 21,22                 ;21+4
        OPERATION 22,23                 ;22+4
        OPERATION 23,24                 ;23+4
        OPERATION 24,25                 ;24+4
        OPERATION 25,26                 ;25+4
        OPERATION 26,27                 ;26+4
        OPERATION 27,28                 ;27+4
        OPERATION 28,29                 ;28+4
        OPERATION 29,30                 ;29+4

        fld qword [eax+DOTP1+ELM30]     ;30+4
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8+1*64]
        nop
%endif

%ifdef PREB_DST1
        prefetch [ebx+30*8-2*64]
        fnop
        mov edx,edx
        prefetch [ebx+30*8-1*64]
        nop
        prefetch [ebx+30*8+0*64]
        nop
        prefetch [ebx+30*8+1*64]
        nop
        prefetch [ebx+30*8+2*64]
        nop
%endif

        fstp qword [ecx+ELM19]
        fxch st3
        fstp qword [ecx+ELM20]
        fxch st1
        fstp qword [ecx+ELM21]
        fstp qword [ecx+ELM22]
        fstp qword [ecx+ELM23]
        fstp qword [ecx+ELM24]

        add eax,edx

; ---- group 5: C elements ELM25..ELM30 ----
        fld qword [ecx+ELM25]
        fld qword [ecx+ELM26]
        mov edx,edx
        fld qword [beta]
        fmul st2,st0
        fmul st1,st0
        fld qword [ebx+ELM1]
        fld qword [ecx+ELM29]
        rep fmul st0,st2
        fld qword [ecx+ELM30]
        fmul st0,st3
        fld qword [ecx+ELM28]
        fmul st0,st4
        fld qword [ecx+ELM27]
        rep fmulp st5,st0
        rep fxch st0,st3
        mov edx,edx

        OPERATION 1,2                   ;01+5
        OPERATION 2,3                   ;02+5
        OPERATION 3,4                   ;03+5
        OPERATION 4,5                   ;04+5
        OPERATION 5,6                   ;05+5
        OPERATION 6,7                   ;06+5
        OPERATION 7,8                   ;07+5
        OPERATION 8,9                   ;08+5
        OPERATION 9,10                  ;09+5
        OPERATION 10,11                 ;10+5
        OPERATION 11,12                 ;11+5
        OPERATION 12,13                 ;12+5
        OPERATION 13,14                 ;13+5
        OPERATION 14,15                 ;14+5
        OPERATION 15,16                 ;15+5
        OPERATION 16,17                 ;16+5
        OPERATION 17,18                 ;17+5
        OPERATION 18,19                 ;18+5
        OPERATION 19,20                 ;19+5
        OPERATION 20,21                 ;20+5
        OPERATION 21,22                 ;21+5
        OPERATION 22,23                 ;22+5
        OPERATION 23,24                 ;23+5
        OPERATION 24,25                 ;24+5
        OPERATION 25,26                 ;25+5
        OPERATION 26,27                 ;26+5
        OPERATION 27,28                 ;27+5
        OPERATION 28,29                 ;28+5
        OPERATION 29,30                 ;29+5

        fld qword [eax+DOTP1+ELM30]     ;30+5
        fmul st0,st1
        faddp st7
        fld qword [eax+DOTP2+ELM30]
        fmul st0,st1
        faddp st6
        fld qword [eax+DOTP3+ELM30]
        fmul st0,st1
        faddp st5
        fld qword [eax+DOTP4+ELM30]
        fmul st0,st1
        faddp st4
        fld qword [eax+DOTP5+ELM30]
        fmul st0,st1
        faddp st3
        rep fmul qword [eax+DOTP6+ELM30]
        faddp st1
        fxch st5

%ifdef PREB_SPLIT
        prefetch [ebx+30*8+2*64]
        nop
%endif

%ifdef PREA_EN
        ; edx is spilled to t1 so it can hold the prefetch address
        mov [esp+20],edx                ;save edx in t1
        mov edx,[esp+16]                ;&A+1->edx
        lea edx,[edx+ebx]
        prefetch [edx-2*64]
        nop
        prefetch [edx-1*64]
        prefetch [edx+0*64]
        nop
        prefetch [edx+1*64]
        prefetch [edx+2*64-8]
        mov edx,[esp+20]                ;restore edx
        mov eax,eax
        fnop
%endif

        fstp qword [ecx+ELM25]
        fxch st3
        fstp qword [ecx+ELM26]
        fxch st1
        fstp qword [ecx+ELM27]
        fstp qword [ecx+ELM28]
        fstp qword [ecx+ELM29]
        fstp qword [ecx+ELM30]

        sub ebx,edi                     ;next column of B
        mov eax,[esp+4]                 ;reset eax
        add ecx,[esp+12]                ;next column of C (+ldc*8)
        dec dword [esp+8]               ;dec counter
        jnz near loopj_

end_:
        femms
        pop ebp
        add esp,byte 5*4                ;remove local variables
        pop edi                         ;restore registers
        pop esi
        pop ebx
        leave                           ;mov esp,ebp / pop ebp
        ret

section .data

; 64 zero bytes precede the 8-byte aligned beta slot
times 64 db 0
align 8
beta dq 0.0

--- NEW FILE: ATL_dJIK30x30x30TN30x30x0_a1_bX.cfg ---
;
; ATL_dJIK30x30x30TN30x30x0_a1_bX.cfg
;
; ATLAS "Speed of Light" DGEMM() kernel for AMD Athlon
; Code author: Julian Ruhe (ruh...@li... | Jul...@t-...)
;
; Prefetch switches understood by the kernel
;
; Prefetching of B:
;   PREB_DIS     disable prefetching of B
;   PREB_DST1    set the B-prefetching distance to 1
;   PREB_DST2    set the B-prefetching distance to 2
;   PREB_DST3    set the B-prefetching distance to 3
;   PREB_DST4    set the B-prefetching distance to 4
;   PREB_SPLIT   use an alternative prefetching strategy
;
; Prefetching of A+1:
;   PREA_EN      enable prefetching of A+1
;   PREA_DIS     disable prefetching of A+1
;
; Each switch may be given on the NASM command line instead, as long as
; no %define for it is active in this file.
;
; Defaults chosen below when the caller made no choice:
;   PREA_EN      unless PREA_EN or PREA_DIS is already defined
;   PREB_DST2    unless PREB_DIS or one of PREB_DST1..PREB_DST4 is
;                already defined
;
; NOTE(review): PREB_SPLIT takes no part in the default selection, so
; defining only PREB_SPLIT still leaves PREB_DST2 defined as well --
; confirm that this combination is intended.
;

; --- prefetching of A+1: on by default --------------------------------
%ifdef PREA_EN
        ; already enabled explicitly, nothing to do
%elifdef PREA_DIS
        ; explicitly disabled, leave it off
%else
        %define PREA_EN
%endif

; --- prefetching of B: distance 2 by default --------------------------
%ifdef PREB_DIS
        ; B prefetching explicitly disabled, no default distance
%elifdef PREB_DST1
        ; distance 1 requested
%elifdef PREB_DST2
        ; distance 2 requested
%elifdef PREB_DST3
        ; distance 3 requested
%elifdef PREB_DST4
        ; distance 4 requested
%else
        %define PREB_DST2
%endif
--- NEW FILE: julian.base --- @skip @skip wrapper basefile to get headers right, and make extraction easy. @skip directly uses contributed code @skip @ifdef ! topd @define topd @/home/rwhaley/Base/SF@ @endifdef @ROUT ! Makefile @extract -b @(topd)/gen.inc what=crsetup @extract -b @(topd)/gen.inc what=acw -def author "Julian Ruhe" -def cdate "2001" @extract -b @(topd)/kernel/JulianRuhe/@(rout) @ROUT Makefile @define PM @PREB_DST@ all: @whiledef be 0 1 X @whiledef dst 4 3 2 1 nasm -f elf -DELF -D@(PM)@(dst) -o julian@(dst)_b@(be).o \ ATL_dJIK30x30x30TN30x30x0_a1_b@(be).asm @endwhile @endwhile be --- NEW FILE: julian2_b0.o --- ELF --- NEW FILE: julian2_b1.o --- ELF _ --- NEW FILE: julian2_bX.o --- ELF --- NEW FILE: julian2_win_b0.o --- L --- NEW FILE: julian2_win_b1.o --- L --- NEW FILE: julian2_win_bX.o --- L |