[Math-atlas-commits] CVS: AtlasBase/kernel/ClintWhaley ATL_dmm4x4x2pf_av.c,1.3,1.4
Brought to you by:
rwhaley,
tonyc040457
From: R. C. W. <rw...@us...> - 2003-04-24 17:33:57
|
Update of /cvsroot/math-atlas/AtlasBase/kernel/ClintWhaley
In directory sc8-pr-cvs1:/tmp/cvs-serv3610

Modified Files:
	ATL_dmm4x4x2pf_av.c
Log Message:
g4 speed increase

Index: ATL_dmm4x4x2pf_av.c
===================================================================
RCS file: /cvsroot/math-atlas/AtlasBase/kernel/ClintWhaley/ATL_dmm4x4x2pf_av.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** ATL_dmm4x4x2pf_av.c 23 Apr 2003 19:00:30 -0000 1.3
--- ATL_dmm4x4x2pf_av.c 24 Apr 2003 17:33:53 -0000 1.4
***************
*** 192,196 ****
  #if 0
  r3 r4 r5 r6-r7,f2
! void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
  (r6) r8 (r7) r9 (r8) r10 (r9) 56(r1)
  const TYPE *A, const int lda, const TYPE *B, const int ldb,
--- 192,196 ----
  #if 0
  r3 r4 r5 r6-r7,f2
! void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
  (r6) r8 (r7) r9 (r8) r10 (r9) 56(r1)
  const TYPE *A, const int lda, const TYPE *B, const int ldb,
***************
*** 312,316 ****
  addi ctlC, ctlC, 4
  slwi ctlC, ctlC, 16
! or ctlC, ctlC, pC1
  #ifdef DCPLX
  slwi incCn, incCn, 4
--- 312,316 ----
  addi ctlC, ctlC, 4
  slwi ctlC, ctlC, 16
! or ctlC, ctlC, pC1
  #ifdef DCPLX
  slwi incCn, incCn, 4
***************
*** 358,369 ****
  fmr rC10, ZERO
  lfd rA0, 0(pA0)
- lfd rB0, 0(pB0)
- lfd rA1, 0(pA1)
- lfd rA2, 0(pA2)
  fmr rC20, ZERO
  fmr rC30, ZERO
! addi pC0, pC0, incCm
  fmr rC01, ZERO
  fmr rC11, ZERO
  fmr rC21, ZERO
  fmr rC31, ZERO
--- 358,369 ----
  fmr rC10, ZERO
  lfd rA0, 0(pA0)
  fmr rC20, ZERO
+ lfd rB0, 0(pB0)
  fmr rC30, ZERO
! lfd rA1, 0(pA1)
  fmr rC01, ZERO
+ lfd rA2, 0(pA2)
  fmr rC11, ZERO
+ addi pC0, pC0, incCm
  fmr rC21, ZERO
  fmr rC31, ZERO
***************
*** 372,383 ****
  fmr rC12, ZERO
  lfd rA3, 0(pA3)
- lfd rB1, 0(pB1)
- lfd rB2, 0(pB2)
- lfd rB3, 0(pB3)
  fmr rC22, ZERO
  fmr rC32, ZERO
  addi pC2, pC2, incCm
  fmr rC03, ZERO
  fmr rC13, ZERO
  fmr rC23, ZERO
  addi pC3, pC3, incCm
--- 372,383 ----
  fmr rC12, ZERO
  lfd rA3, 0(pA3)
  fmr rC22, ZERO
+ lfd rB1, 0(pB1)
  fmr rC32, ZERO
  addi pC2, pC2, incCm
  fmr rC03, ZERO
+ lfd rB2, 0(pB2)
  fmr rC13, ZERO
+ lfd rB3, 0(pB3)
  fmr rC23, ZERO
  addi pC3, pC3, incCm
***************
*** 417,440 ****
  #ifdef BETAX
  fmul rC00, rC00, beta
- fmul rC10, rC10, beta
- fmul rC20, rC20, beta
  lfd rA0, 0(pA0)
  lfd rB0, 0(pB0)
  lfd rA1, 0(pA1)
  fmul rC30, rC30, beta
- fmul rC01, rC01, beta
- fmul rC11, rC11, beta
  lfd rA2, 0(pA2)
  lfd rA3, 0(pA3)
  lfd rB1, 0(pB1)
  fmul rC21, rC21, beta
  fmul rC31, rC31, beta
  fmul rC02, rC02, beta
- lfd rB2, 0(pB2)
  lfd rB3, 0(pB3)
  fmul rC12, rC12, beta
  fmul rC22, rC22, beta
  fmul rC32, rC32, beta
- mtctr stK
  fmul rC03, rC03, beta
  fmul rC13, rC13, beta
--- 417,440 ----
  #ifdef BETAX
  fmul rC00, rC00, beta
  lfd rA0, 0(pA0)
+ fmul rC10, rC10, beta
  lfd rB0, 0(pB0)
+ fmul rC20, rC20, beta
  lfd rA1, 0(pA1)
  fmul rC30, rC30, beta
  lfd rA2, 0(pA2)
+ fmul rC01, rC01, beta
  lfd rA3, 0(pA3)
+ fmul rC11, rC11, beta
  lfd rB1, 0(pB1)
  fmul rC21, rC21, beta
+ lfd rB2, 0(pB2)
  fmul rC31, rC31, beta
  fmul rC02, rC02, beta
  lfd rB3, 0(pB3)
  fmul rC12, rC12, beta
+ mtctr stK
  fmul rC22, rC22, beta
  fmul rC32, rC32, beta
  fmul rC03, rC03, beta
  fmul rC13, rC13, beta
***************
*** 460,473 ****
  KLOOP:
  fmadd rC00, rA0, rB0, rC00
- fmadd rC10, rA1, rB0, rC10
- fmadd rC20, rA2, rB0, rC20
  lfd ra0, 8(pA0)
  lfd ra1, 8(pA1)
  lfd ra2, 8(pA2)
  fmadd rC30, rA3, rB0, rC30
  fmadd rC01, rA0, rB1, rC01
  fmadd rC11, rA1, rB1, rC11
  lfd rB0, 8(pB0)
- lfd ra3, 8(pA3)
  fmadd rC21, rA2, rB1, rC21
  fmadd rC31, rA3, rB1, rC31
--- 460,473 ----
  KLOOP:
  fmadd rC00, rA0, rB0, rC00
  lfd ra0, 8(pA0)
+ fmadd rC10, rA1, rB0, rC10
  lfd ra1, 8(pA1)
+ fmadd rC20, rA2, rB0, rC20
  lfd ra2, 8(pA2)
  fmadd rC30, rA3, rB0, rC30
  fmadd rC01, rA0, rB1, rC01
+ lfd ra3, 8(pA3)
  fmadd rC11, rA1, rB1, rC11
  lfd rB0, 8(pB0)
  fmadd rC21, rA2, rB1, rC21
  fmadd rC31, rA3, rB1, rC31
***************
*** 485,498 ****
  fmadd rC00, ra0, rB0, rC00
- fmadd rC10, ra1, rB0, rC10
  lfd rB3, 8(pB3)
  lfdu rA1, 16(pA1)
- lfdu rA2, 16(pA2)
  fmadd rC20, ra2, rB0, rC20
  fmadd rC30, ra3, rB0, rC30
  fmadd rC01, ra0, rB1, rC01
  lfdu rB0, 16(pB0)
- lfdu rA3, 16(pA3)
  fmadd rC11, ra1, rB1, rC11
  fmadd rC21, ra2, rB1, rC21
  fmadd rC31, ra3, rB1, rC31
--- 485,498 ----
  fmadd rC00, ra0, rB0, rC00
  lfd rB3, 8(pB3)
+ fmadd rC10, ra1, rB0, rC10
  lfdu rA1, 16(pA1)
  fmadd rC20, ra2, rB0, rC20
  fmadd rC30, ra3, rB0, rC30
+ lfdu rA2, 16(pA2)
  fmadd rC01, ra0, rB1, rC01
  lfdu rB0, 16(pB0)
  fmadd rC11, ra1, rB1, rC11
+ lfdu rA3, 16(pA3)
  fmadd rC21, ra2, rB1, rC21
  fmadd rC31, ra3, rB1, rC31
***************
*** 628,632 ****
  */
  add pA0, pA0, incAn
! dst pA0, ctlB, 1
  dstst pC0, ctlC, 3
  /*
--- 628,632 ----
  */
  add pA0, pA0, incAn
! dst pA0, ctlB, 1
  dstst pC0, ctlC, 3
  /*
***************
*** 669,675 ****
  addi r1, r1, FSIZE
  #else
! stw r13, 44+IROFF(r1)
! lwz r13, IROFF(r1)
! lwz r14, 4+IROFF(r1)
  lwz r0, 8(r1)
  mtlr r0
--- 669,673 ----
  addi r1, r1, FSIZE
  #else
! lwz r13, 44+IROFF(r1)
  lwz r0, 8(r1)
  mtlr r0
|