[Win32forth-cvs] win32forth/src/lib/fmacro mm_fw.f, 1.1, 1.2 mm_fw_fm.f, 1.2, 1.3

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/win32forth/win32forth/src/lib/fmacro
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv11171/src/lib/fmacro

Modified Files:
	mm_fw.f mm_fw_fm.f 
Log Message:
Jos: The improved 2^x thanks to Elko Tchernev
Note: The benchmark also shows that the problem with the last line in the console still exists.

Index: mm_fw_fm.f
===================================================================
RCS file: /cvsroot/win32forth/win32forth/src/lib/fmacro/mm_fw_fm.f,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** mm_fw_fm.f	27 Dec 2004 22:01:04 -0000	1.2
--- mm_fw_fm.f	7 Feb 2009 23:14:17 -0000	1.3
***************
*** 8,11 ****
--- 8,12 ----
  \ *               May 12th, 2003   J.v.d.Ven: Changed DDOT DAXPY do-WARNER() and DO-MAENO.
  \ *		  December 27th, 2004 J.v.d.Ven: Updated de results since Win32Forth and fmacro.f were updated
+ \ *               January 12th, 2009  J.v.d.Ven: Changed 2^x thanks to Elko Tchernev
  
  \ A changed MM benchmark from: http://home.iae.nl/users/mhx/mm.fw
***************
*** 16,24 ****
  CLK 400 MHz
  80x80 mm - normal algorithm                              5.36 MFlops,  74.19 ticks/flop,   0.190 s
! 80x80 mm - blocking, factor of 20                        3.46 MFlops, 114.85 ticks/flop,   0.295 s
  80x80 mm - transposed B matrix                           5.17 MFlops,  76.83 ticks/flop,   0.197 s
  80x80 mm - Robert's algorithm                            5.04 MFlops,  78.85 ticks/flop,   0.202 s
! 80x80 mm - T. Maeno's algorithm, subarray 20x20          3.19 MFlops, 124.60 ticks/flop,   0.320 s
! 80x80 mm - D. Warner's algorithm, subarray 20x20         2.66 MFlops, 149.06 ticks/flop,   0.383 s 
  
  ALL-TESTS \ Using Win32Forth Version: 6.09, fmacro.f date December 27th, 2004 and FSL-Utilities_1.04
--- 17,25 ----
  CLK 400 MHz
  80x80 mm - normal algorithm                              5.36 MFlops,  74.19 ticks/flop,   0.190 s
! ? 80x80 mm - blocking, factor of 20                        3.46 MFlops, 114.85 ticks/flop,   0.295 s
  80x80 mm - transposed B matrix                           5.17 MFlops,  76.83 ticks/flop,   0.197 s
  80x80 mm - Robert's algorithm                            5.04 MFlops,  78.85 ticks/flop,   0.202 s
! >80x80 mm - T. Maeno's algorithm, subarray 20x20          3.19 MFlops, 124.60 ticks/flop,   0.320 s
! 80x80 mm - D. Warner's algorithm, subarray 20x20         2.66 MFlops, 149.06 ticks/flop,   0.383 s
  
  ALL-TESTS \ Using Win32Forth Version: 6.09, fmacro.f date December 27th, 2004 and FSL-Utilities_1.04
***************
*** 30,34 ****
  80x80 mm - Robert's algorithm                           36.64 MFlops,  10.91 ticks/flop,   0.027 s
  80x80 mm - T. Maeno's algorithm, subarray 20x20         12.41 MFlops,  32.21 ticks/flop,   0.082 s
! 80x80 mm - D. Warner's algorithm, subarray 20x20        15.25 MFlops,  26.22 ticks/flop,   0.067 s 
  
  ))
--- 31,35 ----
  80x80 mm - Robert's algorithm                           36.64 MFlops,  10.91 ticks/flop,   0.027 s
  80x80 mm - T. Maeno's algorithm, subarray 20x20         12.41 MFlops,  32.21 ticks/flop,   0.082 s
! 80x80 mm - D. Warner's algorithm, subarray 20x20        15.25 MFlops,  26.22 ticks/flop,   0.067 s
  
  ))
***************
*** 194,198 ****
  [THEN]
  
! : 2^x  ( x -- 2^x ) 1 SWAP 0 ?DO  1 LSHIFT  LOOP ;
  
  CHAR x CONSTANT 'x'
--- 195,205 ----
  [THEN]
  
! : 2^X   ( x - 2^x )
!    dup 0<
!     if    0>
!     else  dup 31 >= abort" Out of range"
!           1 SWAP LSHIFT
!     then
!  ;
  
  CHAR x CONSTANT 'x'
***************
*** 348,352 ****
   *    the tuning guide indicates nb = 50 is reasonable for the
   *    ibm model 530 hence 25 should be reasonable for the 320
!  *    since the 320 has 32k rather than 64k of cache.      
   *    Inner loops unrolled to depth of 2
   *    The loop functions without clean up code at the end only
--- 355,359 ----
   *    the tuning guide indicates nb = 50 is reasonable for the
   *    ibm model 530 hence 25 should be reasonable for the 320
!  *    since the 320 has 32k rather than 64k of cache.
   *    Inner loops unrolled to depth of 2
   *    The loop functions without clean up code at the end only
***************
*** 412,421 ****
          .RESULT ;
  
! 0 [IF] =========================================================================== 
  Matrix Multiply tuned for SS-10/30;
   *                      Maeno Toshinori
   *                      Tokyo Institute of Technology
   *
!  * Using gcc-2.4.1 (-O2), this program ends in 12 seconds on SS-10/30. 
   *
   * in original algorithm - sub-area for cache tiling
--- 419,428 ----
          .RESULT ;
  
! 0 [IF] ===========================================================================
  Matrix Multiply tuned for SS-10/30;
   *                      Maeno Toshinori
   *                      Tokyo Institute of Technology
   *
!  * Using gcc-2.4.1 (-O2), this program ends in 12 seconds on SS-10/30.
   *
   * in original algorithm - sub-area for cache tiling
***************
*** 491,498 ****
          & bt{{ N N }}malloc
          &  c{{ N N }}malloc
!         &  d{{ N N }}malloc 
          SET-COEFFICIENTS
          FLUSH-CACHE
!         CASE 
           'n' OF NORMAL()    ENDOF
           't' OF TRANSPOSE() ENDOF
--- 498,505 ----
          & bt{{ N N }}malloc
          &  c{{ N N }}malloc
!         &  d{{ N N }}malloc
          SET-COEFFICIENTS
          FLUSH-CACHE
!         CASE
           'n' OF NORMAL()    ENDOF
           't' OF TRANSPOSE() ENDOF
***************
*** 556,561 ****
          CR .( Compile time = ) US? 1000 / DEC. .( ms, assuming clockspeed = ) PROCESSOR-CLOCK DEC. .( MHz.)
  
!   cr cr 
    ALL-TESTS
  \s
-