[inq-commits] SF.net SVN: inq: [467] trunk/client
From: <sta...@us...> - 2008-01-28 10:45:24
Revision: 467 http://inq.svn.sourceforge.net/inq/?rev=467&view=rev Author: stargrave Date: 2008-01-28 02:45:30 -0800 (Mon, 28 Jan 2008) Log Message: ----------- * Removed unneded test/bench directory creation * Added BYTEmark-based benchmark Modified Paths: -------------- trunk/client/Makefile trunk/client/lib/Makefile Added Paths: ----------- trunk/client/lib/bytemark/ trunk/client/lib/bytemark/COM.DAT trunk/client/lib/bytemark/Makefile trunk/client/lib/bytemark/NNET.DAT trunk/client/lib/bytemark/README trunk/client/lib/bytemark/bdoc.txt trunk/client/lib/bytemark/debugbit.good.gz trunk/client/lib/bytemark/emfloat.c trunk/client/lib/bytemark/emfloat.h trunk/client/lib/bytemark/hardware.c trunk/client/lib/bytemark/hardware.h trunk/client/lib/bytemark/hello.c trunk/client/lib/bytemark/misc.c trunk/client/lib/bytemark/misc.h trunk/client/lib/bytemark/nbench0.c trunk/client/lib/bytemark/nbench0.h trunk/client/lib/bytemark/nbench1.c trunk/client/lib/bytemark/nbench1.h trunk/client/lib/bytemark/nmglobal.h trunk/client/lib/bytemark/pointer.c trunk/client/lib/bytemark/sysinfo.c.example trunk/client/lib/bytemark/sysinfo.c.template trunk/client/lib/bytemark/sysinfo.sh trunk/client/lib/bytemark/sysinfoc.c.example trunk/client/lib/bytemark/sysinfoc.c.template trunk/client/lib/bytemark/sysspec.c trunk/client/lib/bytemark/sysspec.h trunk/client/lib/bytemark/wordcat.h trunk/client/test/bytemark Modified: trunk/client/Makefile =================================================================== --- trunk/client/Makefile 2008-01-28 10:26:35 UTC (rev 466) +++ trunk/client/Makefile 2008-01-28 10:45:30 UTC (rev 467) @@ -26,8 +26,7 @@ $(DESTDIR)$(SHARE_DIR)/test \ $(DESTDIR)$(SHARE_DIR)/monitoring \ $(DESTDIR)$(SHARE_DIR)/self-id \ - $(DESTDIR)$(SHARE_DIR)/test/additional \ - $(DESTDIR)$(SHARE_DIR)/test/bench + $(DESTDIR)$(SHARE_DIR)/test/additional find detect -type f -and ! -path '*.svn*' | xargs install -m644 -t $(DESTDIR)$(SHARE_DIR)/detect/ find test -type f -and ! -path '*.svn*' | xargs install -m755 -t $(DESTDIR)$(SHARE_DIR)/test/ find test/additional -type f -and ! 
-path '*.svn*' | xargs install -m755 -t $(DESTDIR)$(SHARE_DIR)/test/additional/ Modified: trunk/client/lib/Makefile =================================================================== --- trunk/client/lib/Makefile 2008-01-28 10:26:35 UTC (rev 466) +++ trunk/client/lib/Makefile 2008-01-28 10:45:30 UTC (rev 467) @@ -1,7 +1,7 @@ .EXPORT_ALL_VARIABLES: #SUBDIRS=alert hide-mouse memtest pci-patch wait-serial watchdog inquisitor thermometer -SUBDIRS=watchdog wait-string einarc stream-mem whetstone dhrystone +SUBDIRS=watchdog wait-string einarc stream-mem whetstone dhrystone bytemark include ../../Makefile.config Added: trunk/client/lib/bytemark/COM.DAT =================================================================== --- trunk/client/lib/bytemark/COM.DAT (rev 0) +++ trunk/client/lib/bytemark/COM.DAT 2008-01-28 10:45:30 UTC (rev 467) @@ -0,0 +1,11 @@ +ALLSTATS=T +DONUMSORT=T +DOSTRINGSORT=T +DOBITFIELD=T +DOEMF=T +DOFOUR=T +DOASSIGN=T +DOIDEA=T +DOHUFF=T +DONNET=T +DOLU=T Added: trunk/client/lib/bytemark/Makefile =================================================================== --- trunk/client/lib/bytemark/Makefile (rev 0) +++ trunk/client/lib/bytemark/Makefile 2008-01-28 10:45:30 UTC (rev 467) @@ -0,0 +1,64 @@ +include ../../../Makefile.config + +default: nbench +CC = gcc +CFLAGS = -s -static -Wall -O3 -fomit-frame-pointer -funroll-loops +MACHINE= +DEFINES= -DLINUX $(NO_UNAME) + +all: nbench + +sysinfoc.c: Makefile + ./sysinfo.sh $(CC) $(MACHINE) $(DEFINES) $(CFLAGS) + +sysinfo.c: Makefile + ./sysinfo.sh $(CC) $(MACHINE) $(DEFINES) $(CFLAGS) + +hardware.o: hardware.c hardware.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c hardware.c + +nbench0.o: nbench0.h nbench0.c nmglobal.h pointer.h hardware.h\ + Makefile sysinfo.c sysinfoc.c + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c nbench0.c + +emfloat.o: emfloat.h emfloat.c nmglobal.h pointer.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c emfloat.c + +pointer.h: pointer Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -o pointer pointer.c + rm -f pointer.h + if [ "4" = `./pointer` ] ; then touch pointer.h ;\ + else echo "#define LONG64" >pointer.h ; fi + +misc.o: misc.h misc.c Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c misc.c + +nbench1.o: nbench1.h nbench1.c wordcat.h nmglobal.h pointer.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c nbench1.c + +sysspec.o: sysspec.h sysspec.c nmglobal.h pointer.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c sysspec.c + +nbench: emfloat.o misc.o nbench0.o nbench1.o sysspec.o hardware.o + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS) $(LINKFLAGS)\ + emfloat.o misc.o nbench0.o nbench1.o sysspec.o hardware.o\ + -o nbench -lm +clean: + - /bin/rm -f *.o *~ \#* core a.out hello sysinfo.c sysinfoc.c \ + bug pointer pointer.h debugbit.dat + +mrproper: clean + - /bin/rm -f nbench + +install: + mkdir -p $(DESTDIR)$(BIN_DIR) $(DESTDIR)$(SHARE_DIR)/test/additional + install -m755 nbench $(DESTDIR)$(BIN_DIR)/ + cp NNET.DAT $(DESTDIR)$(SHARE_DIR)/test/additional/ + cp COM.DAT $(DESTDIR)$(SHARE_DIR)/test/additional/ Added: trunk/client/lib/bytemark/NNET.DAT =================================================================== --- trunk/client/lib/bytemark/NNET.DAT (rev 0) +++ trunk/client/lib/bytemark/NNET.DAT 2008-01-28 10:45:30 UTC (rev 467) @@ -0,0 +1,210 @@ +5 7 8 +26 +0 0 1 0 0 +0 1 0 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 0 0 0 0 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +0 1 0 0 0 0 1 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 
0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 0 0 1 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +0 1 0 0 0 1 0 0 +1 1 1 1 1 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 1 1 +0 1 0 0 0 1 0 1 +1 1 1 1 1 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +0 1 0 0 0 1 1 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 1 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 0 1 1 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 0 1 0 0 0 +0 1 1 1 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 1 1 1 0 +0 1 0 0 1 0 0 1 +0 0 0 0 1 +0 0 0 0 1 +0 0 0 0 1 +0 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 1 0 1 0 +1 0 0 0 1 +1 0 0 1 0 +1 0 1 0 0 +1 1 0 0 0 +1 0 1 0 0 +1 0 0 1 0 +1 0 0 0 1 +0 1 0 0 1 0 1 1 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 1 1 +0 1 0 0 1 1 0 0 +1 0 0 0 1 +1 1 0 1 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 0 1 1 0 1 +1 0 0 0 1 +1 1 0 0 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 0 1 1 +1 0 0 0 1 +0 1 0 0 1 1 1 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 1 1 1 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +0 1 0 1 0 0 0 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 1 0 1 +1 0 0 1 1 +0 1 1 1 1 +0 1 0 1 0 0 0 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +1 0 1 0 0 +1 0 0 1 0 +1 0 0 0 1 +0 1 0 1 0 0 1 0 +0 1 1 1 1 +1 0 0 0 0 +1 0 0 0 0 +0 1 1 1 0 +0 0 0 0 1 +0 0 0 0 1 +1 1 1 1 0 +0 1 0 1 0 0 1 1 +1 1 1 1 1 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 1 0 1 0 1 0 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 1 0 1 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 1 0 +0 1 0 1 0 +0 1 0 1 0 +0 1 0 1 0 +0 0 1 0 0 +0 1 0 1 0 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 1 0 1 +0 1 0 1 0 +0 1 0 1 0 1 1 1 +1 0 0 0 1 +0 1 0 1 0 +0 1 0 1 0 +0 0 1 0 0 +0 1 0 1 0 +0 1 0 1 0 +1 0 0 0 1 +0 1 0 1 1 0 0 0 +1 0 0 0 1 +0 1 0 1 0 +0 1 0 1 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 1 0 1 1 0 0 1 +1 1 1 1 1 +0 0 0 1 0 +0 0 0 1 0 +0 0 1 0 0 +0 1 0 0 0 +0 1 0 0 0 +1 1 1 1 1 +0 1 0 1 1 0 1 0 Added: trunk/client/lib/bytemark/README =================================================================== --- trunk/client/lib/bytemark/README (rev 0) +++ trunk/client/lib/bytemark/README 2008-01-28 10:45:30 UTC (rev 467) @@ -0,0 +1,66 @@ +February 18, 2003 +----------------- +Bug-fix release. + +December 9, 1997 +---------------- +This release is based on beta release 2 of BYTE Magazine's BYTEmark +benchmark program (previously known as BYTE's Native Mode +Benchmarks). This document covers the Native Mode (a.k.a. Algorithm +Level) tests; benchmarks designed to expose the capabilities of a +system's CPU, FPU, and memory system. + +Running a "make" will create the binary if all goes well. It is called +"nbench" and performs a suite of 10 tests and compares the results to +a Dell Pentium 90 with 16 MB RAM and 256 KB L2 cache running MSDOS and +compiling with the Watcom 10.0 C/C++ compiler. If you define -DLINUX +during compilation (the default) then you also get a comparison to an +AMD K6/233 with 32 MB RAM and 512 KB L2-cache running Linux 2.0.32 and +using a binary which was compiled with GNU gcc version 2.7.2.3 and GNU +libc-5.4.38. + +For more verbose output specify -v as an argument. + +The primary web site is: http://www.tux.org/~mayer/linux/bmark.html + +The port to Linux/Unix was done by Uwe F. 
Mayer <ma...@tu...>. + +The index-split was done by Andrew D. Balsa, and reflects the +realization that memory management is important in CPU design. The +original tests have been left alone, however, the tests NUMERIC SORT, +FP EMULATION, IDEA, and HUFFMAN now constitute the integer-arithmetic +focused benchmark index, while the tests STRING SORT, BITFIELD, and +ASSIGNMENT make up the new memory index. + +The algorithms were not changed from the source which was obtained +from the BYTE web site at http://www.byte.com/bmark/bmark.htm on +December 14, 1996. However, the source was modified to better work +with 64-bit machines (in particular the random number generator was +modified to always work with 32 bit, no matter what kind of hardware +you run it on). Furthermore, for some of the algorithms additional +resettings of the data was added to increase the consistency across +different hardware. Some extra debugging code was added, which has no +impact on normal runs. + +In case there is uneven system load due to other processes while this +benchmark suite executes, it might take longer to run than on an +unloaded system. This is because the benchmark does some statistical +analysis to make sure that the reported results are statistically +significant, and an increased variation in individual runs requires +more runs to achieve the required statistical confidence. + +This is a single-threaded benchmark and is not designed to measure the +performance gain on multi-processor machines. + +For details and customization read bdoc.txt. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Added: trunk/client/lib/bytemark/bdoc.txt =================================================================== --- trunk/client/lib/bytemark/bdoc.txt (rev 0) +++ trunk/client/lib/bytemark/bdoc.txt 2008-01-28 10:45:30 UTC (rev 467) @@ -0,0 +1,2109 @@ +http://www.byte.com/bmark/bmark.htm +---------------------------------------------------------------------------- + +BYTEmark + +---------------------------------------------------------------------------- + +This is release 2 of BYTE Magazine's BYTEmark benchmark program (previously +known as BYTE's Native Mode Benchmarks). This document covers the Native +Mode (a.k.a. Algorithm Level) tests; benchmarks designed to expose the +capabilities of a system's CPU, FPU, and memory system. Another group of +benchmarks within the BYTEmark suite includes the Application Simulation +Benchmarks. They are detailed in a separate document. [NOTE: The +documentation for the Application simulation benchmarks should appear before +the end of March, 95. -- RG]. + +The Tests + +The Native Mode portion of the BYTEmark consists of a number of well-known +algorithms; some BYTE has used before in earlier versions of the benchmark, +others are new. The complete suite consists of 10 tests: + +Numeric sort - Sorts an array of 32-bit integers. 
+ +String sort - Sorts an array of strings of arbitrary length. + +Bitfield - Executes a variety of bit manipulation functions. + +Emulated floating-point - A small software floating-point package. + +Fourier coefficients - A numerical analysis routine for calculating series +approximations of waveforms. + +Assignment algorithm - A well-known task allocation algorithm. + +Huffman compression - A well-known text and graphics compression algorithm. + +IDEA encryption - A relatively new block cipher algorithm. + +Neural Net - A small but functional back-propagation network simulator. + +LU Decomposition - A robust algorithm for solving linear equations. + +A more complete description of each test can be found in later sections of +this document. + +BYTE built the BYTEmark with the multiplatform world foremost in mind. There +were, of course, other considerations that we kept high on the list: + +Real-world algorithms. The algorithms should actually do something. Previous +benchmarks often moved gobs of bytes from one point to another, added or +subtracted piles and piles of numbers, or (in some cases) actually executed +NOP instructions. We should not belittle those tests of yesterday, they had +their place. However, we think it better that tests be based on activities +that are more complex in nature. + +Easy to port. All the benchmarks are written in "vanilla" ANSI C. This +provides us with the best chance of moving them quickly and accurately to +new processors and operating systems as they appear. It also simplifies +maintenance. + +This means that as new 64-bit (and, perhaps, 128-bit) processors appear, the +benchmarks can test them as soon as a compiler is available. + +Comprehensive. The algorithms were derived from a variety of sources. Some +are routines that BYTE had been using for some time. Others are routines +derived from well-known texts in the computer science world. Furthermore, +the algorithms differ in structure. Some simply "walk" sequentially through +one-dimensional arrays. Others build and manipulate two-dimensional arrays. +Finally, some benchmarks are "integer" tests, while others exercise the +floating-point coprocessor (if one is available). + +Scalable. We wanted these benchmarks to be useful across as wide a variety +of systems as possible. We also wanted to give them a lifetime beyond the +next wave of new processors. + +To that end, we incorporated "dynamic workload adjustment." A complete +description of this appears in a later section. In a nutshell, this allows +the tests to "expand or contract" depending on the capabilities of the +system under test, all the while providing consistent results so that fair +and accurate comparisons are possible. + +Honesty In Advertising + +We'd be lying if we said that the BYTEmark was all the benchmarking that +anyone would ever need to run on a system. It would be equally inaccurate to +suggest that the tests are completely free of inadequacies. There are many +things the tests do not do, there are shortcomings, and there are problems. + +BYTE will continue to improve the BYTEmark. The source code is freely +available, and we encourage vendors and users to examine the routines and +provide us with their feedback. In this way, we assure fairness, +comprehensiveness, and accuracy. + +Still, as we mentioned, there are some shortcomings. Here are those we +consider the most significant. Keep them in mind as you examine the results +of the benchmarks now and in the future. + +At the mercy of C compilers. 
Being written in ANSI C, the benchmark program +is highly portable. This is a reflection of the "world we live in." If this +were a one-processor world, we might stand a chance at hand-crafting a +benchmark in assembly language. (At one time, that's exactly what BYTE did.) +Not today, no way. + +The upshot is that the benchmarks must be compiled. For broadest coverage, +we selected ANSI C. And when they're compiled, the resulting executable's +performance can be highly dependent on the capabilities of the C compiler. +Today's benchmark results can be blown out of the water tomorrow if someone +new enters the scene with an optimizing strategy that outperforms existing +competition. + +This concern is not easily waved off. It will require you to keep careful +track of compiler version and optimization switches. As BYTE builds its +database of benchmark results, version number and switch setting will become +an integral part of that data. This will be true for published information +as well, so that you can make comparisons fairly and accurately. BYTE will +control the distribution of test results so that all relevant compiler +information is attached to the data. + +As a faint justification -- for those who think this situation results in +"polluted" tests -- we should point out that we are in the same boat as all +the other developers (at least, all those using C compilers -- and that's +quite a sizeable group). If the only C compilers for a given system happen +to be poor ones, everyone suffers. It's a fact that a given platform's +ultimate potential depends as much on the development software available as +on the technical achievements of the hardware design. + +It's just CPU and FPU. It's very tempting to try to capture the performance +of a machine in a single number. That has never been possible -- though it's +been tried a lot -- and the gap between that ideal and reality will forever +widen. + +These benchmarks are meant to expose the theoretical upper limit of the CPU, +FPU, and memory architecture of a system. They cannot measure video, disk, +or network throughput (those are the domains of a different set of +benchmarks). You should, therefore, use the results of these tests as part, +not all, of any evaluation of a system. + +Single threaded. Currently, each benchmark test uses only a single execution +thread. It's unlikely that you'll find any modern operating system that does +not have some multitasking component. How a system "scales" as more tasks +are run simultaneously is an effect that the current benchmarks cannot +explore. + +BYTE is working on a future version of the tests that will solve this +problem. + +The tests are synthetic. This quite reasonable argument is based on the fact +that people don't run benchmarks for a living, they run applications. +Consequently, the only true measure of a system is how well it performs +whatever applications you will be running. This, in fact, is the philosophy +behind the BAPCo benchmarks. + +This is not a point with which we would disagree. BYTE regularly makes use +of a variety of application benchmarks. None of this suggests, however, that +the BYTEmark benchmarks serve no purpose. + +BYTEmark's results should be used as predictors. They can be moved to a new +platform long before native applications will be ported. The BYTEmark +benchmarks will therefore provide an early look at the potential of the +machine. Additionally, the BYTEmark permits you to "home in" on an aspect of +the overall architecture. 
How well does the system perform when executing +floating-point computations? Does its memory architecture help or hinder the +management of memory buffers that may fall on arbitrary address boundaries? +How does the cache work with a program whose memory access favors moving +randomly through memory as opposed to moving sequentially through memory? + +The answers to these questions can give you a good idea of how well a system +would support a particular class of applications. Only a synthetic benchmark +can give the narrow view necessary to find the answers. + +Dynamic Workloads + +Our long history of benchmarking has taught us one thing above all others: +Tomorrow's system will go faster than today's by an amount exceeding your +wildest guess -- and then some. Dealing with this can become an unending +race. + +It goes like this: You design a benchmark algorithm, you specify its +parameters (how big the array is, how many loops, etc.), you run it on +today's latest super-microcomputer, collect your data, and go home. A new +machine arrives the next day, you run your benchmark, and discover that the +test executes so quickly that the resolution of the clock routine you're +using can't keep up with it (i.e., the test is over and done before the +system clock even has a chance to tick). + +If you modify your routine, the figures you collected yesterday are no good. +If you create a better clock routine by sneaking down into the system +hardware, you can kiss portability goodbye. + +The BYTEmark benchmarks solve this problem by a process we'll refer to as +"dynamic workload adjustment." In principle, it simply means that if the +test runs so fast that the system clock can't time it, the benchmark +increases the test workload -- and keeps increasing it -- until enough time +is consumed to gather reliable test results. + +Here's an example. + +The BYTEmark benchmarks perform timing using a "stopwatch" paradigm. The +routine StartStopwatch() begins timing; StopStopwatch() ends timing and +reports the elapsed time in clock ticks. Now, "clock ticks" is a value that +varies from system to system. We'll presume that our test system provides +1000 clock ticks per second. (We'll also presume that the system actually +updates its clock 1000 times per second. Surprisingly, some systems don't do +that. One we know of will tell you that the clock provides 100 ticks per +second, but updates the clock in 5- or 6-tick increments. The resolution is +no better than somewhere around 1/18th of a second.) Here, when we say +"system" we mean not only the computer system, but the environment provided +by the C compiler. Interestingly, different C compilers for the same system +will report different clock ticks per second. + +Built into the benchmarks is a global variable called GLOBALMINTICKS. This +variable is the minimum number of clock ticks that the benchmark will allow +StopStopwatch() to report. + +Suppose you run the Numeric Sort benchmark. The benchmark program will +construct an array filled with random numbers, call StartStopwatch(), sort +the array, and call StopStopwatch(). If the time reported in StopStopwatch() +is less than GLOBALMINTICKS, then the benchmark will build two arrays, and +try again. If sorting two arrays took less time than GLOBALMINTICKS, the +process repeats with more arrays. + +This goes on until the benchmark makes enough work so that an interval +between StartStopwatch() and StopStopwatch() exceeds GLOBALMINTICKS. 
Once +that happens, the test is actually run, and scores are calculated. + +Notice that the benchmark didn't make bigger arrays, it made more arrays. +That's because the time taken by the sort test does not increase linearly as +the array grows, it increases by a factor of N*log(N) (where N is the size +of the array). + +This principle is applied to all the benchmark tests. A machine with a less +accurate clock may be forced to sort more arrays at a time, but the results +are given in arrays per second. In this way fast machines, slow machines, +machines with accurate clocks, machines with less accurate clocks, can all +be tested with the same code. + +Confidence Intervals + +Another built-in feature of the BYTEmark is a set of statistical-analysis +routines. Running benchmarks is one thing; the question arises as to how +many times should a test be run until you know you have a good sampling. +Also, can you determine whether the test is stable (i.e., do results vary +widely from one execution of the benchmark to the next)? + +The BYTEmark keeps score as follows: Each test (a test being a numeric +sort, a string sort, etc.) is run five times. These five scores are +averaged, the standard deviation is determined, and a 95% confidence +half-interval for the mean is calculated (using the student t +distribution). This tells us that the true average lies -- with a 95% +probability -- within plus or minus the confidence half-interval of +the calculated average. If this half-interval is within 5% of the +calculated average, the benchmarking stops. Otherwise, a new test is +run and the calculations are repeated with all of the runs done so +far, including the new one. The benchmark proceeds this way up to a +total of 30 runs. If the length of the half-interval is still bigger +than 5% of the calculated average then a warning issued that the +results might not be statistically certain before the average is +displayed. + +** Fixed a statistical bug here. Uwe F. Mayer + +The upshot is that, for each benchmark test, the true average is -- with a +95% level of confidence -- within 5% of the average reported. Here, the +"true average" is the average we would get were we able to run the tests +over and over again an infinite number of times. + +This specification ensures that the calculation of results is controlled; +that someone running the tests in California will use the same technique for +determining benchmark results as someone running the tests in New York. + +In case there is uneven system load due to other processes while this +benchmark suite executes, it might take longer to run the benchmark suite +as compared to a run an unloaded system. This is because the benchmark does +some statistical analysis to make sure that the reported results are +statistically significant (as explained above), and a high variation in +individual runs requires more runs to achieve the required statistical +confidence. + +*** added last the paragraph, Uwe F. Mayer + +Interpreting Results + +Of course, running the benchmarks can present you with a boatload of data. +It can get mystifying, and some of the more esoteric statistical information +is valuable only to a limited audience. The big question is: What does it +all mean? + +First, we should point out that the BYTEmark reports both "raw" and indexed +scores for each test. The raw score for a particular test amounts to the +"iterations per second" of that test. 
For example, the numeric sort test +reports as its raw score the number of arrays it was able to sort per +second. + +The indexed score is the raw score of the system under test divided by the +raw score obtained on the baseline machine. As of this release, the +baseline machine is a DELL 90 Mhz Pentium XPS/90 with 16 MB of RAM and 256K +of external processor cache. (The compiler used was the Watcom C/C++ 10.0 +compiler; optimizations set to "fastest possible code", 4-byte structure +alignment, Pentium code generation with Pentium register-based calling. The +operating system was MSDOS.) The indexed score serves to "normalize" the +raw scores, reducing their dynamic range and making them easier to +grasp. Simply put, if your machine has an index score of 2.0 on the numeric +sort test, it performed that test twice as fast as this 90 Mhz Pentium. + +If you run all the tests (as you'll see, it is possible to perform "custom +runs", which execute only a subset of the tests) the BYTEmark will also +produce two overall index figures: Integer index and Floating-point index. +The Integer index is the geometric mean of those tests that involve only +integer processing -- numeric sort, string sort, bitfield, emulated +floating-point, assignment, Huffman, and IDEA -- while the Floating-point +index is the geometric mean of those tests that require the floating-point +coprocessor -- Fourier, neural net, and LU decomposition. You can use these +scores to get a general feel for the performance of the machine under test +as compared to the baseline 90 Mhz Pentium. + +The Linux/Unix port has a second baseline machine, it is an AMD K6/233 with +32 MB RAM and 512 KB L2-cache running Linux 2.0.32 and using GNU gcc +version 2.7.2.3 and libc-5.4.38. The integer index was split as suggested +by Andrew D. Balsa <and...@us...>, and reflects the realization that +memory management is important in CPU design. The original tests have been +left alone, however, the geometric mean of the tests NUMERIC SORT, FP +EMULATION, IDEA, and HUFFMAN now constitutes the integer-arithmetic focused +benchmark index, while the geometric mean of the tests STRING SORT, +BITFIELD, and ASSIGNMENT makes up the new memory index. The floating point +index has been left alone, it is still the geometric mean of FOURIER, +NEURAL NET, and LU DECOMPOSITION. + +*** added the section on Linux, Uwe F. Mayer + +What follows is a list of the benchmarks and associated brief remarks that +describe what the tests do: What they exercise; what a "good" result or a +"bad" result means. Keep in mind that, in this expanding universe of faster +processors, bigger caches, more elaborate memory architectures, "good" and +"bad" are indeed relative terms. A good score on today's hot new processor +will be a bad score on tomorrow's hot new processor. + +These remarks are based on empirical data and profiling that we have done to +date. (NOTE: The profiling is limited to Intel and Motorola 68K on this +release. As more data is gathered, we will be refining this section. +3/14/95--RG) + +Benchmark Description + +Numeric sort Generic integer performance. Should + exercise non-sequential performance + of cache (or memory if cache is less + than 8K). Moves 32-bit longs at a + time, so 16-bit processors will be + at a disadvantage. + + + +String sort Tests memory-move performance. + Should exercise non-sequential + performance of cache, with added + burden that moves are byte-wide and + can occur on odd address boundaries. 
+ May tax the performance of + cell-based processors that must + perform additional shift operations + to deal with bytes. + + + +Bitfield Exercises "bit twiddling" + performance. Travels through memory + in a somewhat sequential fashion; + different from sorts in that data is + merely altered in place. If + properly compiled, takes into + account 64-bit processors, which + should see a boost. + + + +Emulated F.P. Past experience has shown this test + to be a good measurement of overall + performance. + + + +Fourier Good measure of transcendental and + trigonometric performance of FPU. + Little array activity, so this test + should not be dependent of cache or + memory architecture. + + + +Assignment The test moves through large integer + arrays in both row-wise and + column-wise fashion. Cache/memory + with good sequential performance + should see a boost (memory is + altered in place -- no moving as in + a sort operation). Processing is + done in 32-bit chunks -- no + advantage given to 64-bit + processors. + + + +Huffman A combination of byte operations, + bit twiddling, and overall integer + manipulation. Should be a good + general measurement. + + + +IDEA Moves through data sequentially in + 16-bit chunks. Should provide a + good indication of raw speed. + + + +Neural Net Small-array floating-point test + heavily dependent on the exponential + function; less dependent on overall + FPU performance. Small arrays, so + cache/memory architecture should not + come into play. + + + +LU decomposition. A floating-point test that moves + through arrays in both row-wise and + column-wise fashion. Exercises only + fundamental math operations (+, -, + *, /). + +The Command File + +Purpose + +The BYTEmark program allows you to override many of its default parameters +using a command file. The command file also lets you request statistical +information, as well as specify an output file to hold the test results for +later use. + +You identify the command file using a command-line argument. E.G., + +C:NBENCH -cCOMFILE.DAT + +tells the benchmark program to read from COMFILE.DAT in the current +directory. + +The content of the command file is simply a series of parameter names and +values, each on a single line. The parameters control internal variables +that are either global in nature (i.e., they effect all tests in the +program) or are specific to a given benchmark test. + +The parameters are listed in a reference guide that follows, arranged in the +following groups: + +Global Parameters + +Numeric Sort + +String Sort + +Bitfield + +Emulated floating-point + +Fourier coefficients + +Assignment algorithm + +IDEA encryption + +Huffman compression + +Neural net + +LU decomposition + +As mentioned above, those items listed under "Global Parameters" affect all +tests; the rest deal with specific benchmarks. There is no required ordering +to parameters as they appear in the command file. You can specify them in +any sequence you wish. + +You should be judicious in your use of a command file. Some parameters will +override the "dynamic workload" adjustment that each test performs. Doing +this completely bypasses the benchmark code that is designed to produce an +accurate reading from your system clock. Other parameters will alter default +settings, yielding test results that cannot be compared with published +benchmark results. 
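Since the command file is nothing more than NAME=value pairs, one per line and in any order, reading it is straightforward. The fragment below is only an illustrative sketch of that format, written against the parameters documented in the reference that follows; the struct, its field names, and the parse_line() helper are invented for this example and are not the code nbench itself uses.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct bench_opts {
    long global_min_ticks;   /* GLOBALMINTICKS */
    int  min_seconds;        /* MINSECONDS, default 5 per the reference below */
    int  all_stats;          /* ALLSTATS=T|F */
    int  custom_run;         /* CUSTOMRUN=T|F */
    char outfile[256];       /* OUTFILE=<path>, no quotes */
};

static void parse_line(struct bench_opts *o, char *line)
{
    char *eq, *name, *value;

    eq = strchr(line, '=');
    if (eq == NULL)
        return;                             /* skip lines without NAME=value form */
    *eq = '\0';
    name  = line;
    value = eq + 1;
    value[strcspn(value, "\r\n")] = '\0';   /* strip the trailing newline */

    if (strcmp(name, "GLOBALMINTICKS") == 0)
        o->global_min_ticks = atol(value);
    else if (strcmp(name, "MINSECONDS") == 0)
        o->min_seconds = atoi(value);
    else if (strcmp(name, "ALLSTATS") == 0)
        o->all_stats = (value[0] == 'T');
    else if (strcmp(name, "CUSTOMRUN") == 0)
        o->custom_run = (value[0] == 'T');
    else if (strcmp(name, "OUTFILE") == 0) {
        strncpy(o->outfile, value, sizeof(o->outfile) - 1);
        o->outfile[sizeof(o->outfile) - 1] = '\0';
    }
    /* the DOxxx, xxxMINSECONDS and size/loop overrides follow the same pattern */
}

int main(int argc, char **argv)
{
    struct bench_opts opts = { 0L, 5, 0, 0, "" };
    char buf[512];
    FILE *fp;

    if (argc < 2 || (fp = fopen(argv[1], "r")) == NULL)
        return 1;
    while (fgets(buf, sizeof(buf), fp) != NULL)
        parse_line(&opts, buf);
    fclose(fp);
    printf("ALLSTATS=%c CUSTOMRUN=%c MINSECONDS=%d\n",
           opts.all_stats ? 'T' : 'F',
           opts.custom_run ? 'T' : 'F',
           opts.min_seconds);
    return 0;
}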
+ +A Sample Command File + +Suppose you built a command file that contained the following: + +ALLSTATS=T + +CUSTOMRUN=T + +OUTFILE=D:\DATA.DAT + +DONUMSORT=T + +DOLU=T + +Here's what this file tells the benchmark program: + +ALLSTATS=T means that you've requested a "dump" of all the statistics the +test gathers. This includes not only the standard deviations of tests run, +it also produces test-specific information such as the number of arrays +built, the array size, etc. + +CUSTOMRUN=T tells the system that this is a custom run. Only tests +explicitly specified will be executed. + +OUTFILE=D:\DATA.DAT will write the output of the benchmark to the file +DATA.DAT on the root of the D: drive. (If DATA.DAT already exists, output +will be appended to the file.) + +DONUMSORT=T tells the system to run the numeric sort benchmark. (This was +necessary on account of the CUSTOMRUN=T line, above.) + +DOLU=T tells the system to run the LU decomposition benchmark. + +Command File Parameters Reference + +(NOTE: Altering some global parameters can invalidate results for comparison +purposes. Those parameters are indicated in the following section by a bold +asterisk (*). If you alter any parameters so indicated, you may NOT publish +the resulting data as BYTEmark scores.) + +Global Parameters + +GLOBALMINTICKS=<n> + +This overrides the default global_min_ticks value (defined in NBENCH1.H). +The global_min_ticks value is defined as the minimum number of clock ticks +per iteration of a particular benchmark. For example, if global_min_ticks is +set to 100 and the numeric sort benchmark is run; each iteration MUST take +at least 100 ticks, or the system will expand the work-per-iteration. + +MINSECONDS=<n> + +Sets the minimum number of seconds any particular test will run. This has +the effect of controlling the number of repetitions done. Default: 5. + +ALLSTATS=<T|F> + +Set this flag to T for a "dump" of all statistics. The information displayed +varies from test to test. Default: F. + +OUTFILE=<path> + +Specifies that output should go to the specified output file. Any test +results and statistical data displayed on-screen will also be written to the +file. If the file does not exist, it will be created; otherwise, new output +will be appended to an existing file. This allows you to "capture" several +runs into a single file for later review. + +Note: the path should not appear in quotes. For example, something like the +following would work: OUTFILE=C:\BENCH\DUMP.DAT + +CUSTOMRUN=<T|F> + +Set this flag to T for a custom run. A "custom run" means that the program +will run only the benchmark tests that you explicitly specify. So, use this +flag to run a subset of the tests. Default: F. + +Numeric Sort + +DONUMSORT=<T|F> + +Indicates whether to do the numeric sort. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case default is F. + +NUMNUMARRAYS=<n> + +Indicates the number of numeric arrays the system will build. Setting this +value will override the program's "dynamic workload" adjustment for this +test.* + +NUMARRAYSIZE=<n> + +Indicates the number of elements in each numeric array. Default is 8001 +entries. (NOTE: Altering this value will invalidate the test for comparison +purposes. The performance of the numeric sort test is not related to the +array size as a linear function; i.e., an array twice as big will not take +twice as long. The relationship involves a logarithmic function.)* + +NUMMINSECONDS=<n> + +Overrides MINSECONDS for the numeric sort test. 
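Tying the global and numeric sort parameters together, a custom run that exercises only the numeric sort, with full statistics and a longer minimum runtime, could use a command file such as the following (the values are arbitrary illustrations, not recommended settings):

CUSTOMRUN=T
ALLSTATS=T
OUTFILE=C:\BENCH\NUMSORT.DAT
DONUMSORT=T
NUMMINSECONDS=10

NUMARRAYSIZE and NUMNUMARRAYS are deliberately left at their defaults here so that the results remain comparable with published BYTEmark scores.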
+ +String Sort + +DOSTRINGSORT=<T|F> + +Indicates whether to do the string sort. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +STRARRAYSIZE=<n> + +Sets the size of the string array. Default is 8111. (NOTE: Altering this +value will invalidate the test for comparison purposes. The performance of +the string sort test is not related to the array size as a linear function; +i.e., an array twice as big will not take twice as long. The relationship +involves a logarithmic function.)* + +NUMSTRARRAYS=<n> + +Sets the number of string arrays that will be created to run the test. +Setting this value will override the program's "dynamic workload" adjustment +for this test.* + +STRMINSECONDS=<n> + +Overrides MINSECONDS for the string sort test. + +Bitfield + +DOBITFIELD=<T|F> + +Indicates whether to do the bitfield test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +NUMBITOPS=<n> + +Sets the number of bitfield operations that will be performed. Setting this +value will override the program's "dynamic workload" adjustment for this +test.* + +BITFIELDSIZE=<n> + +Sets the number of 32-bit elements in the bitfield arrays. The default value +is dependent on the size of a long as defined by the current compiler. For a +typical compiler that defines a long to be 32 bits, the default is 32768. +(NOTE: Altering this parameter will invalidate test results for comparison +purposes.)* + +BITMINSECONDS=<n> + +Overrides MINSECONDS for the bitfield test. + +Emulated floating-point + +DOEMF=<T|F> + +Indicates whether to do the emulated floating-point test. Default is T, +unless this is a custom run (CUSTOMRUN=T), in which case the default is F. + +EMFARRAYSIZE=<n> + +Sets the size (number of elements) of the emulated floating-point benchmark. +Default is 3000. The test builds three arrays, each of equal size. This +parameter sets the number of elements for EACH array. (NOTE: Altering this +parameter will invalidate test results for comparison purposes.)* + +EMFLOOPS=<n> + +Sets the number of loops per iteration of the floating-point test. Setting +this value will override the program's "dynamic workload" adjustment for +this test.* + +EMFMINSECONDS=<n> + +Overrides MINSECONDS for the emulated floating-point test. + +Fourier coefficients + +DOFOUR=<T|F> + +Indicates whether to do the Fourier test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +FOURASIZE=<n> + +Sets the size of the array for the Fourier test. This sets the number of +coefficients the test will derive. NOTE: Specifying this value will override +the system's "dynamic workload" adjustment for this test, and may make the +results invalid for comparison purposes.* + +FOURMINSECONDS=<n> + +Overrides MINSECONDS for the Fourier test. + +Assignment Algorithm + +DOASSIGN=<T|F> + +Indicates whether to do the assignment algorithm test. Default is T, unless +this is a custom run (CUSTOMRUN=T), in which case the default is F. + +ASSIGNARRAYS=<n> + +Indicates the number of arrays that will be built for the test. Specifying +this value will override the system's "dynamic workload" adjustment for this +test. (NOTE: The size of the arrays in the assignment algorithm is fixed at +101 x 101. Altering the array size requires adjusting global constants and +recompiling; to do so, however, would invalidate test results.)* + +ASSIGNMINSECONDS=<n> + +Overrides MINSECONDS for the assignment algorithm test. 
+ +IDEA encryption + +DOIDEA=<T|F> + +Indicates whether to do the IDEA encryption test. Default is T, unless this +is a custom run (CUSTOMRUN=T), in which case the default is F. + +IDEAARRAYSIZE=<n> + +Sets the size of the plain-text character array that will be encrypted by the +test. Default is 4000. The benchmark actually builds 3 arrays: 1st +plain-text, encrypted version, and 2nd plain-text. The 2nd plain-text array is +the destination for the decryption process [part of the test]. All arrays +are set to the same size. (NOTE: Specifying this value will invalidate test +results for comparison purposes.)* + +IDEALOOPS=<n> + +Indicates the number of loops in the IDEA test. Specifying this value will +override the system's "dynamic workload" adjustment for this test.* + +IDEAMINSECONDS=<n> + +Overrides MINSECONDS for the IDEA test. + +Huffman compression + +DOHUFF=<T|F> + +Indicates whether to do the Huffman test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +HUFFARRAYSIZE=<n> + +Sets the size of the string buffer that will be compressed using the Huffman +test. The default is 5000. (NOTE: Altering this value will invalidate test +results for comparison purposes.)* + +HUFFLOOPS=<n> + +Sets the number of loops in the Huffman test. Specifying this value will +override the system's "dynamic workload" adjustment for this test.* + +HUFFMINSECONDS=<n> + +Overrides MINSECONDS for the Huffman test. + +Neural net + +DONNET=<T|F> + +Indicates whether to do the Neural Net test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +NNETLOOPS=<n> + +Sets the number of loops in the Neural Net test. NOTE: Altering this value +overrides the benchmark's "dynamic workload" adjustment algorithm, and may +invalidate the results for comparison purposes.* + +NNETMINSECONDS=<n> + +Overrides MINSECONDS for the Neural Net test. + +LU decomposition + +DOLU=<T|F> + +Indicates whether to do the LU decomposition test. Default is T, unless this +is a custom run (CUSTOMRUN=T), in which case the default is F. + +LUNUMARRAYS=<n> + +Sets the number of arrays in each iteration of the LU decomposition test. +Specifying this value will override the system's "dynamic workload" +adjustment for this test.* + +LUMINSECONDS=<n> + +Overrides MINSECONDS for the LU decomposition test. + +Numeric Sort + +Description + +This benchmark is designed to explore how well the system sorts a numeric +array. In this case, a numeric array is a one-dimensional collection of +signed, 32-bit integers. The actual sorting is performed by a heapsort +algorithm (see the text box following for a description of the heapsort +algorithm). + +It's probably unnecessary to point out (but we'll do it anyway) that sorting +is a fundamental operation in computer application software. You'll likely +find sorting routines nestled deep inside a variety of applications; +everything from database systems to operating-systems kernels. + +The numeric sort benchmark reports the number of arrays it was able to sort +per second. The array size is set by a global constant (it can be overridden +by the command file -- see below). + +Analysis + +Optimized 486 code: Profiling of the numeric sort benchmark using Watcom's +profiler (Watcom C/C++ 10.0) indicates that the algorithm spends most of its +time in the numsift() function (specifically, about 90% of the benchmark's +time takes place in numsift()). 
Within numsift(), two if statements dominate +time spent: + +if(array[k]<array[k+1L]) and if(array[i]<array[k]) + +Both statements involve indexes into arrays, so it's likely the processor is +spending a lot of time resolving the array references. (Though both +statements involve "less-than" comparisons, we doubt that much time is +consumed in performing the signed compare operation.) Though the first +statement involves array elements that are adjacent to one another, the +second does not. In fact, the second statement will probably involve +elements that are far apart from one another during early passes through the +sifting process. We expect that systems whose caching system pre-fetches +contiguous elements (often in "burst" line fills) will not have any great +advantage of systems without pre-fetch mechanisms. + +Similar results were found when we profiled the numeric sort algorithm under +the Borland C/C++ compiler. + +680x0 Code (Macintosh CodeWarrior): CodeWarrior's profiler is function +based; consequently, it does not allow for line-by-line analysis as does the +Watcom compiler's profiler. + +However, the CodeWarrior profiler does give us enough information to note +that NumSift() only accounts for about 28% of the time consumed by the +benchmark. The outer routine, NumHeapSort() accounts for around 71% of the +time taken. It will require additional analysis to determine why the two +compilers -- Watcom and CodeWarrior divide the workload so differently. (It +may have something to do with compiler architecture, or the act of profiling +the code may produce results that are significantly different than how the +program runs under normal conditions, though that would lead one to wonder +what use profilers would be.) + +Porting Considerations + +The numeric sort routine should represent a trivial porting exercise. It is +not an overly large benchmark in terms of source code. Additionally, the +only external routines it calls on are for allocating and releasing memory, +and managing the stopwatch. + +The numeric sort benchmark depends on the following global definitions (note +that these may be overridden by the command file): + +NUMNUMARRAYS -- Sets the upper limit on the number of arrays that the +benchmark will attempt to build. The numeric sort benchmark creates work for +itself by requiring the system to sort more and more arrays...not bigger and +bigger arrays. (The latter case would skew results, because the sorting time +for heapsort is N log2 N - e.g., doubling the array size does not double the +sort time.) This constant sets the upper limit to the number of arrays the +system will build before it signals an error. The default value is 100, and +may be changed if your system exceeds this limit. + +NUMARRAYSIZE - Determines the size of each array built. It has been set to +8111L and should not be tampered with. The command file entry +NUMARRAYSIZE=<n> can be used to change this value, but results produced by +doing this will make your results incompatible with other runs of the +benchmark (since results will be skewed -- see preceding paragraph). + +To test for a correct execution of the numeric sort benchmark, #define the +DEBUG symbol. This will enable code that verifies that arrays are properly +sorted. You should run the benchmark program using a command file that has +only the numeric sort test enabled. 
If there is an error, the program will +display "SORT ERROR" (If this happens, it's possible that tons of "SORT +ERROR" messages will be emitted, so it's best not to redirect output to a +file), otherwise it will print "Numeric sort: OK" (also quite a few times). + +References + +Gonnet, G.H. 1984, Handbook of Algorithms and Data Structures (Reading, MA: +Addison-Wesley). + +Knuth, Donald E. 1968, Fundamental Algorithms, vol 1 of The Art of Computer +Programming (Reading, MA: Addison-Wesley). + +Press, William H., Flannery, Brian P., Teukolsky, Saul A., and Vetterling, +William T. 1989, Numerical Recipes in Pascal (Cambridge: Cambridge +University Press). + +Heapsort + +The heapsort algorithm is well-covered in a number of the popular +computer-science textbooks. In fact, it gets a pat on the back in Numerical +Recipes (Press et. al.), where the authors write: + +Heapsort is our favorite sorting routine. It can be recommended +wholeheartedly for a variety of sorting applications. It is a true +"in-place" sort, requiring no auxiliary storage. + +Heapsort works by building the array into a kind of a queue called a heap. +You can imagine this heap as being a form of in-memory binary tree. The +topmost (root) element of the tree is the element that -- were the array +sorted -- would be the largest element in the array. Sorting takes place by +first constructing the heap, then pulling the root off the tree, promoting +the next largest element to the root, pulling it off, and so on. (The +promotion process is known as "sifting up.") + +Heapsort executes in N log2 N time even in its worst case. Unlike some other +sorting algorithms, it does not benefit from a partially sorted array +(though Gonnet does refer to a variation of heapsort, called "smoothsort," +which does -- see references). + +String Sort + +Description + +This benchmark is designed to gauge how well the system moves bytes around. +By that we mean, how well the system can copy a string of bytes from one +location to another; source and destination being aligned to arbitrary +addresses. (This is unlike the numeric sort array, which moves bytes +longword-at-a-time.) The strings themselves are built so as to be of random +length, ranging from no fewer than 4 bytes and no greater than 80 bytes. The +mixture of random lengths means that processors will be forced to deal with +strings that begin and end on arbitrary address boundaries. + +The string sort benchmark uses the heapsort algorithm; this is the same +algorithm as is used in the numeric sort benchmark (see the sidebar on the +heapsort for a detailed description of the algorithm). + +Manipulation of the strings is actually handled by two arrays. One array +holds the strings themselves; the other is a pointers array. Each member of +the pointers array carries an offset that points into the string array, so +that the ith pointer carries the offset to the ith string. This allows the +benchmark to rapidly locate the position of the ith string. (The sorting +algorithm requires exchanges of items that might be "distant" from one +another in the array. It's critical that the routine be able to rapidly find +a string based on its indexed position in the array.) + +The string sort benchmark reports the number of string arrays it was able to +sort per second. The size of the array is set by a global constant. + +Analysis + +Optimized 486 code (Watcom C/C++ 10.0): Profiling of the string sort +benchmark indicates that it spends most of its time in the C library routine +memmove(). 
Within that routine, most of the execution is consumed by a pair +of instructions: rep movsw and rep movsd. These are repeated string move -- +word width and repeated string move -- doubleword width, respectively. + +This is precisely where we want to see the time spent. It's interesting to +note that the memmove() of the particular compiler/profiler tested (Watcom +C/C++ 10.0) was "smart" enough to do most of the moving on word or +doubleword boundaries. The string sort benchmark specifically sets arbitrary +boundaries, so we'd expect to see lots of byte-wide moves. The "smart" +memmove() is able to move bytes only when it has to, and does the remainder +of the work via words and doublewords (which can move more bits at a time). + +680x0 Code (Macintosh CodeWarrior): Because CodeWarrior's profiler is +function based, it is impossible to get an idea of how much time the test +spends in library routines such as memmove(). Fortunately, as an artifact of +the early version of the benchmark, the string sort algorithm makes use of +the MoveMemory() routine in the sysspec.c file (system specific routines). +This call, on anything other than a 16-bit DOS system, calls memmove() +directly. Hence, we can get a good approximation of how much time is spent +moving bytes. + +The answer is that nearly 78% of the benchmark's time is consumed by +MoveMemory(), the rest being taken up by the other routines (the +str_is_less() routine, which performs string comparisons, takes about 7% of +the time). As above, we can guess that most of the benchmark's time is +dependent on the performance of the library's memmove() routine. + +Porting Considerations + +As with the numeric sort routine, the string sort benchmark should be simple +to port. Simpler, in fact. The string sort benchmark routine is not +dependent on any typedef that may change from machine to machine (unless a +char type is not 8 bits). + +The string sort benchmark depends on the following global definitions: + +NUMSTRARRAYS - Sets the upper limit on the number of arrays that the +benchmark will attempt to build. The string sort benchmark creates work for +itself by requiring the system to sort more and more arrays, not bigger and +bigger arrays. (See section on Numeric Sort for an explanation.) This +constant sets the upper limit to the number of arrays the system will build +before it signals an error. The default value is 100, and may be changed if +your system exceeds this limit. + +STRARRAYSIZE - Sets the default size of the string arrays built. We say +"arrays" because, as with the numeric sort benchmark, the system adds work +not by expanding the size of the array, but by adding more arrays. This +value is set to 8111, and should not be modified, since results would not be +comparable with other runs of the same benchmark on other machines. + +To test for a correct execution of the string sort benchmark, #define +the DEBUG symbol. This will enable code that verifies the arrays are +properly sorted. Set up a command file that runs only the string sort, +and execute the benchmark program. If the routine is operating +properly, the benchmark will print "String sort: OK", this message is +printed quite often. Otherwise, the program will display "SORT ERROR" +for each pair of strings it finds out of order (which can be really +often). + +References + +See the references for the Numeric Sort benchmark. 
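Both sort benchmarks above rest on the same heapsort routine, so a code companion to the heapsort sidebar may help. The version below is a generic textbook sift-down formulation over plain ints, shown only to make the prose concrete; it is not the benchmark's own NumHeapSort()/numsift() source, which operates on 32-bit longs (and, in the string sort, works through the offset array described earlier).

static void sift_down(int a[], int root, int end)
{
    int child, tmp;

    while (2 * root + 1 <= end) {
        child = 2 * root + 1;
        if (child + 1 <= end && a[child] < a[child + 1])
            child++;                    /* take the larger of the two children */
        if (a[root] >= a[child])
            return;                     /* heap property already holds */
        tmp = a[root]; a[root] = a[child]; a[child] = tmp;
        root = child;                   /* keep sifting downward */
    }
}

static void heap_sort(int a[], int n)
{
    int start, end, tmp;

    /* build the heap: after this loop a[0] holds the largest element */
    for (start = n / 2 - 1; start >= 0; start--)
        sift_down(a, start, n - 1);

    /* repeatedly pull the root off, shrink the heap, and restore the heap */
    for (end = n - 1; end > 0; end--) {
        tmp = a[0]; a[0] = a[end]; a[end] = tmp;
        sift_down(a, 0, end - 1);
    }
}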
+ +Bitfield Operations + +Description + +The purpose of this benchmark is to explore how efficiently the system +executes operations that deal with "twiddling bits." The test is set up to +simulate a "bit map"; a data structure used to keep track of storage usage. +(Don't confuse this meaning of "bitmap" with its use in describing a +graphics data structure.) + +Systems often use bit maps to keep an inventory of memory blocks or (more +frequently) disk blocks. In the case of a bit map that manages disk usage, +an operating system will set aside a buffer in memory so that each bit in +that buffer corresponds to a block on the disk drive. A 0 bit means that the +corresponding block is free; a 1 bit means the block is in use. Whenever a +file requests a new block of disk storage, the operating system searches the +bit map for the first 0 bit, sets the bit (to indicate that the block is now +spoken for), and returns the number of the corresponding disk block to the +requesting file. + +These types of operations are precisely what this test simulates. A block of +memory is set allocated for the bit map. Another block of memory is +allocated, and set up to hold a series of "bit map commands". Each bitmap +command tells the simulation to do 1 of 3 things: + +1) Clear a series of consecutive bits, + +2) Set a series of consecutive bits, or + +3) Complement (1->0 and 0->1) a series of consecutive bits. + +The bit map command block is loaded with a set of random bit map commands +(each command covers an random number of bits), and simulation routine steps +sequentially through the command block, grabbing a command and executing it. + +The bitfield benchmark reports the number of bits it was able to operate on +per second. The size of the bit map is constant; the bitfield operations +array is adjusted based on the capabilities of the processor. (See the +section describing the auto-adjust feature of the benchmarks.) + +Analysis + +Optimized 486 code: Using the Watcom C/C++ 10.0 profiler, the Bitfield +benchmark appears to spend all of its time in two routines: ToggleBitRun() +(74% of the time) and DoBitFieldIteration() (24% of the time). We say +"appears" because this is misleading, as we will explain. + +First, it is important to recall that the test performs one of three +operations for each run of bits (see above). The routine ToggleBitRun() +handles two of those three operations: setting a run of bits and clearing a +run of bits. An if() statement inside ToggleBitRun() decides which of the +two operations is performed. (Speed freaks will quite rightly point out that +this slows the entire algorithm. ToggleBitRun() is called by a switch() +statement which has already decided whether bits should be set or cleared; +it's a waste of time to have ToggleBitRun() have to make that decision yet +again.) + +DoBitFieldIteration() is the "outer" routine that calls ToggleBitRun(). +DoBitFieldIt... [truncated message content] |