[inq-commits] SF.net SVN: inq: [467] trunk/client
From: <sta...@us...> - 2008-01-28 10:45:24
Revision: 467 http://inq.svn.sourceforge.net/inq/?rev=467&view=rev Author: stargrave Date: 2008-01-28 02:45:30 -0800 (Mon, 28 Jan 2008) Log Message: ----------- * Removed unneded test/bench directory creation * Added BYTEmark-based benchmark Modified Paths: -------------- trunk/client/Makefile trunk/client/lib/Makefile Added Paths: ----------- trunk/client/lib/bytemark/ trunk/client/lib/bytemark/COM.DAT trunk/client/lib/bytemark/Makefile trunk/client/lib/bytemark/NNET.DAT trunk/client/lib/bytemark/README trunk/client/lib/bytemark/bdoc.txt trunk/client/lib/bytemark/debugbit.good.gz trunk/client/lib/bytemark/emfloat.c trunk/client/lib/bytemark/emfloat.h trunk/client/lib/bytemark/hardware.c trunk/client/lib/bytemark/hardware.h trunk/client/lib/bytemark/hello.c trunk/client/lib/bytemark/misc.c trunk/client/lib/bytemark/misc.h trunk/client/lib/bytemark/nbench0.c trunk/client/lib/bytemark/nbench0.h trunk/client/lib/bytemark/nbench1.c trunk/client/lib/bytemark/nbench1.h trunk/client/lib/bytemark/nmglobal.h trunk/client/lib/bytemark/pointer.c trunk/client/lib/bytemark/sysinfo.c.example trunk/client/lib/bytemark/sysinfo.c.template trunk/client/lib/bytemark/sysinfo.sh trunk/client/lib/bytemark/sysinfoc.c.example trunk/client/lib/bytemark/sysinfoc.c.template trunk/client/lib/bytemark/sysspec.c trunk/client/lib/bytemark/sysspec.h trunk/client/lib/bytemark/wordcat.h trunk/client/test/bytemark Modified: trunk/client/Makefile =================================================================== --- trunk/client/Makefile 2008-01-28 10:26:35 UTC (rev 466) +++ trunk/client/Makefile 2008-01-28 10:45:30 UTC (rev 467) @@ -26,8 +26,7 @@ $(DESTDIR)$(SHARE_DIR)/test \ $(DESTDIR)$(SHARE_DIR)/monitoring \ $(DESTDIR)$(SHARE_DIR)/self-id \ - $(DESTDIR)$(SHARE_DIR)/test/additional \ - $(DESTDIR)$(SHARE_DIR)/test/bench + $(DESTDIR)$(SHARE_DIR)/test/additional find detect -type f -and ! -path '*.svn*' | xargs install -m644 -t $(DESTDIR)$(SHARE_DIR)/detect/ find test -type f -and ! -path '*.svn*' | xargs install -m755 -t $(DESTDIR)$(SHARE_DIR)/test/ find test/additional -type f -and ! 
-path '*.svn*' | xargs install -m755 -t $(DESTDIR)$(SHARE_DIR)/test/additional/ Modified: trunk/client/lib/Makefile =================================================================== --- trunk/client/lib/Makefile 2008-01-28 10:26:35 UTC (rev 466) +++ trunk/client/lib/Makefile 2008-01-28 10:45:30 UTC (rev 467) @@ -1,7 +1,7 @@ .EXPORT_ALL_VARIABLES: #SUBDIRS=alert hide-mouse memtest pci-patch wait-serial watchdog inquisitor thermometer -SUBDIRS=watchdog wait-string einarc stream-mem whetstone dhrystone +SUBDIRS=watchdog wait-string einarc stream-mem whetstone dhrystone bytemark include ../../Makefile.config Added: trunk/client/lib/bytemark/COM.DAT =================================================================== --- trunk/client/lib/bytemark/COM.DAT (rev 0) +++ trunk/client/lib/bytemark/COM.DAT 2008-01-28 10:45:30 UTC (rev 467) @@ -0,0 +1,11 @@ +ALLSTATS=T +DONUMSORT=T +DOSTRINGSORT=T +DOBITFIELD=T +DOEMF=T +DOFOUR=T +DOASSIGN=T +DOIDEA=T +DOHUFF=T +DONNET=T +DOLU=T Added: trunk/client/lib/bytemark/Makefile =================================================================== --- trunk/client/lib/bytemark/Makefile (rev 0) +++ trunk/client/lib/bytemark/Makefile 2008-01-28 10:45:30 UTC (rev 467) @@ -0,0 +1,64 @@ +include ../../../Makefile.config + +default: nbench +CC = gcc +CFLAGS = -s -static -Wall -O3 -fomit-frame-pointer -funroll-loops +MACHINE= +DEFINES= -DLINUX $(NO_UNAME) + +all: nbench + +sysinfoc.c: Makefile + ./sysinfo.sh $(CC) $(MACHINE) $(DEFINES) $(CFLAGS) + +sysinfo.c: Makefile + ./sysinfo.sh $(CC) $(MACHINE) $(DEFINES) $(CFLAGS) + +hardware.o: hardware.c hardware.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c hardware.c + +nbench0.o: nbench0.h nbench0.c nmglobal.h pointer.h hardware.h\ + Makefile sysinfo.c sysinfoc.c + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c nbench0.c + +emfloat.o: emfloat.h emfloat.c nmglobal.h pointer.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c emfloat.c + +pointer.h: pointer Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -o pointer pointer.c + rm -f pointer.h + if [ "4" = `./pointer` ] ; then touch pointer.h ;\ + else echo "#define LONG64" >pointer.h ; fi + +misc.o: misc.h misc.c Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c misc.c + +nbench1.o: nbench1.h nbench1.c wordcat.h nmglobal.h pointer.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c nbench1.c + +sysspec.o: sysspec.h sysspec.c nmglobal.h pointer.h Makefile + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS)\ + -c sysspec.c + +nbench: emfloat.o misc.o nbench0.o nbench1.o sysspec.o hardware.o + $(CC) $(MACHINE) $(DEFINES) $(CFLAGS) $(LINKFLAGS)\ + emfloat.o misc.o nbench0.o nbench1.o sysspec.o hardware.o\ + -o nbench -lm +clean: + - /bin/rm -f *.o *~ \#* core a.out hello sysinfo.c sysinfoc.c \ + bug pointer pointer.h debugbit.dat + +mrproper: clean + - /bin/rm -f nbench + +install: + mkdir -p $(DESTDIR)$(BIN_DIR) $(DESTDIR)$(SHARE_DIR)/test/additional + install -m755 nbench $(DESTDIR)$(BIN_DIR)/ + cp NNET.DAT $(DESTDIR)$(SHARE_DIR)/test/additional/ + cp COM.DAT $(DESTDIR)$(SHARE_DIR)/test/additional/ Added: trunk/client/lib/bytemark/NNET.DAT =================================================================== --- trunk/client/lib/bytemark/NNET.DAT (rev 0) +++ trunk/client/lib/bytemark/NNET.DAT 2008-01-28 10:45:30 UTC (rev 467) @@ -0,0 +1,210 @@ +5 7 8 +26 +0 0 1 0 0 +0 1 0 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 0 0 0 0 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +0 1 0 0 0 0 1 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 
0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 0 0 1 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +0 1 0 0 0 1 0 0 +1 1 1 1 1 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 1 1 +0 1 0 0 0 1 0 1 +1 1 1 1 1 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +0 1 0 0 0 1 1 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 1 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 0 1 1 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 0 1 0 0 0 +0 1 1 1 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 1 1 1 0 +0 1 0 0 1 0 0 1 +0 0 0 0 1 +0 0 0 0 1 +0 0 0 0 1 +0 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 1 0 1 0 +1 0 0 0 1 +1 0 0 1 0 +1 0 1 0 0 +1 1 0 0 0 +1 0 1 0 0 +1 0 0 1 0 +1 0 0 0 1 +0 1 0 0 1 0 1 1 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +1 1 1 1 1 +0 1 0 0 1 1 0 0 +1 0 0 0 1 +1 1 0 1 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 0 1 1 0 1 +1 0 0 0 1 +1 1 0 0 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 0 1 1 +1 0 0 0 1 +0 1 0 0 1 1 1 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 0 1 1 1 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +1 0 0 0 0 +1 0 0 0 0 +1 0 0 0 0 +0 1 0 1 0 0 0 0 +0 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 1 0 1 +1 0 0 1 1 +0 1 1 1 1 +0 1 0 1 0 0 0 1 +1 1 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 1 1 1 0 +1 0 1 0 0 +1 0 0 1 0 +1 0 0 0 1 +0 1 0 1 0 0 1 0 +0 1 1 1 1 +1 0 0 0 0 +1 0 0 0 0 +0 1 1 1 0 +0 0 0 0 1 +0 0 0 0 1 +1 1 1 1 0 +0 1 0 1 0 0 1 1 +1 1 1 1 1 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 1 0 1 0 1 0 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 1 1 0 +0 1 0 1 0 1 0 1 +1 0 0 0 1 +1 0 0 0 1 +0 1 0 1 0 +0 1 0 1 0 +0 1 0 1 0 +0 1 0 1 0 +0 0 1 0 0 +0 1 0 1 0 1 1 0 +1 0 0 0 1 +1 0 0 0 1 +1 0 0 0 1 +1 0 1 0 1 +1 0 1 0 1 +1 0 1 0 1 +0 1 0 1 0 +0 1 0 1 0 1 1 1 +1 0 0 0 1 +0 1 0 1 0 +0 1 0 1 0 +0 0 1 0 0 +0 1 0 1 0 +0 1 0 1 0 +1 0 0 0 1 +0 1 0 1 1 0 0 0 +1 0 0 0 1 +0 1 0 1 0 +0 1 0 1 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 0 1 0 0 +0 1 0 1 1 0 0 1 +1 1 1 1 1 +0 0 0 1 0 +0 0 0 1 0 +0 0 1 0 0 +0 1 0 0 0 +0 1 0 0 0 +1 1 1 1 1 +0 1 0 1 1 0 1 0 Added: trunk/client/lib/bytemark/README =================================================================== --- trunk/client/lib/bytemark/README (rev 0) +++ trunk/client/lib/bytemark/README 2008-01-28 10:45:30 UTC (rev 467) @@ -0,0 +1,66 @@ +February 18, 2003 +----------------- +Bug-fix release. + +December 9, 1997 +---------------- +This release is based on beta release 2 of BYTE Magazine's BYTEmark +benchmark program (previously known as BYTE's Native Mode +Benchmarks). This document covers the Native Mode (a.k.a. Algorithm +Level) tests; benchmarks designed to expose the capabilities of a +system's CPU, FPU, and memory system. + +Running a "make" will create the binary if all goes well. It is called +"nbench" and performs a suite of 10 tests and compares the results to +a Dell Pentium 90 with 16 MB RAM and 256 KB L2 cache running MSDOS and +compiling with the Watcom 10.0 C/C++ compiler. If you define -DLINUX +during compilation (the default) then you also get a comparison to an +AMD K6/233 with 32 MB RAM and 512 KB L2-cache running Linux 2.0.32 and +using a binary which was compiled with GNU gcc version 2.7.2.3 and GNU +libc-5.4.38. + +For more verbose output specify -v as an argument. + +The primary web site is: http://www.tux.org/~mayer/linux/bmark.html + +The port to Linux/Unix was done by Uwe F. 
Mayer <ma...@tu...>. + +The index-split was done by Andrew D. Balsa, and reflects the +realization that memory management is important in CPU design. The +original tests have been left alone, however, the tests NUMERIC SORT, +FP EMULATION, IDEA, and HUFFMAN now constitute the integer-arithmetic +focused benchmark index, while the tests STRING SORT, BITFIELD, and +ASSIGNMENT make up the new memory index. + +The algorithms were not changed from the source which was obtained +from the BYTE web site at http://www.byte.com/bmark/bmark.htm on +December 14, 1996. However, the source was modified to better work +with 64-bit machines (in particular the random number generator was +modified to always work with 32 bit, no matter what kind of hardware +you run it on). Furthermore, for some of the algorithms additional +resettings of the data was added to increase the consistency across +different hardware. Some extra debugging code was added, which has no +impact on normal runs. + +In case there is uneven system load due to other processes while this +benchmark suite executes, it might take longer to run than on an +unloaded system. This is because the benchmark does some statistical +analysis to make sure that the reported results are statistically +significant, and an increased variation in individual runs requires +more runs to achieve the required statistical confidence. + +This is a single-threaded benchmark and is not designed to measure the +performance gain on multi-processor machines. + +For details and customization read bdoc.txt. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Added: trunk/client/lib/bytemark/bdoc.txt =================================================================== --- trunk/client/lib/bytemark/bdoc.txt (rev 0) +++ trunk/client/lib/bytemark/bdoc.txt 2008-01-28 10:45:30 UTC (rev 467) @@ -0,0 +1,2109 @@ +http://www.byte.com/bmark/bmark.htm +---------------------------------------------------------------------------- + +BYTEmark + +---------------------------------------------------------------------------- + +This is release 2 of BYTE Magazine's BYTEmark benchmark program (previously +known as BYTE's Native Mode Benchmarks). This document covers the Native +Mode (a.k.a. Algorithm Level) tests; benchmarks designed to expose the +capabilities of a system's CPU, FPU, and memory system. Another group of +benchmarks within the BYTEmark suite includes the Application Simulation +Benchmarks. They are detailed in a separate document. [NOTE: The +documentation for the Application simulation benchmarks should appear before +the end of March, 95. -- RG]. + +The Tests + +The Native Mode portion of the BYTEmark consists of a number of well-known +algorithms; some BYTE has used before in earlier versions of the benchmark, +others are new. The complete suite consists of 10 tests: + +Numeric sort - Sorts an array of 32-bit integers. 
+ +String sort - Sorts an array of strings of arbitrary length. + +Bitfield - Executes a variety of bit manipulation functions. + +Emulated floating-point - A small software floating-point package. + +Fourier coefficients - A numerical analysis routine for calculating series +approximations of waveforms. + +Assignment algorithm - A well-known task allocation algorithm. + +Huffman compression - A well-known text and graphics compression algorithm. + +IDEA encryption - A relatively new block cipher algorithm. + +Neural Net - A small but functional back-propagation network simulator. + +LU Decomposition - A robust algorithm for solving linear equations. + +A more complete description of each test can be found in later sections of +this document. + +BYTE built the BYTEmark with the multiplatform world foremost in mind. There +were, of course, other considerations that we kept high on the list: + +Real-world algorithms. The algorithms should actually do something. Previous +benchmarks often moved gobs of bytes from one point to another, added or +subtracted piles and piles of numbers, or (in some cases) actually executed +NOP instructions. We should not belittle those tests of yesterday, they had +their place. However, we think it better that tests be based on activities +that are more complex in nature. + +Easy to port. All the benchmarks are written in "vanilla" ANSI C. This +provides us with the best chance of moving them quickly and accurately to +new processors and operating systems as they appear. It also simplifies +maintenance. + +This means that as new 64-bit (and, perhaps, 128-bit) processors appear, the +benchmarks can test them as soon as a compiler is available. + +Comprehensive. The algorithms were derived from a variety of sources. Some +are routines that BYTE had been using for some time. Others are routines +derived from well-known texts in the computer science world. Furthermore, +the algorithms differ in structure. Some simply "walk" sequentially through +one-dimensional arrays. Others build and manipulate two-dimensional arrays. +Finally, some benchmarks are "integer" tests, while others exercise the +floating-point coprocessor (if one is available). + +Scalable. We wanted these benchmarks to be useful across as wide a variety +of systems as possible. We also wanted to give them a lifetime beyond the +next wave of new processors. + +To that end, we incorporated "dynamic workload adjustment." A complete +description of this appears in a later section. In a nutshell, this allows +the tests to "expand or contract" depending on the capabilities of the +system under test, all the while providing consistent results so that fair +and accurate comparisons are possible. + +Honesty In Advertising + +We'd be lying if we said that the BYTEmark was all the benchmarking that +anyone would ever need to run on a system. It would be equally inaccurate to +suggest that the tests are completely free of inadequacies. There are many +things the tests do not do, there are shortcomings, and there are problems. + +BYTE will continue to improve the BYTEmark. The source code is freely +available, and we encourage vendors and users to examine the routines and +provide us with their feedback. In this way, we assure fairness, +comprehensiveness, and accuracy. + +Still, as we mentioned, there are some shortcomings. Here are those we +consider the most significant. Keep them in mind as you examine the results +of the benchmarks now and in the future. + +At the mercy of C compilers. 
Being written in ANSI C, the benchmark program +is highly portable. This is a reflection of the "world we live in." If this +were a one-processor world, we might stand a chance at hand-crafting a +benchmark in assembly language. (At one time, that's exactly what BYTE did.) +Not today, no way. + +The upshot is that the benchmarks must be compiled. For broadest coverage, +we selected ANSI C. And when they're compiled, the resulting executable's +performance can be highly dependent on the capabilities of the C compiler. +Today's benchmark results can be blown out of the water tomorrow if someone +new enters the scene with an optimizing strategy that outperforms existing +competition. + +This concern is not easily waved off. It will require you to keep careful +track of compiler version and optimization switches. As BYTE builds its +database of benchmark results, version number and switch setting will become +an integral part of that data. This will be true for published information +as well, so that you can make comparisons fairly and accurately. BYTE will +control the distribution of test results so that all relevant compiler +information is attached to the data. + +As a faint justification -- for those who think this situation results in +"polluted" tests -- we should point out that we are in the same boat as all +the other developers (at least, all those using C compilers -- and that's +quite a sizeable group). If the only C compilers for a given system happen +to be poor ones, everyone suffers. It's a fact that a given platform's +ultimate potential depends as much on the development software available as +on the technical achievements of the hardware design. + +It's just CPU and FPU. It's very tempting to try to capture the performance +of a machine in a single number. That has never been possible -- though it's +been tried a lot -- and the gap between that ideal and reality will forever +widen. + +These benchmarks are meant to expose the theoretical upper limit of the CPU, +FPU, and memory architecture of a system. They cannot measure video, disk, +or network throughput (those are the domains of a different set of +benchmarks). You should, therefore, use the results of these tests as part, +not all, of any evaluation of a system. + +Single threaded. Currently, each benchmark test uses only a single execution +thread. It's unlikely that you'll find any modern operating system that does +not have some multitasking component. How a system "scales" as more tasks +are run simultaneously is an effect that the current benchmarks cannot +explore. + +BYTE is working on a future version of the tests that will solve this +problem. + +The tests are synthetic. This quite reasonable argument is based on the fact +that people don't run benchmarks for a living, they run applications. +Consequently, the only true measure of a system is how well it performs +whatever applications you will be running. This, in fact, is the philosophy +behind the BAPCo benchmarks. + +This is not a point with which we would disagree. BYTE regularly makes use +of a variety of application benchmarks. None of this suggests, however, that +the BYTEmark benchmarks serve no purpose. + +BYTEmark's results should be used as predictors. They can be moved to a new +platform long before native applications will be ported. The BYTEmark +benchmarks will therefore provide an early look at the potential of the +machine. Additionally, the BYTEmark permits you to "home in" on an aspect of +the overall architecture. 
How well does the system perform when executing +floating-point computations? Does its memory architecture help or hinder the +management of memory buffers that may fall on arbitrary address boundaries? +How does the cache work with a program whose memory access favors moving +randomly through memory as opposed to moving sequentially through memory? + +The answers to these questions can give you a good idea of how well a system +would support a particular class of applications. Only a synthetic benchmark +can give the narrow view necessary to find the answers. + +Dynamic Workloads + +Our long history of benchmarking has taught us one thing above all others: +Tomorrow's system will go faster than today's by an amount exceeding your +wildest guess -- and then some. Dealing with this can become an unending +race. + +It goes like this: You design a benchmark algorithm, you specify its +parameters (how big the array is, how many loops, etc.), you run it on +today's latest super-microcomputer, collect your data, and go home. A new +machine arrives the next day, you run your benchmark, and discover that the +test executes so quickly that the resolution of the clock routine you're +using can't keep up with it (i.e., the test is over and done before the +system clock even has a chance to tick). + +If you modify your routine, the figures you collected yesterday are no good. +If you create a better clock routine by sneaking down into the system +hardware, you can kiss portability goodbye. + +The BYTEmark benchmarks solve this problem by a process we'll refer to as +"dynamic workload adjustment." In principle, it simply means that if the +test runs so fast that the system clock can't time it, the benchmark +increases the test workload -- and keeps increasing it -- until enough time +is consumed to gather reliable test results. + +Here's an example. + +The BYTEmark benchmarks perform timing using a "stopwatch" paradigm. The +routine StartStopwatch() begins timing; StopStopwatch() ends timing and +reports the elapsed time in clock ticks. Now, "clock ticks" is a value that +varies from system to system. We'll presume that our test system provides +1000 clock ticks per second. (We'll also presume that the system actually +updates its clock 1000 times per second. Surprisingly, some systems don't do +that. One we know of will tell you that the clock provides 100 ticks per +second, but updates the clock in 5- or 6-tick increments. The resolution is +no better than somewhere around 1/18th of a second.) Here, when we say +"system" we mean not only the computer system, but the environment provided +by the C compiler. Interestingly, different C compilers for the same system +will report different clock ticks per second. + +Built into the benchmarks is a global variable called GLOBALMINTICKS. This +variable is the minimum number of clock ticks that the benchmark will allow +StopStopwatch() to report. + +Suppose you run the Numeric Sort benchmark. The benchmark program will +construct an array filled with random numbers, call StartStopwatch(), sort +the array, and call StopStopwatch(). If the time reported in StopStopwatch() +is less than GLOBALMINTICKS, then the benchmark will build two arrays, and +try again. If sorting two arrays took less time than GLOBALMINTICKS, the +process repeats with more arrays. + +This goes on until the benchmark makes enough work so that an interval +between StartStopwatch() and StopStopwatch() exceeds GLOBALMINTICKS. 
Once +that happens, the test is actually run, and scores are calculated. + +Notice that the benchmark didn't make bigger arrays, it made more arrays. +That's because the time taken by the sort test does not increase linearly as +the array grows, it increases by a factor of N*log(N) (where N is the size +of the array). + +This principle is applied to all the benchmark tests. A machine with a less +accurate clock may be forced to sort more arrays at a time, but the results +are given in arrays per second. In this way fast machines, slow machines, +machines with accurate clocks, machines with less accurate clocks, can all +be tested with the same code. + +Confidence Intervals + +Another built-in feature of the BYTEmark is a set of statistical-analysis +routines. Running benchmarks is one thing; the question arises as to how +many times should a test be run until you know you have a good sampling. +Also, can you determine whether the test is stable (i.e., do results vary +widely from one execution of the benchmark to the next)? + +The BYTEmark keeps score as follows: Each test (a test being a numeric +sort, a string sort, etc.) is run five times. These five scores are +averaged, the standard deviation is determined, and a 95% confidence +half-interval for the mean is calculated (using the student t +distribution). This tells us that the true average lies -- with a 95% +probability -- within plus or minus the confidence half-interval of +the calculated average. If this half-interval is within 5% of the +calculated average, the benchmarking stops. Otherwise, a new test is +run and the calculations are repeated with all of the runs done so +far, including the new one. The benchmark proceeds this way up to a +total of 30 runs. If the length of the half-interval is still bigger +than 5% of the calculated average then a warning issued that the +results might not be statistically certain before the average is +displayed. + +** Fixed a statistical bug here. Uwe F. Mayer + +The upshot is that, for each benchmark test, the true average is -- with a +95% level of confidence -- within 5% of the average reported. Here, the +"true average" is the average we would get were we able to run the tests +over and over again an infinite number of times. + +This specification ensures that the calculation of results is controlled; +that someone running the tests in California will use the same technique for +determining benchmark results as someone running the tests in New York. + +In case there is uneven system load due to other processes while this +benchmark suite executes, it might take longer to run the benchmark suite +as compared to a run an unloaded system. This is because the benchmark does +some statistical analysis to make sure that the reported results are +statistically significant (as explained above), and a high variation in +individual runs requires more runs to achieve the required statistical +confidence. + +*** added last the paragraph, Uwe F. Mayer + +Interpreting Results + +Of course, running the benchmarks can present you with a boatload of data. +It can get mystifying, and some of the more esoteric statistical information +is valuable only to a limited audience. The big question is: What does it +all mean? + +First, we should point out that the BYTEmark reports both "raw" and indexed +scores for each test. The raw score for a particular test amounts to the +"iterations per second" of that test. 
For example, the numeric sort test +reports as its raw score the number of arrays it was able to sort per +second. + +The indexed score is the raw score of the system under test divided by the +raw score obtained on the baseline machine. As of this release, the +baseline machine is a DELL 90 Mhz Pentium XPS/90 with 16 MB of RAM and 256K +of external processor cache. (The compiler used was the Watcom C/C++ 10.0 +compiler; optimizations set to "fastest possible code", 4-byte structure +alignment, Pentium code generation with Pentium register-based calling. The +operating system was MSDOS.) The indexed score serves to "normalize" the +raw scores, reducing their dynamic range and making them easier to +grasp. Simply put, if your machine has an index score of 2.0 on the numeric +sort test, it performed that test twice as fast as this 90 Mhz Pentium. + +If you run all the tests (as you'll see, it is possible to perform "custom +runs", which execute only a subset of the tests) the BYTEmark will also +produce two overall index figures: Integer index and Floating-point index. +The Integer index is the geometric mean of those tests that involve only +integer processing -- numeric sort, string sort, bitfield, emulated +floating-point, assignment, Huffman, and IDEA -- while the Floating-point +index is the geometric mean of those tests that require the floating-point +coprocessor -- Fourier, neural net, and LU decomposition. You can use these +scores to get a general feel for the performance of the machine under test +as compared to the baseline 90 Mhz Pentium. + +The Linux/Unix port has a second baseline machine, it is an AMD K6/233 with +32 MB RAM and 512 KB L2-cache running Linux 2.0.32 and using GNU gcc +version 2.7.2.3 and libc-5.4.38. The integer index was split as suggested +by Andrew D. Balsa <and...@us...>, and reflects the realization that +memory management is important in CPU design. The original tests have been +left alone, however, the geometric mean of the tests NUMERIC SORT, FP +EMULATION, IDEA, and HUFFMAN now constitutes the integer-arithmetic focused +benchmark index, while the geometric mean of the tests STRING SORT, +BITFIELD, and ASSIGNMENT makes up the new memory index. The floating point +index has been left alone, it is still the geometric mean of FOURIER, +NEURAL NET, and LU DECOMPOSITION. + +*** added the section on Linux, Uwe F. Mayer + +What follows is a list of the benchmarks and associated brief remarks that +describe what the tests do: What they exercise; what a "good" result or a +"bad" result means. Keep in mind that, in this expanding universe of faster +processors, bigger caches, more elaborate memory architectures, "good" and +"bad" are indeed relative terms. A good score on today's hot new processor +will be a bad score on tomorrow's hot new processor. + +These remarks are based on empirical data and profiling that we have done to +date. (NOTE: The profiling is limited to Intel and Motorola 68K on this +release. As more data is gathered, we will be refining this section. +3/14/95--RG) + +Benchmark Description + +Numeric sort Generic integer performance. Should + exercise non-sequential performance + of cache (or memory if cache is less + than 8K). Moves 32-bit longs at a + time, so 16-bit processors will be + at a disadvantage. + + + +String sort Tests memory-move performance. + Should exercise non-sequential + performance of cache, with added + burden that moves are byte-wide and + can occur on odd address boundaries. 
+ May tax the performance of + cell-based processors that must + perform additional shift operations + to deal with bytes. + + + +Bitfield Exercises "bit twiddling" + performance. Travels through memory + in a somewhat sequential fashion; + different from sorts in that data is + merely altered in place. If + properly compiled, takes into + account 64-bit processors, which + should see a boost. + + + +Emulated F.P. Past experience has shown this test + to be a good measurement of overall + performance. + + + +Fourier Good measure of transcendental and + trigonometric performance of FPU. + Little array activity, so this test + should not be dependent of cache or + memory architecture. + + + +Assignment The test moves through large integer + arrays in both row-wise and + column-wise fashion. Cache/memory + with good sequential performance + should see a boost (memory is + altered in place -- no moving as in + a sort operation). Processing is + done in 32-bit chunks -- no + advantage given to 64-bit + processors. + + + +Huffman A combination of byte operations, + bit twiddling, and overall integer + manipulation. Should be a good + general measurement. + + + +IDEA Moves through data sequentially in + 16-bit chunks. Should provide a + good indication of raw speed. + + + +Neural Net Small-array floating-point test + heavily dependent on the exponential + function; less dependent on overall + FPU performance. Small arrays, so + cache/memory architecture should not + come into play. + + + +LU decomposition. A floating-point test that moves + through arrays in both row-wise and + column-wise fashion. Exercises only + fundamental math operations (+, -, + *, /). + +The Command File + +Purpose + +The BYTEmark program allows you to override many of its default parameters +using a command file. The command file also lets you request statistical +information, as well as specify an output file to hold the test results for +later use. + +You identify the command file using a command-line argument. E.G., + +C:NBENCH -cCOMFILE.DAT + +tells the benchmark program to read from COMFILE.DAT in the current +directory. + +The content of the command file is simply a series of parameter names and +values, each on a single line. The parameters control internal variables +that are either global in nature (i.e., they effect all tests in the +program) or are specific to a given benchmark test. + +The parameters are listed in a reference guide that follows, arranged in the +following groups: + +Global Parameters + +Numeric Sort + +String Sort + +Bitfield + +Emulated floating-point + +Fourier coefficients + +Assignment algorithm + +IDEA encryption + +Huffman compression + +Neural net + +LU decomposition + +As mentioned above, those items listed under "Global Parameters" affect all +tests; the rest deal with specific benchmarks. There is no required ordering +to parameters as they appear in the command file. You can specify them in +any sequence you wish. + +You should be judicious in your use of a command file. Some parameters will +override the "dynamic workload" adjustment that each test performs. Doing +this completely bypasses the benchmark code that is designed to produce an +accurate reading from your system clock. Other parameters will alter default +settings, yielding test results that cannot be compared with published +benchmark results. 
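Since the command file is nothing more than NAME=value pairs, one per line and in any order, reading it is straightforward. The fragment below is only an illustrative sketch of that format, written against the parameters documented in the reference that follows; the struct, its field names, and the parse_line() helper are invented for this example and are not the code nbench itself uses.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct bench_opts {
    long global_min_ticks;   /* GLOBALMINTICKS */
    int  min_seconds;        /* MINSECONDS, default 5 per the reference below */
    int  all_stats;          /* ALLSTATS=T|F */
    int  custom_run;         /* CUSTOMRUN=T|F */
    char outfile[256];       /* OUTFILE=<path>, no quotes */
};

static void parse_line(struct bench_opts *o, char *line)
{
    char *eq, *name, *value;

    eq = strchr(line, '=');
    if (eq == NULL)
        return;                             /* skip lines without NAME=value form */
    *eq = '\0';
    name  = line;
    value = eq + 1;
    value[strcspn(value, "\r\n")] = '\0';   /* strip the trailing newline */

    if (strcmp(name, "GLOBALMINTICKS") == 0)
        o->global_min_ticks = atol(value);
    else if (strcmp(name, "MINSECONDS") == 0)
        o->min_seconds = atoi(value);
    else if (strcmp(name, "ALLSTATS") == 0)
        o->all_stats = (value[0] == 'T');
    else if (strcmp(name, "CUSTOMRUN") == 0)
        o->custom_run = (value[0] == 'T');
    else if (strcmp(name, "OUTFILE") == 0) {
        strncpy(o->outfile, value, sizeof(o->outfile) - 1);
        o->outfile[sizeof(o->outfile) - 1] = '\0';
    }
    /* the DOxxx, xxxMINSECONDS and size/loop overrides follow the same pattern */
}

int main(int argc, char **argv)
{
    struct bench_opts opts = { 0L, 5, 0, 0, "" };
    char buf[512];
    FILE *fp;

    if (argc < 2 || (fp = fopen(argv[1], "r")) == NULL)
        return 1;
    while (fgets(buf, sizeof(buf), fp) != NULL)
        parse_line(&opts, buf);
    fclose(fp);
    printf("ALLSTATS=%c CUSTOMRUN=%c MINSECONDS=%d\n",
           opts.all_stats ? 'T' : 'F',
           opts.custom_run ? 'T' : 'F',
           opts.min_seconds);
    return 0;
}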
+ +A Sample Command File + +Suppose you built a command file that contained the following: + +ALLSTATS=T + +CUSTOMRUN=T + +OUTFILE=D:\DATA.DAT + +DONUMSORT=T + +DOLU=T + +Here's what this file tells the benchmark program: + +ALLSTATS=T means that you've requested a "dump" of all the statistics the +test gathers. This includes not only the standard deviations of tests run, +it also produces test-specific information such as the number of arrays +built, the array size, etc. + +CUSTOMRUN=T tells the system that this is a custom run. Only tests +explicitly specified will be executed. + +OUTFILE=D:\DATA.DAT will write the output of the benchmark to the file +DATA.DAT on the root of the D: drive. (If DATA.DAT already exists, output +will be appended to the file.) + +DONUMSORT=T tells the system to run the numeric sort benchmark. (This was +necessary on account of the CUSTOMRUN=T line, above.) + +DOLU=T tells the system to run the LU decomposition benchmark. + +Command File Parameters Reference + +(NOTE: Altering some global parameters can invalidate results for comparison +purposes. Those parameters are indicated in the following section by a bold +asterisk (*). If you alter any parameters so indicated, you may NOT publish +the resulting data as BYTEmark scores.) + +Global Parameters + +GLOBALMINTICKS=<n> + +This overrides the default global_min_ticks value (defined in NBENCH1.H). +The global_min_ticks value is defined as the minimum number of clock ticks +per iteration of a particular benchmark. For example, if global_min_ticks is +set to 100 and the numeric sort benchmark is run; each iteration MUST take +at least 100 ticks, or the system will expand the work-per-iteration. + +MINSECONDS=<n> + +Sets the minimum number of seconds any particular test will run. This has +the effect of controlling the number of repetitions done. Default: 5. + +ALLSTATS=<T|F> + +Set this flag to T for a "dump" of all statistics. The information displayed +varies from test to test. Default: F. + +OUTFILE=<path> + +Specifies that output should go to the specified output file. Any test +results and statistical data displayed on-screen will also be written to the +file. If the file does not exist, it will be created; otherwise, new output +will be appended to an existing file. This allows you to "capture" several +runs into a single file for later review. + +Note: the path should not appear in quotes. For example, something like the +following would work: OUTFILE=C:\BENCH\DUMP.DAT + +CUSTOMRUN=<T|F> + +Set this flag to T for a custom run. A "custom run" means that the program +will run only the benchmark tests that you explicitly specify. So, use this +flag to run a subset of the tests. Default: F. + +Numeric Sort + +DONUMSORT=<T|F> + +Indicates whether to do the numeric sort. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case default is F. + +NUMNUMARRAYS=<n> + +Indicates the number of numeric arrays the system will build. Setting this +value will override the program's "dynamic workload" adjustment for this +test.* + +NUMARRAYSIZE=<n> + +Indicates the number of elements in each numeric array. Default is 8001 +entries. (NOTE: Altering this value will invalidate the test for comparison +purposes. The performance of the numeric sort test is not related to the +array size as a linear function; i.e., an array twice as big will not take +twice as long. The relationship involves a logarithmic function.)* + +NUMMINSECONDS=<n> + +Overrides MINSECONDS for the numeric sort test. 
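Tying the global and numeric sort parameters together, a custom run that exercises only the numeric sort, with full statistics and a longer minimum runtime, could use a command file such as the following (the values are arbitrary illustrations, not recommended settings):

CUSTOMRUN=T
ALLSTATS=T
OUTFILE=C:\BENCH\NUMSORT.DAT
DONUMSORT=T
NUMMINSECONDS=10

NUMARRAYSIZE and NUMNUMARRAYS are deliberately left at their defaults here so that the results remain comparable with published BYTEmark scores.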
+ +String Sort + +DOSTRINGSORT=<T|F> + +Indicates whether to do the string sort. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +STRARRAYSIZE=<n> + +Sets the size of the string array. Default is 8111. (NOTE: Altering this +value will invalidate the test for comparison purposes. The performance of +the string sort test is not related to the array size as a linear function; +i.e., an array twice as big will not take twice as long. The relationship +involves a logarithmic function.)* + +NUMSTRARRAYS=<n> + +Sets the number of string arrays that will be created to run the test. +Setting this value will override the program's "dynamic workload" adjustment +for this test.* + +STRMINSECONDS=<n> + +Overrides MINSECONDS for the string sort test. + +Bitfield + +DOBITFIELD=<T|F> + +Indicates whether to do the bitfield test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +NUMBITOPS=<n> + +Sets the number of bitfield operations that will be performed. Setting this +value will override the program's "dynamic workload" adjustment for this +test.* + +BITFIELDSIZE=<n> + +Sets the number of 32-bit elements in the bitfield arrays. The default value +is dependent on the size of a long as defined by the current compiler. For a +typical compiler that defines a long to be 32 bits, the default is 32768. +(NOTE: Altering this parameter will invalidate test results for comparison +purposes.)* + +BITMINSECONDS=<n> + +Overrides MINSECONDS for the bitfield test. + +Emulated floating-point + +DOEMF=<T|F> + +Indicates whether to do the emulated floating-point test. Default is T, +unless this is a custom run (CUSTOMRUN=T), in which case the default is F. + +EMFARRAYSIZE=<n> + +Sets the size (number of elements) of the emulated floating-point benchmark. +Default is 3000. The test builds three arrays, each of equal size. This +parameter sets the number of elements for EACH array. (NOTE: Altering this +parameter will invalidate test results for comparison purposes.)* + +EMFLOOPS=<n> + +Sets the number of loops per iteration of the floating-point test. Setting +this value will override the program's "dynamic workload" adjustment for +this test.* + +EMFMINSECONDS=<n> + +Overrides MINSECONDS for the emulated floating-point test. + +Fourier coefficients + +DOFOUR=<T|F> + +Indicates whether to do the Fourier test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +FOURASIZE=<n> + +Sets the size of the array for the Fourier test. This sets the number of +coefficients the test will derive. NOTE: Specifying this value will override +the system's "dynamic workload" adjustment for this test, and may make the +results invalid for comparison purposes.* + +FOURMINSECONDS=<n> + +Overrides MINSECONDS for the Fourier test. + +Assignment Algorithm + +DOASSIGN=<T|F> + +Indicates whether to do the assignment algorithm test. Default is T, unless +this is a custom run (CUSTOMRUN=T), in which case the default is F. + +ASSIGNARRAYS=<n> + +Indicates the number of arrays that will be built for the test. Specifying +this value will override the system's "dynamic workload" adjustment for this +test. (NOTE: The size of the arrays in the assignment algorithm is fixed at +101 x 101. Altering the array size requires adjusting global constants and +recompiling; to do so, however, would invalidate test results.)* + +ASSIGNMINSECONDS=<n> + +Overrides MINSECONDS for the assignment algorithm test. 
+ +IDEA encryption + +DOIDEA=<T|F> + +Indicates whether to do the IDEA encryption test. Default is T, unless this +is a custom run (CUSTOMRUN=T), in which case the default is F. + +IDEAARRAYSIZE=<n> + +Sets the size of the plain-text character array that will be encrypted by the +test. Default is 4000. The benchmark actually builds 3 arrays: 1st +plain-text, encrypted version, and 2nd plain-text. The 2nd plain-text array is +the destination for the decryption process [part of the test]. All arrays +are set to the same size. (NOTE: Specifying this value will invalidate test +results for comparison purposes.)* + +IDEALOOPS=<n> + +Indicates the number of loops in the IDEA test. Specifying this value will +override the system's "dynamic workload" adjustment for this test.* + +IDEAMINSECONDS=<n> + +Overrides MINSECONDS for the IDEA test. + +Huffman compression + +DOHUFF=<T|F> + +Indicates whether to do the Huffman test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +HUFFARRAYSIZE=<n> + +Sets the size of the string buffer that will be compressed using the Huffman +test. The default is 5000. (NOTE: Altering this value will invalidate test +results for comparison purposes.)* + +HUFFLOOPS=<n> + +Sets the number of loops in the Huffman test. Specifying this value will +override the system's "dynamic workload" adjustment for this test.* + +HUFFMINSECONDS=<n> + +Overrides MINSECONDS for the Huffman test. + +Neural net + +DONNET=<T|F> + +Indicates whether to do the Neural Net test. Default is T, unless this is a +custom run (CUSTOMRUN=T), in which case the default is F. + +NNETLOOPS=<n> + +Sets the number of loops in the Neural Net test. NOTE: Altering this value +overrides the benchmark's "dynamic workload" adjustment algorithm, and may +invalidate the results for comparison purposes.* + +NNETMINSECONDS=<n> + +Overrides MINSECONDS for the Neural Net test. + +LU decomposition + +DOLU=<T|F> + +Indicates whether to do the LU decomposition test. Default is T, unless this +is a custom run (CUSTOMRUN=T), in which case the default is F. + +LUNUMARRAYS=<n> + +Sets the number of arrays in each iteration of the LU decomposition test. +Specifying this value will override the system's "dynamic workload" +adjustment for this test.* + +LUMINSECONDS=<n> + +Overrides MINSECONDS for the LU decomposition test. + +Numeric Sort + +Description + +This benchmark is designed to explore how well the system sorts a numeric +array. In this case, a numeric array is a one-dimensional collection of +signed, 32-bit integers. The actual sorting is performed by a heapsort +algorithm (see the text box following for a description of the heapsort +algorithm). + +It's probably unnecessary to point out (but we'll do it anyway) that sorting +is a fundamental operation in computer application software. You'll likely +find sorting routines nestled deep inside a variety of applications; +everything from database systems to operating-systems kernels. + +The numeric sort benchmark reports the number of arrays it was able to sort +per second. The array size is set by a global constant (it can be overridden +by the command file -- see below). + +Analysis + +Optimized 486 code: Profiling of the numeric sort benchmark using Watcom's +profiler (Watcom C/C++ 10.0) indicates that the algorithm spends most of its +time in the numsift() function (specifically, about 90% of the benchmark's +time takes place in numsift()). 
Within numsift(), two if statements dominate +time spent: + +if(array[k]<array[k+1L]) and if(array[i]<array[k]) + +Both statements involve indexes into arrays, so it's likely the processor is +spending a lot of time resolving the array references. (Though both +statements involve "less-than" comparisons, we doubt that much time is +consumed in performing the signed compare operation.) Though the first +statement involves array elements that are adjacent to one another, the +second does not. In fact, the second statement will probably involve +elements that are far apart from one another during early passes through the +sifting process. We expect that systems whose caching system pre-fetches +contiguous elements (often in "burst" line fills) will not have any great +advantage of systems without pre-fetch mechanisms. + +Similar results were found when we profiled the numeric sort algorithm under +the Borland C/C++ compiler. + +680x0 Code (Macintosh CodeWarrior): CodeWarrior's profiler is function +based; consequently, it does not allow for line-by-line analysis as does the +Watcom compiler's profiler. + +However, the CodeWarrior profiler does give us enough information to note +that NumSift() only accounts for about 28% of the time consumed by the +benchmark. The outer routine, NumHeapSort() accounts for around 71% of the +time taken. It will require additional analysis to determine why the two +compilers -- Watcom and CodeWarrior divide the workload so differently. (It +may have something to do with compiler architecture, or the act of profiling +the code may produce results that are significantly different than how the +program runs under normal conditions, though that would lead one to wonder +what use profilers would be.) + +Porting Considerations + +The numeric sort routine should represent a trivial porting exercise. It is +not an overly large benchmark in terms of source code. Additionally, the +only external routines it calls on are for allocating and releasing memory, +and managing the stopwatch. + +The numeric sort benchmark depends on the following global definitions (note +that these may be overridden by the command file): + +NUMNUMARRAYS -- Sets the upper limit on the number of arrays that the +benchmark will attempt to build. The numeric sort benchmark creates work for +itself by requiring the system to sort more and more arrays...not bigger and +bigger arrays. (The latter case would skew results, because the sorting time +for heapsort is N log2 N - e.g., doubling the array size does not double the +sort time.) This constant sets the upper limit to the number of arrays the +system will build before it signals an error. The default value is 100, and +may be changed if your system exceeds this limit. + +NUMARRAYSIZE - Determines the size of each array built. It has been set to +8111L and should not be tampered with. The command file entry +NUMARRAYSIZE=<n> can be used to change this value, but results produced by +doing this will make your results incompatible with other runs of the +benchmark (since results will be skewed -- see preceding paragraph). + +To test for a correct execution of the numeric sort benchmark, #define the +DEBUG symbol. This will enable code that verifies that arrays are properly +sorted. You should run the benchmark program using a command file that has +only the numeric sort test enabled. 
If there is an error, the program will +display "SORT ERROR" (If this happens, it's possible that tons of "SORT +ERROR" messages will be emitted, so it's best not to redirect output to a +file), otherwise it will print "Numeric sort: OK" (also quite a few times). + +References + +Gonnet, G.H. 1984, Handbook of Algorithms and Data Structures (Reading, MA: +Addison-Wesley). + +Knuth, Donald E. 1968, Fundamental Algorithms, vol 1 of The Art of Computer +Programming (Reading, MA: Addison-Wesley). + +Press, William H., Flannery, Brian P., Teukolsky, Saul A., and Vetterling, +William T. 1989, Numerical Recipes in Pascal (Cambridge: Cambridge +University Press). + +Heapsort + +The heapsort algorithm is well-covered in a number of the popular +computer-science textbooks. In fact, it gets a pat on the back in Numerical +Recipes (Press et. al.), where the authors write: + +Heapsort is our favorite sorting routine. It can be recommended +wholeheartedly for a variety of sorting applications. It is a true +"in-place" sort, requiring no auxiliary storage. + +Heapsort works by building the array into a kind of a queue called a heap. +You can imagine this heap as being a form of in-memory binary tree. The +topmost (root) element of the tree is the element that -- were the array +sorted -- would be the largest element in the array. Sorting takes place by +first constructing the heap, then pulling the root off the tree, promoting +the next largest element to the root, pulling it off, and so on. (The +promotion process is known as "sifting up.") + +Heapsort executes in N log2 N time even in its worst case. Unlike some other +sorting algorithms, it does not benefit from a partially sorted array +(though Gonnet does refer to a variation of heapsort, called "smoothsort," +which does -- see references). + +String Sort + +Description + +This benchmark is designed to gauge how well the system moves bytes around. +By that we mean, how well the system can copy a string of bytes from one +location to another; source and destination being aligned to arbitrary +addresses. (This is unlike the numeric sort array, which moves bytes +longword-at-a-time.) The strings themselves are built so as to be of random +length, ranging from no fewer than 4 bytes and no greater than 80 bytes. The +mixture of random lengths means that processors will be forced to deal with +strings that begin and end on arbitrary address boundaries. + +The string sort benchmark uses the heapsort algorithm; this is the same +algorithm as is used in the numeric sort benchmark (see the sidebar on the +heapsort for a detailed description of the algorithm). + +Manipulation of the strings is actually handled by two arrays. One array +holds the strings themselves; the other is a pointers array. Each member of +the pointers array carries an offset that points into the string array, so +that the ith pointer carries the offset to the ith string. This allows the +benchmark to rapidly locate the position of the ith string. (The sorting +algorithm requires exchanges of items that might be "distant" from one +another in the array. It's critical that the routine be able to rapidly find +a string based on its indexed position in the array.) + +The string sort benchmark reports the number of string arrays it was able to +sort per second. The size of the array is set by a global constant. + +Analysis + +Optimized 486 code (Watcom C/C++ 10.0): Profiling of the string sort +benchmark indicates that it spends most of its time in the C library routine +memmove(). 
Within that routine, most of the execution is consumed by a pair +of instructions: rep movsw and rep movsd. These are repeated string move -- +word width and repeated string move -- doubleword width, respectively. + +This is precisely where we want to see the time spent. It's interesting to +note that the memmove() of the particular compiler/profiler tested (Watcom +C/C++ 10.0) was "smart" enough to do most of the moving on word or +doubleword boundaries. The string sort benchmark specifically sets arbitrary +boundaries, so we'd expect to see lots of byte-wide moves. The "smart" +memmove() is able to move bytes only when it has to, and does the remainder +of the work via words and doublewords (which can move more bits at a time). + +680x0 Code (Macintosh CodeWarrior): Because CodeWarrior's profiler is +function based, it is impossible to get an idea of how much time the test +spends in library routines such as memmove(). Fortunately, as an artifact of +the early version of the benchmark, the string sort algorithm makes use of +the MoveMemory() routine in the sysspec.c file (system specific routines). +This call, on anything other than a 16-bit DOS system, calls memmove() +directly. Hence, we can get a good approximation of how much time is spent +moving bytes. + +The answer is that nearly 78% of the benchmark's time is consumed by +MoveMemory(), the rest being taken up by the other routines (the +str_is_less() routine, which performs string comparisons, takes about 7% of +the time). As above, we can guess that most of the benchmark's time is +dependent on the performance of the library's memmove() routine. + +Porting Considerations + +As with the numeric sort routine, the string sort benchmark should be simple +to port. Simpler, in fact. The string sort benchmark routine is not +dependent on any typedef that may change from machine to machine (unless a +char type is not 8 bits). + +The string sort benchmark depends on the following global definitions: + +NUMSTRARRAYS - Sets the upper limit on the number of arrays that the +benchmark will attempt to build. The string sort benchmark creates work for +itself by requiring the system to sort more and more arrays, not bigger and +bigger arrays. (See section on Numeric Sort for an explanation.) This +constant sets the upper limit to the number of arrays the system will build +before it signals an error. The default value is 100, and may be changed if +your system exceeds this limit. + +STRARRAYSIZE - Sets the default size of the string arrays built. We say +"arrays" because, as with the numeric sort benchmark, the system adds work +not by expanding the size of the array, but by adding more arrays. This +value is set to 8111, and should not be modified, since results would not be +comparable with other runs of the same benchmark on other machines. + +To test for a correct execution of the string sort benchmark, #define +the DEBUG symbol. This will enable code that verifies the arrays are +properly sorted. Set up a command file that runs only the string sort, +and execute the benchmark program. If the routine is operating +properly, the benchmark will print "String sort: OK", this message is +printed quite often. Otherwise, the program will display "SORT ERROR" +for each pair of strings it finds out of order (which can be really +often). + +References + +See the references for the Numeric Sort benchmark. 
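Both sort benchmarks above rest on the same heapsort routine, so a code companion to the heapsort sidebar may help. The version below is a generic textbook sift-down formulation over plain ints, shown only to make the prose concrete; it is not the benchmark's own NumHeapSort()/numsift() source, which operates on 32-bit longs (and, in the string sort, works through the offset array described earlier).

static void sift_down(int a[], int root, int end)
{
    int child, tmp;

    while (2 * root + 1 <= end) {
        child = 2 * root + 1;
        if (child + 1 <= end && a[child] < a[child + 1])
            child++;                    /* take the larger of the two children */
        if (a[root] >= a[child])
            return;                     /* heap property already holds */
        tmp = a[root]; a[root] = a[child]; a[child] = tmp;
        root = child;                   /* keep sifting downward */
    }
}

static void heap_sort(int a[], int n)
{
    int start, end, tmp;

    /* build the heap: after this loop a[0] holds the largest element */
    for (start = n / 2 - 1; start >= 0; start--)
        sift_down(a, start, n - 1);

    /* repeatedly pull the root off, shrink the heap, and restore the heap */
    for (end = n - 1; end > 0; end--) {
        tmp = a[0]; a[0] = a[end]; a[end] = tmp;
        sift_down(a, 0, end - 1);
    }
}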
+ +Bitfield Operations + +Description + +The purpose of this benchmark is to explore how efficiently the system +executes operations that deal with "twiddling bits." The test is set up to +simulate a "bit map"; a data structure used to keep track of storage usage. +(Don't confuse this meaning of "bitmap" with its use in describing a +graphics data structure.) + +Systems often use bit maps to keep an inventory of memory blocks or (more +frequently) disk blocks. In the case of a bit map that manages disk usage, +an operating system will set aside a buffer in memory so that each bit in +that buffer corresponds to a block on the disk drive. A 0 bit means that the +corresponding block is free; a 1 bit means the block is in use. Whenever a +file requests a new block of disk storage, the operating system searches the +bit map for the first 0 bit, sets the bit (to indicate that the block is now +spoken for), and returns the number of the corresponding disk block to the +requesting file. + +These types of operations are precisely what this test simulates. A block of +memory is set allocated for the bit map. Another block of memory is +allocated, and set up to hold a series of "bit map commands". Each bitmap +command tells the simulation to do 1 of 3 things: + +1) Clear a series of consecutive bits, + +2) Set a series of consecutive bits, or + +3) Complement (1->0 and 0->1) a series of consecutive bits. + +The bit map command block is loaded with a set of random bit map commands +(each command covers an random number of bits), and simulation routine steps +sequentially through the command block, grabbing a command and executing it. + +The bitfield benchmark reports the number of bits it was able to operate on +per second. The size of the bit map is constant; the bitfield operations +array is adjusted based on the capabilities of the processor. (See the +section describing the auto-adjust feature of the benchmarks.) + +Analysis + +Optimized 486 code: Using the Watcom C/C++ 10.0 profiler, the Bitfield +benchmark appears to spend all of its time in two routines: ToggleBitRun() +(74% of the time) and DoBitFieldIteration() (24% of the time). We say +"appears" because this is misleading, as we will explain. + +First, it is important to recall that the test performs one of three +operations for each run of bits (see above). The routine ToggleBitRun() +handles two of those three operations: setting a run of bits and clearing a +run of bits. An if() statement inside ToggleBitRun() decides which of the +two operations is performed. (Speed freaks will quite rightly point out that +this slows the entire algorithm. ToggleBitRun() is called by a switch() +statement which has already decided whether bits should be set or cleared; +it's a waste of time to have ToggleBitRun() have to make that decision yet +again.) + +DoBitFieldIteration() is the "outer" routine that calls ToggleBitRun(). +DoBitFieldIt... [truncated message content] |