From: Nikodemus Siivola <demoss@us...>  20110805 13:13:52

The branch "master" has been updated in SBCL: via 7ab4cdd5eaf3dce3cc596b348bfc98aaa27469d5 (commit) from fb15ad0ff2373a50b3b0717f705c49339b39f996 (commit)  Log  commit 7ab4cdd5eaf3dce3cc596b348bfc98aaa27469d5 Author: Lutz Euler <lutz.euler@...> Date: Mon Jul 25 02:47:43 2011 +0200 Optimize integer division by a constant in several cases. Convert integer division by a constant into multiplication to gain a large speedup as the machine instructions for multiplication are typically executed much faster than those for division. This is implemented using a deftransform on TRUNCATE that triggers if the dividend is known to fit in an unsigned machine word and if the divisor is a constant, also fitting in an unsigned machine word. (The cases that are optimized by other existing transforms, for example if the divisor is a power of two, are left to these transforms.) The replacement code is based on a widening multiply (that is already available as bignum calculations need it) and possibly some shifts and an addition to calculate the quotient. If the remainder is needed, additionally a (normal) multiplication and a subtraction are generated. As several other integer division operations are implemented using TRUNCATE, this also affects CEILING, FLOOR, MOD and REM with the same argument types. CEILING and FLOOR, however, are optimized only when SAFETY=0 since they are declared MAYBEINLINE.  NEWS  2 + src/compiler/srctran.lisp  106 +++++++++++++++++++++++++++++++++++++++++++++ tests/arith.pure.lisp  64 +++++++++++++++++++++++++++ 3 files changed, 172 insertions(+), 0 deletions() diff git a/NEWS b/NEWS index 61e48f0..82ae01e 100644  a/NEWS +++ b/NEWS @@ 4,6 +4,8 @@ changes relative to sbcl1.0.50: and probe counts on Linux. * enhancement: building 32bit SBCL on Linux/x8664 now works without a chroot. (Use "SBCL_ARCH=x86 sh make.sh" to build.) + * optimization: unsigned integer divisions by a constant are implemented + using multiplication (affects CEILING, FLOOR, TRUNCATE, MOD, and REM.) * bug fix: correct RIP offset calculation in SSE comparison and shuffle instructions. (lp#814688) * bug fix: COERCE to unfinalized extended sequence classes now works. diff git a/src/compiler/srctran.lisp b/src/compiler/srctran.lisp index ce1ac5a..1379d9f 100644  a/src/compiler/srctran.lisp +++ b/src/compiler/srctran.lisp @@ 3264,6 +3264,112 @@ `(if (minusp x) ( (logand ( x) ,mask)) (logand x ,mask))))) + +;;; Return an expression to calculate the integer quotient of X and +;;; constant Y, using multiplication, shift and add/sub instead of +;;; division. Both arguments must be unsigned, fit in a machine word and +;;; Y must neither be zero nor a power of two. The quotient is rounded +;;; towards zero. +;;; The algorithm is taken from the paper "Division by Invariant +;;; Integers using Multiplication", 1994 by TorbjÃ¶rn Granlund and Peter +;;; L. Montgomery, Figures 4.2 and 6.2, modified to exclude the case of +;;; division by powers of two. +;;; The following two examples show an average case and the worst case +;;; with respect to the complexity of the generated expression, under +;;; a word size of 64 bits: +;;; +;;; (UNSIGNEDDIVTRANSFORMER 10) > +;;; (ASH (%MULTIPLY (ASH X 0) 14757395258967641293) 3) +;;; +;;; (UNSIGNEDDIVTRANSFORMER 7) > +;;; (LET* ((NUM X) +;;; (T1 (%MULTIPLY NUM 2635249153387078803))) +;;; (ASH (LDB (BYTE 64 0) +;;; (+ T1 (ASH (LDB (BYTE 64 0) +;;; ( NUM T1)) +;;; 1))) +;;; 2)) +;;; +(defun genunsigneddivbyconstantexpr (y) + (declare (type (integer 3 #.mostpositiveword) y)) + (aver (not (zerop (logand y (1 y))))) + (labels ((ld (x) + ;; the floor of the binary logarithm of (positive) X + (integerlength (1 x))) + (choosemultiplier (y precision) + (do* ((l (ld y)) + (shift l (1 shift)) + (expt2n+l (expt 2 (+ sb!vm:nwordbits l))) + (mlow (truncate expt2n+l y) (ash mlow 1)) + (mhigh (truncate (+ expt2n+l + (ash expt2n+l ( precision))) + y) + (ash mhigh 1))) + ((not (and (< (ash mlow 1) (ash mhigh 1)) + (> shift 0))) + (values mhigh shift))))) + (let ((n (expt 2 sb!vm:nwordbits)) + (shift1 0)) + (multiplevaluebind (m shift2) + (choosemultiplier y sb!vm:nwordbits) + (when (and (>= m n) (evenp y)) + (setq shift1 (ld (logand y ( y)))) + (multiplevaluesetq (m shift2) + (choosemultiplier (/ y (ash 1 shift1)) + ( sb!vm:nwordbits shift1)))) + (if (>= m n) + (flet ((wordmod (x) + `(ldb (byte #.sb!vm:nwordbits 0) ,x))) + `(let* ((num x) + (t1 (%multiply num ,( m n)))) + (ash ,(wordmod `(+ t1 (ash ,(wordmod `( num t1)) + 1))) + ,( 1 shift2)))) + `(ash (%multiply (ash x ,( shift1)) ,m) + ,( shift2))))))) + +;;; If the divisor is constant and both args are positive and fit in a +;;; machine word, replace the division by a multiplication and possibly +;;; some shifts and an addition. Calculate the remainder by a second +;;; multiplication and a subtraction. Dead code elimination will +;;; suppress the latter part if only the quotient is needed. If the type +;;; of the dividend allows to derive that the quotient will always have +;;; the same value, emit much simpler code to handle that. (This case +;;; may be rare but it's easy to detect and the compiler doesn't find +;;; this optimization on its own.) +(deftransform truncate ((x y) ((unsignedbyte #.sb!vm:nwordbits) + (constantarg + (unsignedbyte #.sb!vm:nwordbits))) + * + :policy (and (> speed compilationspeed) + (> speed space))) + "convert integer division to multiplication" + (let ((y (lvarvalue y))) + ;; Division by zero, one or powers of two is handled elsewhere. + (when (zerop (logand y (1 y))) + (giveupir1transform)) + ;; The compiler can't derive the result types to maximal tightness + ;; from the transformed expression, so we calculate them here and + ;; add the corresponding specifiers explicitly through TRULYTHE. + ;; This duplicates parts of the TRUNCATE DERIVETYPE optimizer but + ;; using that here would be too cumbersome. + (let* ((xtype (lvartype x)) + (xlow (or (and (numerictypep xtype) + (numerictypelow xtype)) + 0)) + (xhigh (or (and (numerictypep xtype) + (numerictypehigh xtype)) + (1 (expt 2 #.sb!vm:nwordbits)))) + (quotlow (truncate xlow y)) + (quothigh (truncate xhigh y))) + (if (= quotlow quothigh) + `(values ,quotlow + ( x ,(* quotlow y))) + `(let* ((quot ,(genunsigneddivbyconstantexpr y)) + (rem (ldb (byte #.sb!vm:nwordbits 0) + ( x (* quot ,y))))) + (values (trulythe (integer ,quotlow ,quothigh) quot) + (trulythe (integer 0 ,(1 y)) rem))))))) ;;;; arithmetic and logical identity operation elimination diff git a/tests/arith.pure.lisp b/tests/arith.pure.lisp index dde318b..d570df2 100644  a/tests/arith.pure.lisp +++ b/tests/arith.pure.lisp @@ 404,3 +404,67 @@ (if (eql fastresult slowresult) (print (list :ok `(,op ,@args) :=> fastresult)) (error "oops: ~S, ~S" args callargs))))))))))) + +;;; (TRUNCATE <unsignedword> <constant unsignedword>) is optimized +;;; to use multiplication instead of division. This propagates to FLOOR, +;;; MOD and REM. Test that the transform is indeed triggered and test +;;; several cases for correct results. +(withtest (:name (:integerdivisionusingmultiplication :used) + :skippedon '(not (or :x8664 :x86))) + (dolist (fun '(truncate floor ceiling mod rem)) + (let* ((foo (compile nil `(lambda (x) + (declare (optimize (speed 3) + (space 0) + (compilationspeed 0)) + (type (unsignedbyte + ,sbvm:nwordbits) x)) + (,fun x 9)))) + (disassembly (withoutputtostring (s) + (disassemble foo :stream s)))) + ;; KLUDGE copied from test :floatdivisionusingexactreciprocal + ;; in compiler.pure.lisp. + (assert (and (not (search "DIV" disassembly)) + (search "MUL" disassembly)))))) + +(withtest (:name (:integerdivisionusingmultiplication :correctness)) + (let ((*randomstate* (makerandomstate t))) + (dolist (dividendtype `((unsignedbyte ,sbvm:nwordbits) + (and fixnum unsignedbyte) + (integer 10000 10100))) + (dolist (divisor `(;; Some special cases from the paper + 7 10 14 641 274177 + ;; Range extremes + 3 + ,mostpositivefixnum + ,(1 (expt 2 sbvm:nwordbits)) + ;; Some random values + ,@(loop for i from 8 to sbvm:nwordbits + for r = (random (expt 2 i)) + ;; We don't want 0, 1 and powers of 2. + when (not (zerop (logand r (1 r)))) + collect r))) + (dolist (fun '(truncate ceiling floor mod rem)) + (let ((foo (compile nil `(lambda (x) + (declare (optimize (speed 3) + (space 0) + (compilationspeed 0)) + (type ,dividendtype x)) + (,fun x ,divisor))))) + (dolist (dividend `(0 1 ,mostpositivefixnum + ,(1 divisor) ,divisor + ,(1 (* divisor 2)) ,(* divisor 2) + ,@(loop repeat 4 + collect (+ 10000 (random 101))) + ,@(loop for i from 4 to sbvm:nwordbits + for r = (random (expt 2 i)) + collect r))) + (when (typep dividend dividendtype) + (multiplevaluebind (q1 r1) + (funcall foo dividend) + (multiplevaluebind (q2 r2) + (funcall fun dividend divisor) + (unless (and (= q1 q2) + (eql r1 r2)) + (error "bad results for ~s with dividend type ~s" + (list fun dividend divisor) + dividendtype))))))))))))  hooks/postreceive  SBCL 