From: Akshay S. <ak...@us...> - 2013-01-07 02:46:43
|
This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "matlisp". The branch, tensor has been updated via a5e3a50f5fbe85eb2b44d11ff2a6ba385011e979 (commit) from 32fd23120cff0e68ba1b02290f19e0dd48185944 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit a5e3a50f5fbe85eb2b44d11ff2a6ba385011e979 Author: Akshay Srinivasan <aks...@gm...> Date: Sun Jan 6 18:33:15 2013 -0800 o Changed loop order in the lisp GEMM function generator, to work better on row-ordered matrices. About twice as fast as the old one (for row ordered matrices). o Added new tests to loopy-tests.lisp ; (test-mm-lisp-lin <n>) is now as fast as the reference F77 BLAS implementation (!). Still too slow compared to ATLAS & GOTOBlas (not surprisingly). Might have to add different loops for different ordering, in the GEMM generator macro. diff --git a/configure.ac b/configure.ac index d2ec52f..6b0aeba 100644 --- a/configure.ac +++ b/configure.ac @@ -376,7 +376,7 @@ int main() EOF $CC $CFLAGS -c conftest.c $F77 $FFLAGS -o a.out conftest.o -L${BLAS_LAPACK_DIR} -lblas -llapack - if a.out; then + if ./a.out; then AC_MSG_RESULT([yes]) F2C=-ff2c else diff --git a/src/base/blas-helpers.lisp b/src/base/blas-helpers.lisp index 0092bf9..f9ad967 100644 --- a/src/base/blas-helpers.lisp +++ b/src/base/blas-helpers.lisp @@ -41,9 +41,10 @@ (defun blas-matrix-compatible-p (matrix op) (declare (type standard-matrix matrix)) - (let ((rs (aref (strides matrix) 0)) - (cs (aref (strides matrix) 1))) - (declare (type index-type rs cs)) + (let*-typed ((stds (strides matrix) :type index-store-vector) + (rs (aref stds 0) :type index-type) + (cs (aref stds 1) :type index-type)) + ;;Note that it is not required that (rs = nc * cs) or (cs = nr * rs) (cond ((= cs 1) (values :row-major rs (fortran-nop op))) ((= rs 1) (values :col-major cs (fortran-op op))) @@ -68,3 +69,9 @@ (defun combine-jobs (&rest jobs) (let ((job (intern (apply #'concatenate 'string (mapcar #'symbol-name jobs)) "KEYWORD"))) job)) + +(definline flip-major (job) + (declare (type symbol job)) + (case job + (:row-major :col-major) + (:col-major :row-major))) diff --git a/src/base/standard-tensor.lisp b/src/base/standard-tensor.lisp index 1c0397d..2f6a153 100644 --- a/src/base/standard-tensor.lisp +++ b/src/base/standard-tensor.lisp @@ -134,6 +134,8 @@ :f/ (a b) -> a * b^{-1} :finv* (a) -> 1/a :fid* () -> * identity + :f= (a b) -> (= a b) + :fconj (a) -> a^* {if nil, Field does not have a conjugation op} :coercer (ele) -> Coerced to store-type, with error checking :coercer-unforgiving (ele) -> Coerced to store-type, no error checking @@ -340,11 +342,11 @@ (defmacro define-tensor ((tensor-class element-type store-element-type store-type &rest class-decls) &key - f+ f- finv+ fid+ f* f/ finv* fid* fconj + f+ f- finv+ fid+ f* f/ finv* fid* fconj f= matrix vector store-allocator coercer coercer-unforgiving reader value-writer reader-writer swapper) ;;Error checking - (assert (and f+ f- finv+ fid+ f* f/ finv* fid* store-allocator coercer coercer-unforgiving matrix vector reader value-writer reader-writer swapper)) + (assert (and f+ f- finv+ fid+ f* f/ finv* fid* f= store-allocator coercer coercer-unforgiving matrix vector reader value-writer reader-writer swapper)) ;; `(progn ;;Class definitions @@ -379,6 +381,7 @@ :f/ ',f/ :finv* ',finv* :fid* ',fid* + :f= ',f= :fconj ',fconj :reader ',reader :value-writer ',value-writer diff --git a/src/classes/complex-tensor.lisp b/src/classes/complex-tensor.lisp index 73e624a..d2d8c04 100644 --- a/src/classes/complex-tensor.lisp +++ b/src/classes/complex-tensor.lisp @@ -48,6 +48,10 @@ (declare (type complex-type a)) (conjugate a)) +(definline complex-type.f= (a b) + (declare (type complex-type a b)) + (= a b)) + ;;Store operations (definline allocate-complex-store (size) " @@ -110,7 +114,8 @@ :f/ complex-type.f/ :finv* complex-type.finv* :fid* complex-type.fid* - :fconj complex-type.fconj + :f= complex-type.f= + :fconj complex-type.fconj ;; :store-allocator allocate-complex-store :coercer coerce-complex diff --git a/src/classes/real-tensor.lisp b/src/classes/real-tensor.lisp index 44239aa..aa9f293 100644 --- a/src/classes/real-tensor.lisp +++ b/src/classes/real-tensor.lisp @@ -39,6 +39,10 @@ (definline real-type.fid* () 1.0d0) +(definline real-type.fid= (a b) + (declare (type real-type a b)) + (= a b)) + ;;Store definitions (definline real-type.reader (tstore idx) (declare (type index-type idx) @@ -86,6 +90,7 @@ Allocates real storage. Default initial-element = 0d0.") :f/ real-type.f/ :finv* real-type.finv* :fid* real-type.fid* + :f= real-type.fid= :fconj nil ;; :store-allocator allocate-real-store diff --git a/src/classes/symbolic-tensor.lisp b/src/classes/symbolic-tensor.lisp index be7986c..e77f692 100644 --- a/src/classes/symbolic-tensor.lisp +++ b/src/classes/symbolic-tensor.lisp @@ -39,6 +39,10 @@ (definline symbolic-type.fid* () 1) +(definline symbolic-type.f= (a b) + (declare (type symbolic-type a b)) + (maxima::equal a b)) + (definline symbolic-type.fconj (a) (maxima::meval `((maxima::$conjugate maxima::simp) ,a))) @@ -97,6 +101,7 @@ Allocates symbolic storage. Default initial-element = 0.") :f/ symbolic-type.f/ :finv* symbolic-type.finv* :fid* symbolic-type.fid* + :f= symbolic-type.f= :fconj symbolic-type.fconj ;; :store-allocator allocate-symbolic-store diff --git a/src/level-1/scal.lisp b/src/level-1/scal.lisp index 517167c..6817cf8 100644 --- a/src/level-1/scal.lisp +++ b/src/level-1/scal.lisp @@ -195,16 +195,16 @@ #+maxima (progn (generate-typed-num-scal! symbolic-typed-num-scal! - (real-tensor nil 0)) + (symbolic-tensor nil 0)) (generate-typed-scal! symbolic-typed-scal! - (real-tensor nil 0)) + (symbolic-tensor nil 0)) (generate-typed-div! symbolic-typed-div! - (real-tensor nil 0)) + (symbolic-tensor nil 0)) (generate-typed-num-div! symbolic-typed-num-div! - (real-tensor nil 0))) + (symbolic-tensor nil 0))) ;;---------------------------------------------------------------;; (defgeneric scal! (alpha x) diff --git a/src/level-3/gemm.lisp b/src/level-3/gemm.lisp index 22b445b..172be3d 100644 --- a/src/level-3/gemm.lisp +++ b/src/level-3/gemm.lisp @@ -3,14 +3,14 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; Copyright (c) 2000 The Regents of the University of California. -;;; All rights reserved. -;;; +;;; All rights reserved. +;;; ;;; Permission is hereby granted, without written agreement and without ;;; license or royalty fees, to use, copy, modify, and distribute this ;;; software and its documentation for any purpose, provided that the ;;; above copyright notice and the following two paragraphs appear in all ;;; copies of this software. -;;; +;;; ;;; IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY ;;; FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ;;; ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF @@ -39,6 +39,7 @@ (declare (type ,(getf opt :element-type) alpha beta) (type ,matrix-class A B C) (type symbol job)) + ;;The big done-in-lisp-gemm, loop-ordering was inspired by the BLAS dgemm reference implementation. ,(let ((lisp-routine `(let-typed ((nr-C (nrows C) :type index-type) @@ -59,39 +60,51 @@ (cstp-C (col-stride C) :type index-type) (hd-C (head C) :type index-type) (sto-C (store C) :type ,(linear-array-type (getf opt :store-type)))) - (when (eq job-A :t) - (rotatef rstp-A cstp-A)) - (when (eq job-B :t) - (rotatef rstp-B cstp-B)) - (very-quickly - (loop :repeat nr-C - :for rof-A :of-type index-type := hd-A :then (+ rof-A rstp-A) - :for rof-C :of-type index-type := hd-C :then (+ rof-C rstp-C) - :do (loop :repeat nc-C - :for cof-B :of-type index-type := hd-B :then (+ cof-B cstp-B) - :for of-C :of-type index-type := rof-C :then (+ of-C cstp-C) - :do (let-typed ((val (,(getf opt :f*) beta (,(getf opt :reader) sto-C of-C)) :type ,(getf opt :element-type))) - (loop :repeat dotl - :for of-A :of-type index-type := rof-A :then (+ of-A cstp-A) - :for of-B :of-type index-type := cof-B :then (+ of-B rstp-B) - :with sum :of-type ,(getf opt :element-type) := (,(getf opt :fid+)) - :do (let-typed ((A-val (,(getf opt :reader) sto-A of-A) :type ,(getf opt :element-type)) - (B-val (,(getf opt :reader) sto-B of-B) :type ,(getf opt :element-type))) - (setf sum (,(getf opt :f+) sum (,(getf opt :f*) A-val B-val)))) - :finally (,(getf opt :value-writer) (,(getf opt :f+) (,(getf opt :f*) alpha sum) val) sto-C of-C))))))))) - `(mlet* ,(recursive-append - '(((job-A job-B) (ecase job - (:nn (values :n :n)) - (:nt (values :n :t)) - (:tn (values :t :n)) - (:tt (values :t :t))) - :type (symbol symbol))) - (when blas? - `((call-fortran? (> (max (nrows C) (ncols C) (if (eq job-A :n) (ncols A) (nrows A))) - ,fortran-lb-parameter)) - ((maj-A ld-A fop-A) (if call-fortran? (blas-matrix-compatible-p A job-A) (values nil 0 "?")) :type (symbol index-type (string 1))) - ((maj-B ld-B fop-B) (if call-fortran? (blas-matrix-compatible-p B job-B) (values nil 0 "?")) :type (symbol index-type (string 1))) - ((maj-C ld-C fop-C) (if call-fortran? (blas-matrix-compatible-p C :n) (values nil 0 "?")) :type (symbol index-type nil))))) + ;;Replace with separate loops to maximize Row-ordered MM performance + (when (eq job-A :t) + (rotatef rstp-A cstp-A)) + (when (eq job-B :t) + (rotatef rstp-B cstp-B)) + ;; + (let-typed ((of-A hd-A :type index-type) + (of-B hd-B :type index-type) + (of-C hd-C :type index-type) + (r.cstp-C (* cstp-C nc-C) :type index-type) + (d.rstp-B (- rstp-B (* cstp-B nc-C)) :type index-type) + (d.rstp-A (- rstp-A (* cstp-A dotl)) :type index-type)) + (very-quickly + (loop :repeat nr-C + :do (progn + (loop :repeat dotl + :do (let-typed ((ele-A (,(getf opt :f*) alpha (,(getf opt :reader) sto-A of-A)) :type ,(getf opt :element-type))) + (loop :repeat nc-C + :do (progn + (,(getf opt :value-writer) + (,(getf opt :f+) + (,(getf opt :reader) sto-C of-C) + (,(getf opt :f*) ele-A (,(getf opt :reader) sto-B of-B))) + sto-C of-C) + (incf of-C cstp-C) + (incf of-B cstp-B))) + (decf of-C r.cstp-C) + (incf of-A cstp-A) + (incf of-B d.rstp-B))) + (incf of-C rstp-C) + (incf of-A d.rstp-A) + (setf of-B hd-B)))))))) + ;;Tie together Fortran and lisp-routines. + `(mlet* (((job-A job-B) (ecase job + (:nn (values :n :n)) + (:nt (values :n :t)) + (:tn (values :t :n)) + (:tt (values :t :t))) + :type (symbol symbol)) + ,@(when blas? + `((call-fortran? (> (max (nrows C) (ncols C) (if (eq job-A :n) (ncols A) (nrows A))) + ,fortran-lb-parameter)) + ((maj-A ld-A fop-A) (blas-matrix-compatible-p A job-A) :type (symbol index-type (string 1))) + ((maj-B ld-B fop-B) (blas-matrix-compatible-p B job-B) :type (symbol index-type (string 1))) + ((maj-C ld-C fop-C) (blas-matrix-compatible-p C :n) :type (symbol index-type nil))))) ,(if blas? `(cond ((and call-fortran? maj-A maj-B maj-C) @@ -164,7 +177,6 @@ ,lisp-routine)) lisp-routine))) C))) - ;;Real (generate-typed-gemm! real-base-typed-gemm! (real-tensor dgemm dgemv *real-l3-fcall-lb*)) @@ -191,11 +203,15 @@ (let ((A (ecase job-A ((:n :t) A) ((:h :c) + ;;BUG! + ;;Multiplication does not yield the complex conjugate! (let ((ret (apply #'make-complex-tensor (lvec->list (dimensions A))))) (complex-typed-axpy! #c(-1d0 0d0) A ret))))) (B (ecase job-B ((:n :t) B) ((:h :c) + ;;BUG! + ;;Multiplication does not yield the complex conjugate! (let ((ret (apply #'make-complex-tensor (lvec->list (dimensions B))))) (complex-typed-axpy! #c(-1d0 0d0) B ret))))) (tjob (combine-jobs (ecase job-A ((:n :t) job-A) (:h :t) (:c :n)) @@ -206,7 +222,7 @@ ;;Symbolic #+maxima (generate-typed-gemm! symbolic-base-typed-gemm! - (symbolic-tensor nil nil 0)) + (symbolic-tensor nil nil 0)) ;;---------------------------------------------------------------;; @@ -220,10 +236,10 @@ Purpose ======= Performs the GEneral Matrix Multiplication given by - -- - - + -- - - + + C <- alpha * op(A) * op(B) + beta * C - C <- alpha * op(A) * op(B) + beta * C - and returns C. alpha,beta are scalars and A,B,C are matrices. @@ -288,7 +304,7 @@ (real-typed-gemm! (coerce-real 1) A B (coerce-real 0) A.x job) ;;Re (axpy! (realpart alpha) A.x vw.c) - ;;Im + ;;Im (incf (head vw.c)) (axpy! (imagpart alpha) A.x vw.c)) (let ((vw.c (tensor-realpart~ c))) @@ -323,10 +339,10 @@ Purpose ======= Performs the GEneral Matrix Multiplication given by - -- - - + -- - - + + alpha * op(A) * op(B) + beta * C - alpha * op(A) * op(B) + beta * C - and returns the result in a new matrix. alpha,beta are scalars and A,B,C are matrices. diff --git a/tests/loopy-tests.lisp b/tests/loopy-tests.lisp index 67db2be..9dee685 100644 --- a/tests/loopy-tests.lisp +++ b/tests/loopy-tests.lisp @@ -102,6 +102,82 @@ (values t-a t-b t-c)))) +(defun test-mm-lisp (n) + (declare (type fixnum n)) + (let ((A (make-real-tensor n n)) + (B (make-real-tensor n n)) + (C (make-real-tensor n n))) + (let-typed ((nr-C (nrows C) :type index-type) + (nc-C (ncols C) :type index-type) + (dotl (ncols A) :type index-type) + ; + (rstp-A (row-stride A) :type index-type) + (cstp-A (col-stride A) :type index-type) + (hd-A (head A) :type index-type) + (sto-A (store A) :type real-store-vector) + ; + (rstp-B (row-stride B) :type index-type) + (cstp-B (col-stride B) :type index-type) + (hd-B (head B) :type index-type) + (sto-B (store B) :type real-store-vector) + ; + (rstp-C (row-stride C) :type index-type) + (cstp-C (col-stride C) :type index-type) + (hd-C (head C) :type index-type) + (sto-C (store C) :type real-store-vector)) + (time + (let-typed ((of-A hd-A :type index-type) + (of-B hd-B :type index-type) + (of-C hd-C :type index-type) + (r.cstp-C (* cstp-C nc-C) :type index-type) + (d.rstp-B (- rstp-B (* cstp-B nc-C)) :type index-type) + (d.rstp-A (- rstp-A (* cstp-A dotl)) :type index-type)) + (very-quickly + (loop :repeat nr-C + :do (progn + (loop :repeat dotl + :do (let-typed ((ele-A (aref sto-A of-A) :type real-type)) + (loop :repeat nc-C + :do (progn + (incf (aref sto-C of-C) (* ele-A (aref sto-B of-B))) + (incf of-C cstp-C) + (incf of-B cstp-B))) + (decf of-C r.cstp-C) + (incf of-A cstp-A) + (incf of-B d.rstp-B))) + (incf of-C rstp-C) + (incf of-A d.rstp-A) + (setf of-B hd-B)))))) + t))) + +(defun test-mm-lisp-lin (n) + (declare (type fixnum n)) + (let ((A (make-real-tensor n n)) + (B (make-real-tensor n n)) + (C (make-real-tensor n n))) + (let*-typed ((sto-A (store A) :type real-store-vector) + (sto-B (store B) :type real-store-vector) + (sto-C (store C) :type real-store-vector)) + (time + (let-typed ((of-A 0 :type index-type) + (of-B 0 :type index-type) + (of-C 0 :type index-type)) + (very-quickly + (loop :repeat n + :do (progn + (loop :repeat n + :do (let-typed ((ele-A (aref sto-A of-A) :type real-type)) + (loop :repeat n + :do (progn + (incf (aref sto-C of-C) (* ele-A (aref sto-B of-B))) + (incf of-C) + (incf of-B))) + (decf of-C n) + (incf of-A))) + (incf of-C n) + (setf of-B 0)))))) + t))) + (defun test-mm-ddot (n) (let ((t-a (make-real-tensor n n)) (t-b (make-real-tensor n n)) ----------------------------------------------------------------------- Summary of changes: configure.ac | 2 +- src/base/blas-helpers.lisp | 13 ++++- src/base/standard-tensor.lisp | 7 ++- src/classes/complex-tensor.lisp | 7 ++- src/classes/real-tensor.lisp | 5 ++ src/classes/symbolic-tensor.lisp | 5 ++ src/level-1/scal.lisp | 8 ++-- src/level-3/gemm.lisp | 106 ++++++++++++++++++++++---------------- tests/loopy-tests.lisp | 76 +++++++++++++++++++++++++++ 9 files changed, 173 insertions(+), 56 deletions(-) hooks/post-receive -- matlisp |