[35b509]: src / cllib / csv.lisp Maximize Restore History

Download this file

csv.lisp    290 lines (268 with data), 12.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
;;; read/write comma-separated values
;;;
;;; Copyright (C) 2003-2010 by Sam Steingold
;;; This is Free Software, covered by the GNU GPL (v2+)
;;; See http://www.gnu.org/copyleft/gpl.html
;;;
;;; $Id$
;;; $Source$
(eval-when (:compile-toplevel :load-toplevel :execute)
(require :cllib-base (translate-logical-pathname "clocc:src;cllib;base"))
;; `with-collect'
(require :cllib-simple (translate-logical-pathname "cllib:simple"))
;; `with-timing', `log'
(require :cllib-log (translate-logical-pathname "cllib:log")))
(in-package :cllib)
(export '(csv-print-vector csv-parse-string csv-read-file with-csv csv-names
class-csv-header class-csv-print *csv-first-line-names* *csv-junk*
defcsv csv-read csv-write
*csv-separator* *csv-whitespace* *csv-progress* *csv-progress-1*))
(defcustom *csv-separator* character #\,
"The separator in the CSV file, normally the comma.")
(defcustom *csv-first-line-names* (or t nil :default) :default
"How to treat the first line in WITH-CSV et el.
If this is T (or :DEFAULT and the first line starts with a +COMMENTS+
character), treat the first line as the vector of column names.
Otherwise, the first line is nothing special.")
(defun csv-print-vector (vec &optional (out *standard-output*))
"Print a vector as a comma-separated line."
(declare (type vector vec) (stream out))
(loop :with len = (length vec) :for val :across vec :and ii :from 1
:when val :do (write val :stream out :escape nil)
:unless (= ii len) :do (write-char *csv-separator* out))
(terpri out))
(defcustom *csv-whitespace* (or null string) +whitespace+
"The string of characters to trim from the values.")
(defcustom *csv-progress* integer 1000
"*How often the progress report should be made")
(defcustom *csv-progress-1* integer 10
"*How often the secondary progress report should be made")
(defcustom *csv-junk* (or symbol integer) :ERROR
"How to treat lines of wrong length.
When the :JUNK argument is :ERROR, signal an error.
When it is :WARNING, issue a warning and drop the line.
When it is a number, issue at most this many warnings.
When it is :KEEP, keep the line as is.")
(defun csv-trim (whitespace string)
"Trim the string argument from the whitespace."
(let* ((clean (string-trim whitespace string)) (len1 (1- (length clean))))
(when (and (plusp len1) (char= #\" (char clean 0) (char clean len1)))
(setq clean (subseq clean 1 len1)))
(if (zerop (length clean)) nil clean)))
(defun csv-parse-string (string &key
((:separator *csv-separator*) *csv-separator*)
((:whitespace *csv-whitespace*) *csv-whitespace*))
"Parse a string, returning a vector of strings."
(loop :with num = (count *csv-separator* string :test #'char=)
:with res = (make-array (1+ num))
:for ii :from 0 :to num
:for beg = 0 :then (1+ end)
:for end = (or (position *csv-separator* string :test #'char= :start beg)
(length string))
:do (setf (aref res ii)
(when (> end beg) ; otherwise NIL = missing
(csv-trim *csv-whitespace* (subseq string beg end))))
:finally (return res)))
(defconst +comments+ string "#;" "Characters that start comments.")
(defun uncomment-line (line)
"Remove the comment prefix from the string."
(if (find (char line 0) +comments+)
(string-left-trim +whitespace+ (string-left-trim +comments+ line))
line))
;;;###autoload
(defun csv-names (file)
"Read and parse as names the first line in the file."
(csv-parse-string (uncomment-line (with-open-file (s file) (read-line s)))))
(defun csv-check-vec-len (vec cols fn pos)
(unless (= cols (length vec))
(error "~S:~:D: Wrong column count: ~:D instead of ~:D: ~S"
fn pos (length vec) cols vec)))
(defmacro with-csv ((vec file &key (progress '*csv-progress*)
(first-line-names '*csv-first-line-names*)
(junk '*csv-junk*)
(progress-1 '*csv-progress-1*) limit
(out '*standard-output*) columns)
&body body)
"Open FILE and set VEC to successive vectors in it.
Return 3 values:
number of records (lines) read,
number of bytes in the file,
fraction of bytes read
vector of column names if FIRST-LINE-NAMES is non-NIL
or if it is :DEFAULT and the first line starts with a +COMMENTS+ character."
(with-gensyms ("WITH-CSV-" in fn fsize ln len cols lim l1 fln drop ja)
`(with-timing (:out ,out :count ,len :units "records" :progress ,progress
:progress-1 ,progress-1)
(let* ((,fn ,file) ,fsize ,l1
(,fln ,first-line-names) (,cols ,columns)
(,ja ,junk) (,drop 0)
,@(when limit `((,lim ,limit))))
(with-open-file (,in ,fn :direction :input)
(format ,out "~&Reading `~a' [~:d bytes]..."
,fn (setq ,fsize (file-length ,in)))
(force-output ,out)
(when (eq ,fln :default)
(setq ,fln (find (peek-char nil ,in) +comments+)))
(when ,fln
(let ((line1 (read-line ,in)))
(cond ((zerop (length line1))
(cerror "ignore, return NIL for names"
"empty first line, names expected"))
(t (setq ,l1 (csv-parse-string (uncomment-line line1)))
(if ,cols (csv-check-vec-len ,l1 ,cols ,fn 0)
(setq ,cols (length ,l1)))))))
(loop :with ,vec :for ,ln = (read-line ,in nil nil) :while ,ln
,@(when limit
`(:when (and ,lim (= ,len ,lim))
:do (warn "reached the limit of ~:D record~:P ~
at ~:D byte~:P (~4F%), aborted~%"
,len (file-position ,in)
(/ (file-position ,in) ,fsize 1d-2))
(loop-finish)
:end))
:do (setq ,ln (string-trim *csv-whitespace* ,ln))
:if (or (zerop (length ,ln)) ; empty line
(find (char ,ln 0) +comments+) ; comment line
(progn (setq ,vec (csv-parse-string ,ln)) (incf ,len)
(if ,cols
(case ,ja
(:keep nil)
(:error
(csv-check-vec-len ,vec ,cols ,fn ,len))
(t
(handler-case (csv-check-vec-len
,vec ,cols ,fn ,len)
(error (c)
(unless (eql ,ja 0)
(warn (princ-to-string c)))
(when (and (integerp ,ja) (plusp ,ja))
(decf ,ja)
(when (zerop ,ja)
(warn "Further warnings omitted")))
t))))
(and (setq ,cols (length ,vec)) nil))))
:do (incf ,drop)
:else :do ,@body
(progress (/ (file-position ,in) ,fsize)
;; print <*...*> when we expect to reach limit
(if ,(when limit `(and ,lim (> ,len (* ,lim pos))))
"*" ""))
:end
:finally (format ,out "done [~:d record~:p~@[, ~:d column~:p~]~
~[~:;,~:* ~:d line~:p dropped~]]"
,len ,cols ,drop)
:finally (return
(values ,len (file-length ,in)
(if (zerop ,fsize) 1
(/ (file-position ,in) ,fsize))
,l1))))))))
;;;###autoload
(defun csv-read-file (inf &key ((:junk *csv-junk*) *csv-junk*)
((:first-line-names *csv-first-line-names*)
*csv-first-line-names*)
((:separator *csv-separator*) *csv-separator*))
"Read comma-separated values into a list of vectors."
(let (len file-size complete names)
(declare (ignorable complete))
(values (with-collect (coll)
(setf (values len file-size complete names)
(with-csv (vec inf) (coll vec))))
len file-size names)))
;;; defstruct i/o
(defun class-csv-header (class &key (out *standard-output*))
"Print the CSV header for the class to the stream."
(format out "#~{~A~^,~}~%" (port:class-slot-list class)))
(defun class-csv-print (obj &key (out *standard-output*))
(format out "~{~A~^,~}~%" (mapcar (lambda (slot) (slot-value obj slot))
(port:class-slot-list obj))))
;;; generic CSV i/o
(defstruct csv-i/o
(name (port:required-argument) :type symbol)
(header (port:required-argument) :type vector)
(reader (port:required-argument) :type function)
(writer (port:required-argument) :type function)
(package *package* :type 'package))
(defvar *csv-i/o* (make-hash-table :test 'eq) "type -> csv-i/o")
(defun csv-i/o (type)
(or (gethash type *csv-i/o*)
(error "unknown CSV i/o ~S" type)))
(defun csv-read (type file)
(let ((csv-i/o (csv-i/o type)))
(multiple-value-bind (data len file-size names)
(csv-read-file file :first-line-names t :junk :keep)
(declare (ignore len file-size))
(unless (equalp names (csv-i/o-header csv-i/o))
(cerror "ignore and procede" "~S(~S): bad header ~S (expected ~S)"
'csv-read type names (csv-i/o-header csv-i/o)))
(mapcar (csv-i/o-reader csv-i/o) data))))
(defun csv-write (type file data)
(let* ((csv-i/o (csv-i/o type)) (writer (csv-i/o-writer csv-i/o)))
(with-open-file (out file :direction :output)
(write-char (char +comments+ 0) out)
(loop :for name :across (csv-i/o-header csv-i/o) :and i :upfrom 0
:do (unless (zerop i) (write-char *csv-separator* out))
(write-string name out))
(terpri out)
(dolist (item data)
(funcall writer out item)
(terpri out)))))
;;; struct definition for CSV i/o
(defun type-parser (slot-type)
(ecase slot-type
(symbol (lambda (s) (intern (nstring-upcase (nsubstitute #\- #\Space s)))))
(string #'identity)
(integer #'parse-integer)
(float #'read-from-string)
(float% (lambda (s) (/ (read-from-string (nsubstitute #\Space #\% s))
100)))))
(defun type-default (slot-type)
(ecase slot-type
(symbol nil)
(string "")
(integer 0)
((float float%) 0s0)))
(defun type-type (slot-type)
(case slot-type
(float% 'float)
(t slot-type)))
;; MAKE-READER & MAKE-WRITER are separate macros to avoid calling MOP functions
;; at read time when DEFSTRUCT has not been called yet.
(defmacro make-reader (vec slots type package)
`(let ((len (length ,vec)) (*package* ,(find-package package)))
(,(port:structure-keyword-constructor `,type)
,@(loop :for pos :upfrom 0
:for dslot :in (port:class-direct-slots (find-class `,type))
:for slotd :in slots
:for slot-type = (third slotd)
:for parser = (or (fourth slotd) (type-parser slot-type))
:nconc `(,(car (port:slot-definition-initargs dslot))
(let ((s (and (< ,pos len) (aref vec ,pos))))
(if s (funcall ,parser s)
,(type-default slot-type))))))))
(defmacro make-writer (out obj type)
`(progn
,@(loop :for pos :upfrom 0
:for slot :in (port:class-direct-slots (find-class type))
:nconc `(,@(when (plusp pos) `((write-char *csv-separator* ,out)))
(write (,(car (port:slot-definition-readers slot)) ,obj)
:stream ,out :escape nil)))))
(defmacro defcsv (type (&key (package (symbol-package type))) slots)
`(progn
(eval-when (:compile-toplevel :load-toplevel :execute)
(defstruct ,type
,@(mapcar (lambda (slot)
(destructuring-bind (string symbol slot-type &rest opts)
slot
(declare (ignore string opts))
`(,symbol (port:required-argument)
:type ,(type-type slot-type))))
slots)))
(setf (gethash ',type *csv-i/o*)
(make-csv-i/o
:name ',type
:header ,(coerce (mapcar #'car slots) 'vector)
:reader (lambda (vec) (make-reader vec ,slots ,type ,package))
:writer (lambda (out obj) (make-writer out obj ,type))
:package ,package))))
(provide :cllib-csv)
;;; file csv.lisp ends here