[1ba625]: inst / @dataframe / dataframe.m Maximize Restore History

Download this file

dataframe.m    527 lines (488 with data), 19.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
function df = dataframe(x, varargin)
%# -*- texinfo -*-
%# @deftypefn {Function File} @var{df} = dataframe(@var{x = []}, ...)
%# This is the default constructor for a dataframe object, which is
%# similar to R 'data.frame'. It's a way to group tabular data, then
%# access them either as a matrix or by column name.
%# Input argument x may be: @itemize
%# @item a dataframe => use @var{varargin} to pad it with supplemental
%# columns
%# @item a matrix => create column names from input name; each column
%# is used as an entry
%# @item a cell matrix => try to infer column names from the first row,
%# and row indexes and names from the two first columns;
%# @item a file name => import data into a dataframe;
%# @item a matrix of char => initialise colnames from them.
%# @item a two-element cell: use the first as the column to
%# append to, and the second as initialiser for the column(s)
%# @item a struct having exactly the field names of a dataframe => it is
%# turned into a dataframe object
%# @end itemize
%# If called without any argument, it returns an empty dataframe which
%# can be further populated by assignment, cat, ... If called with an
%# empty value as sole argument, it currently also returns an empty
%# dataframe; building a dataframe from the whole workspace is not
%# implemented yet.
%# @*Variable input arguments are first parsed as pairs (options, values).
%# Recognised options are: @itemize
%# @item rownames : take the values as initialiser for row names
%# @item colnames : take the values as initialiser for column names
%# @item seeked : a (kept) field value which triggers start of processing.
%# @item trigger : a (unkept) field value which triggers start of processing.
%# For both seeked and trigger, the non-empty lines preceding the match
%# are stored as header lines. Default: none. The two options are
%# mutually exclusive.
%# @item datefmt: date format, see datestr help
%# @item unquot: a logical switch telling whether or not strings should
%# be unquoted before storage, default = true;
%# @item sep: the elements separator, default '\t,'
%# @item conv: a cell whose content is passed as the remaining arguments
%# of regexprep, applied to the fields of each line before numeric
%# conversion
%# @item -- : stop parsing options; the remaining arguments are taken
%# as data
%# @end itemize
%# The remaining data are concatenated (right-appended) to the existing ones.
%# @end deftypefn
%% Copyright (C) 2009-2012 Pascal Dupuis <Pascal.Dupuis@uclouvain.be>
%%
%% This file is part of Octave.
%%
%% Octave is free software; you can redistribute it and/or
%% modify it under the terms of the GNU General Public
%% License as published by the Free Software Foundation;
%% either version 2, or (at your option) any later version.
%%
%% Octave is distributed in the hope that it will be useful,
%% but WITHOUT ANY WARRANTY; without even the implied
%% warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
%% PURPOSE. See the GNU General Public License for more
%% details.
%%
%% You should have received a copy of the GNU General Public
%% License along with Octave; see the file COPYING. If not,
%% write to the Free Software Foundation, 51 Franklin Street -
%% Fifth Floor, Boston, MA 02110-1301, USA.
%#
%# $Id$
%#
if (0 == nargin)
%# default constructor: create a scalar struct and initialise the
%# fields in the right order
%# _cnt is used below as [number of rows, number of columns]
df = struct ('_cnt', [0 0]);
%# do not call "struct" with the two next args: it would create an array
df._name = {cell(0, 1), cell(1, 0)}; %# rows - cols
%# _over: one entry per dimension (rows, cols); the flags are set to
%# false when names are given explicitly and to true for columns whose
%# name may be overwritten (see below)
df._over = cell (1, 2);
%# _ridx: row index, filled with (1:nrows)' when row names are given
df._ridx = [];
df._data = cell (0, 0);
df._rep = cell (0, 0); %# a repetition index
df._type = cell (0, 0); %# the type of each column
%# _src: names of the files the data were read from
df._src = cell (0, 0);
%# _header: fields of the lines skipped before the 'seeked'/'trigger' match
df._header = cell (0, 0);
df._cmt = cell (0, 0); %# to put comments
df = class (df, 'dataframe');
return
endif
%# a single empty argument: not implemented, fall back to an empty dataframe
if (isnull (x) && 1 == nargin)
disp ('FIXME -- should create a dataframe from the whole workspace')
df = dataframe (); %# just call the default constructor
return
endif
if (isa (x, 'dataframe'))
%# Try to append whatever data or metadata given through varargin
%# into this dataframe
df = x;
else
df = dataframe (); %# get the right fields
if (isa (x, 'struct'))
%# only accept a struct if it has the same fieldnames as a dataframe
dummy = fieldnames (x);
indi = fieldnames (df);
if (size (dummy, 1) ~= size (indi, 1))
error ('First argument of dataframe is a struct with the wrong number of fields');
endif
%# names are compared position by position, so the field order matters too
if (~all (cellfun (@strcmp, dummy, indi)))
error ('First argument of dataframe is a struct with wrong field names');
endif
%# easy way to transform a struct into a dataframe object
df = class (x, 'dataframe');
if (1 == nargin) return; endif
endif
endif
%# default values
seeked = []; trigger = []; unquot = true; sep = "\t,"; cmt_lines = [];
conv_regexp = {}; datefmt = '';
if (length (varargin) > 0) %# extract known arguments
indi = 1;
%# loop over possible arguments
%# Each recognised (option, value) pair is deleted from varargin, so
%# indi is NOT advanced in that case: the next candidate slides into
%# position indi. Only unknown strings and non-char entries advance indi.
%# NOTE(review): varargin{indi+1} is read without checking that a value
%# follows the option; an option given as last argument would raise an
%# index error -- confirm whether this is acceptable.
while (indi <= size (varargin, 2))
if (isa (varargin{indi}, 'char'))
switch(varargin{indi})
case 'rownames'
%# normalise the value to a cellstr; other classes go through num2str
switch class (varargin{indi+1})
case {'cell'}
df._name{1} = varargin{indi+1};
case {'char'}
df._name{1} = cellstr (varargin{indi+1});
otherwise
df._name{1} = cellstr (num2str (varargin{indi+1}));
endswitch
df._name{1} = genvarname (df._name{1});
%# explicitly given names are flagged as not overwritable
df._over{1}(1, 1:length (df._name{1})) = false;
%# the row count and the row index are derived from the number of names
df._cnt(1) = size (df._name{1}, 1);
df._ridx = (1:df._cnt(1))';
varargin(indi:indi+1) = [];
case 'colnames'
switch class(varargin{indi+1})
case {'cell'}
df._name{2} = varargin{indi+1};
case {'char'}
df._name{2} = cellstr (varargin{indi+1});
otherwise
df._name{2} = cellstr (num2str (varargin{indi+1}));
endswitch
%# detect assignment - functions calls - ranges
%# (a name splitting into more than one piece on ':', '=' or '('
%# only produces a warning; it is still used as is)
dummy = cellfun ('size', cellfun (@(x) strsplit (x, ":=("), df._name{2}, \
"UniformOutput", false), 2);
if (any (dummy > 1))
warning ('dataframe colnames taken literally and not interpreted');
endif
df._name{2} = genvarname (df._name{2});
df._over{2}(1, 1:length (df._name{2})) = false;
varargin(indi:indi+1) = [];
case 'seeked',
seeked = varargin{indi + 1};
varargin(indi:indi+1) = [];
case 'trigger',
trigger = varargin{indi + 1};
varargin(indi:indi+1) = [];
case 'unquot',
unquot = varargin{indi + 1};
varargin(indi:indi+1) = [];
case 'sep',
sep = varargin{indi + 1};
varargin(indi:indi+1) = [];
case 'conv'
conv_regexp = varargin{indi + 1};
varargin(indi:indi+1) = [];
case 'datefmt'
datefmt = varargin{indi + 1};
varargin(indi:indi+1) = [];
case '--'
%# stop processing args -- take the rest as filenames
varargin(indi) = [];
break;
otherwise %# FIXME: just skip it for now
%# the unknown string stays in varargin and is later handled as data
disp (sprintf ("Ignoring unkown argument %s", varargin{indi}));
indi = indi + 1;
endswitch
else
indi = indi + 1; %# skip it
endif
endwhile
endif
%# datefields = number of consecutive fields making up one date; it is
%# larger than 1 only when the date format contains spaces AND space is
%# one of the separators
if (~isempty (datefmt))
%# replace consecutive spaces by one
datefmt = regexprep (datefmt, '[ ]+', ' ');
%# is "space" used as separator ? Then we may take more than one field.
if (~isempty (regexp (sep, ' ')))
datefields = 1 + length (regexp (datefmt, ' '));
else
datefields = 1;
endif
else
datefields = 1;
endif
if (~isempty (seeked) && ~isempty (trigger))
error ('seeked and trigger are mutuallly incompatible arguments');
endif
%# Main loop. indi starts at 0 and is incremented first: pass number indi
%# processes the current x, then fetches varargin{indi} as the next x.
%# The last fetch is out of range; its error is swallowed by the
%# try/catch at the bottom and the loop condition then ends the loop.
%# NOTE(review): inputname (indi) below refers to the position in the
%# original call, whereas option pairs have been removed from varargin;
%# the two may not line up when options precede data -- confirm.
indi = 0;
while (indi <= size (varargin, 2))
indi = indi + 1;
if (~isa (x, 'dataframe'))
%# a single-row char is taken as a file name
if (isa (x, 'char') && size (x, 1) < 2)
dummy = tilde_expand (x);
try
%# read the data frame from a file
x = load (dummy);
df._src{end+1, 1} = dummy;
catch
%# try our own method
UTF8_BOM = char ([0xEF 0xBB 0xBF]);
unwind_protect
in = [];
fid = fopen (dummy);
if (fid ~= -1)
df._src{end+1, 1} = dummy;
%# from here on, dummy holds the first line of the file
dummy = fgetl (fid);
if (-1 == dummy)
x = []; %# file is valid but empty
else
%# NOTE(review): strcmp compares the whole first line with the BOM,
%# so the stream is left past the first line only when that line
%# consists of the BOM alone; otherwise everything (a leading BOM
%# included) is re-read from the start -- confirm this is intended
if (~strcmp (dummy, UTF8_BOM))
frewind (fid);
endif
%# slurp everything and convert doubles to char, avoiding
%# problems with char > 127
in = char (fread (fid).');
endif
endif
unwind_protect_cleanup
%# close the file whatever happened above
if (fid ~= -1) fclose (fid); endif
end_unwind_protect
if (~isempty (in))
%# explicit list taken from 'man pcrepattern' -- we enclose all
%# vertical separators in case the underlying regexp engine
%# doesn't have them all.
eol = '(\r\n|\n|\v|\f|\r|\x85)';
%# cut into lines -- include the EOL to have a one-to-one
%# matching between line numbers. Use a non-greedy match.
lines = regexp (in, ['.*?' eol], 'match');
%# dummy(k) = position of the EOL inside line k (1 => empty line)
dummy = cellfun (@(x) regexp (x, eol), lines);
%# remove the EOL character(s)
lines(1 == dummy) = {""};
%# use a positive lookahead -- eol is not part of the match
lines(dummy > 1) = cellfun (@(x) regexp (x, ['.*?(?=' eol ')'], \
'match'), lines(dummy > 1));
%# a field either starts at a word boundary, either by + - . for
%# a numeric data, either by ' for a string.
%# content = cellfun(@(x) regexp(x, '(\b|[-+\.''])[^,]*(''|\b)', 'match'),\
%# lines, 'UniformOutput', false); %# extract fields
%# when space is a separator, strsplit gets a third argument 'true'
%# -- presumably to drop the empty fields produced by runs of
%# separators; verify against the strsplit version in use
if (strfind (sep, ' '))
content = cellfun (@(x) strsplit (x, sep, true), lines, \
'UniformOutput', false); %# extract fields
else
content = cellfun (@(x) strsplit (x, sep), lines, \
'UniformOutput', false); %# extract fields
endif
%# indl: index of the current input line; indj: row index in the output x
indl = 1; indj = 1; %# disp('line 151 '); keyboard
if (~isempty (seeked))
%# 'seeked': store non-empty lines in df._header until one field
%# matches; the matching line is NOT consumed, so it is kept as data
while (indl <= length (lines))
dummy = content{indl};
%# skip lines whose fields are all empty
if (all (cellfun ('size', dummy, 2) == 0))
indl = indl + 1;
continue;
endif
if (all (cellfun (@isempty, regexp (dummy, seeked, 'match'))))
if (isempty (df._header))
df._header = dummy;
else
df._header(end+1, 1:length (dummy)) = dummy;
endif
indl = indl + 1;
continue;
endif
break;
endwhile
elseif (~isempty (trigger))
%# 'trigger': same scan, but indl is advanced and the line is stored
%# in df._header before the test, so the matching line itself ends
%# up in the header and is not kept as data
while (indl <= length (lines))
dummy = content{indl};
indl = indl + 1;
if (all (cellfun ('size', dummy, 2) == 0))
continue;
endif
if (isempty (df._header))
df._header = dummy;
else
df._header(end+1, 1:length (dummy)) = dummy;
endif
if (all (cellfun (@isempty, regexp (dummy, trigger, 'match'))))
continue;
endif
break;
endwhile
endif
if (indl > length (lines))
%# nothing left after the header
x = [];
else
%# preallocate one row per remaining line; the column count comes
%# from the size of whatever dummy currently holds
x = cell (1+length (lines)-indl, size (dummy, 2));
%# empty_lines: output rows to delete afterwards (empty or comment lines)
empty_lines = []; cmt_lines = [];
while (indl <= length (lines))
dummy = content{indl};
if (all (cellfun ('size', dummy, 2) == 0))
empty_lines = [empty_lines indj];
indl = indl + 1; indj = indj + 1;
continue;
endif
%# does it looks like a comment line ?
%# (char(35) is the hash sign, optionally preceded by white space)
if (regexp (dummy{1}, ['^\s*' char(35)]))
empty_lines = [empty_lines indj];
%# the fields are glued back together and kept for df._cmt
cmt_lines = strvcat (cmt_lines, horzcat (dummy{:}));
indl = indl + 1; indj = indj + 1;
continue;
endif
%# try to convert to float
%# (first apply the user-supplied 'conv' substitution, if any)
if (~ isempty(conv_regexp))
dummy = regexprep (dummy, conv_regexp{});
endif
the_line = cellfun (@(x) sscanf (x, "%f"), dummy, \
'UniformOutput', false);
%# indk: index of the input field; indm: index of the output column.
%# They diverge when a date spans several input fields.
indk = 1; indm = 1;
while (indk <= size (the_line, 2))
%# no value or more than one value scanned => keep the field as text
if (isempty (the_line{indk}) || any (size (the_line{indk}) > 1))
%#if indi > 1 && indk > 1, disp('line 117 '); keyboard; %#endif
if (unquot)
try
%# remove quotes and leading space(s)
x(indj, indm) = regexp (dummy{indk}, '[^''" ].*[^''"]', 'match'){1};
catch
%# if the previous test fails, try a simpler one
in = regexp (dummy{indk}, '[^'' ]+', 'match');
if (~isempty (in))
x(indj, indm) = in{1};
%# else
%# x(indj, indk) = [];
endif
end_try_catch
else
%# no conversion possible, store and remove leading space(s)
x(indj, indm) = regexp (dummy{indk}, '[^ ].*', 'match');
endif
elseif (~isempty (regexp (dummy{indk}, '[/:-]')) && ...
~isempty (datefmt))
%# does it look like a date ?
datetime = dummy{indk};
if (datefields > 1)
%# concatenate the required number of fields
indc = 1;
for indc = (2:datefields)
datetime = cstrcat(datetime, ' ', dummy{indk+indc-1});
endfor
else
%# ensure spaces are unique
datetime = regexprep (datetime, '[ ]+', ' ');
endif
try
%# convert to a date vector, build a time structure from it, then
%# format it with char(37) followed by 's' (the strftime
%# seconds-since-epoch format); the fractional seconds are added back
datetime = datevec (datetime, datefmt);
timeval = struct ("usec", 0, "sec", floor (datetime (6)),
"min", datetime(5), "hour", datetime(4),
"mday", datetime(3), "mon", datetime(2)-1,
"year", datetime(1)-1900);
timeval.usec = 1e6*(datetime(6) - timeval.sec);
x(indj, indm) = str2num (strftime ([char(37) 's'], timeval)) + ...
timeval.usec * 1e-6;
if (datefields > 1)
%# skip fields successfully converted
indk = indk + (datefields - 1);
endif
catch
%# store it as is
x(indj, indm) = the_line{indk};
end_try_catch
else
%# plain numeric field
x(indj, indm) = the_line{indk};
endif
indk = indk + 1; indm = indm + 1;
endwhile
indl = indl + 1; indj = indj + 1;
endwhile
%# drop the rows reserved for empty and comment lines
if (~isempty (empty_lines))
x(empty_lines, :) = [];
endif
%# detect empty columns
empty_lines = find (0 == sum (cellfun ('size', x, 2)));
if (~isempty (empty_lines))
x(:, empty_lines) = [];
endif
endif
clear UTF8_BOM fid in lines indl the_line content empty_lines
clear datetime timeval idx
endif
end_try_catch
endif
%# fallback, avoiding a recursive call
%# indj = target column range: right after the existing columns
idx.type = '()'; indj = [];
if (~isa (x, 'char')) %# x may be a cell array, a simple matrix, ...
indj = df._cnt(2)+(1:size (x, 2));
else
%# at this point, reading some filename failed
%# (x is still a char: the file could not be opened, or x is a
%# char array with more than one row, which is never tried as a file)
error ("dataframe: can't open '%s' for reading data", x);
endif;
if (iscell (x)) %# x was filled with fields read from the CSV
if (and (isvector (x), 2 == length (x)))
%# use the intermediate value as destination column
%# (df_name2idx and df_pad are helpers defined outside this file)
[indc, ncol] = df_name2idx (df._name{2}, x{1}, df._cnt(2), "column");
if (ncol ~= 1)
error (["With two-elements cell, the first should resolve " ...
"to a single column"]);
endif
%# take the column classes from the second row of the initialiser,
%# falling back on the first row when there is no second one
try
dummy = cellfun (@class, x{2}(2, :), 'UniformOutput', false);
catch
dummy = cellfun (@class, x{2}(1, :), 'UniformOutput', false);
end_try_catch
df = df_pad (df, 2, [length(dummy) indc], dummy);
x = x{2};
indj = indc + (1:size (x, 2)); %# redefine target range
elseif (isa (x{1}, 'cell'))
x = x{1}; %# remove one cell level
endif
%# generate column names from the caller's argument name when none
%# exist yet for the target range
if (length (df._name{2}) < indj(1) || isempty (df._name{2}(indj)))
[df._name{2}(indj, 1), df._over{2}(1, indj)] ...
= df_colnames (inputname(indi), indj);
df._name{2} = genvarname (df._name{2});
endif
%# allow overwriting of column names
df._over{2}(1, indj) = true;
elseif (~isempty (indj))
%# x is an array, generates fieldnames from names given as args
%# a single given column name is used as base for all the new columns
if (1 == length (df._name{2}) && length (df._name{2}) < \
length (indj))
[df._name{2}(indj, 1), df._over{2}(1, indj)] ...
= df_colnames (char (df._name{2}), indj);
elseif (length (df._name{2}) < indj(1) || isempty (df._name{2}(indj)))
[df._name{2}(indj, 1), df._over{2}(1, indj)] ...
= df_colnames (inputname(indi), indj);
endif
df._name{2} = genvarname (df._name{2});
endif
if (~isempty (indj))
%# the exact row size will be determined latter
idx.subs = {'', indj};
%# use direct assignement
if (ndims (x) > 2), idx.subs{3} = 1:size (x, 3); endif
%# df = subsasgn(df, idx, x); <= call directly lower level
%# NOTE(review): on failure the error is not propagated; a marker is
%# printed and the interpreter drops into the interactive debugger
%# ('keyboard'). This looks like a debugging leftover -- confirm
%# whether it should be replaced by a rethrow.
try
df = df_matassign (df, idx, indj, length (indj), x);
catch
disp ('line 443 '); keyboard
end_try_catch
%# attach the comment lines collected while reading the file
if (~isempty (cmt_lines))
df._cmt = vertcat (df._cmt, cellstr (cmt_lines));
cmt_lines = [];
endif
else
%# no data columns to add: the column count follows the names
df._cnt(2) = length (df._name{2});
endif
elseif (indi > 1)
%# a dataframe is only accepted as first argument
error ('Concatenating dataframes: use cat instead');
endif
try
%# loop over next variable argument
x = varargin{1, indi};
catch
%# out of arguments: the index error is deliberately ignored, the
%# loop condition terminates the loop
%# disp('line 197 ???');
end_try_catch
endwhile
endfunction
function [x, y] = df_colnames(base, num)
  %# Auxiliary generator of column names, kept next to the constructor
  %# because inputname() can only be used from there.
  %#
  %# base: text a name is derived from (a variable name, an expression
  %#       containing "=", or a single-quoted string)
  %# num:  target column indexes; when there is more than one, each
  %#       index is appended to the base name
  %#
  %# x: cellstr of generated names, one row per element of num
  %# y: logical flag(s), one per element of num; true whenever the name
  %#    is the default 'X' or was taken from a quoted string, false when
  %#    it is a valid variable name taken from base
  fallback = 'X'; %# default name, may be changed later by the caller

  if (any ([index(base, "=")]))
    %# an assignment-like text: only its left-hand side is considered
    pieces = strsplit (base, "=");
    stem = deblank (pieces{1});
    y = ~isvarname (stem);
    if (y)
      stem = fallback;
    endif
  else
    %# something between single quotes is most probably a filename
    quoted = regexp (base, '''[^''].*[^'']''', 'match');
    if (~isempty (quoted))
      stem = quoted{1};
      y = true;
    elseif (isvarname (base))
      stem = base;
      y = false;
    else
      stem = fallback;
      y = true;
    endif
  endif

  howmany = numel (num);
  if (howmany > 1)
    %# one row per column: base name followed by the left-justified index
    suffix = strjust (num2str (num(:)), 'left');
    stem = cstrcat (repmat (stem, howmany, 1), suffix);
    y = repmat (y, 1, howmany);
  endif

  x = cellstr (stem);
endfunction