[R-gregmisc-users] SF.net SVN: r-gregmisc:[1342] trunk/gdata
Brought to you by:
warnes
From: <wa...@us...> - 2009-07-16 02:49:24
|
Revision: 1342 http://r-gregmisc.svn.sourceforge.net/r-gregmisc/?rev=1342&view=rev Author: warnes Date: 2009-07-16 02:49:11 +0000 (Thu, 16 Jul 2009) Log Message: ----------- Add support for using tab for field separator during translation from xls format in read.xls Modified Paths: -------------- trunk/gdata/NAMESPACE trunk/gdata/R/read.xls.R trunk/gdata/man/read.xls.Rd Added Paths: ----------- trunk/gdata/inst/perl/xls2tab.pl Modified: trunk/gdata/NAMESPACE =================================================================== --- trunk/gdata/NAMESPACE 2009-05-09 05:45:13 UTC (rev 1341) +++ trunk/gdata/NAMESPACE 2009-07-16 02:49:11 UTC (rev 1342) @@ -32,6 +32,8 @@ wideByFactor, write.fwf, xls2csv, + xls2tab, + xls2sep, ## Object size stuff object.size, as.object_size, is.object_size, humanReadable, Modified: trunk/gdata/R/read.xls.R =================================================================== --- trunk/gdata/R/read.xls.R 2009-05-09 05:45:13 UTC (rev 1341) +++ trunk/gdata/R/read.xls.R 2009-07-16 02:49:11 UTC (rev 1342) @@ -4,126 +4,170 @@ dQuote.ascii <- function(x) paste('"',x,'"',sep='') -xls2csv <- function(xls, sheet = 1, verbose=FALSE, ..., perl="perl") - { +xls2csv <- function(xls, sheet=1, verbose=FALSE, ..., perl="perl") + xls2sep(xls=xls, sheet=sheet, verbose=verbose, ..., method="csv", + perl="perl") +xls2tab <- function(xls, sheet=1, verbose=FALSE, ..., perl="perl") + xls2sep(xls=xls, sheet=sheet, verbose=verbose, ..., method="tab", + perl="perl") - ### - # directories - package.dir <- .path.package('gdata') - perl.dir <- file.path(package.dir,'perl') - # - ### +xls2sep <- function(xls, sheet = 1, verbose=FALSE, ..., + method=c("csv","tab"), perl="perl") + { + + method <- match.arg(method) + + ## + ## directories + package.dir <- .path.package('gdata') + perl.dir <- file.path(package.dir,'perl') + ## - ### - # files + ## + ## files + tf <- NULL + if (substring(xls, 1, 7) == "http://") { + tf <- paste(tempfile(), "xls", sep = ".") + if(verbose) + cat("Downloading", + dQuote.ascii(xls), " to ", + dQuote.ascii(tf), "...\n") + else + cat("Downloading...\n") + download.file(xls, tf, mode = "wb") + cat("Done.\n") + xls <- tf + } - tf <- NULL - if (substring(xls, 1, 7) == "http://") { - tf <- paste(tempfile(), "xls", sep = ".") - if(verbose) - cat("Downloading", - dQuote.ascii(xls), " to ", - dQuote.ascii(tf), "...\n") - else - cat("Downloading...\n") - download.file(xls, tf, mode = "wb") - cat("Done.\n") - xls <- tf - } + if(file.access(xls, 4)!=0) + stop("Unable to read xls file '", xls, "'." ) - if(file.access(xls, 4)!=0) - stop("Unable to read xls file '", xls, "'." ) + if(method=="csv") + { + script <- file.path(perl.dir,'xls2csv.pl') + targetFile <- paste(tempfile(), "csv", sep = ".") + } + else if(method=="tab") + { + script <- file.path(perl.dir,'xls2tab.pl') + targetFile <- paste(tempfile(), "tab", sep = ".") + } + else + { + stop("Unknown method", method) + } + + ## + ## - xls2csv <- file.path(perl.dir,'xls2csv.pl') - csv <- paste(tempfile(), "csv", sep = ".") - # - ### + ## + ## execution command + cmd <- paste(perl, script, dQuote.ascii(xls), dQuote.ascii(targetFile), + sheet, sep=" ") + ## + ## - ### - # execution command - cmd <- paste(perl, xls2csv, dQuote.ascii(xls), dQuote.ascii(csv), - sheet, sep=" ") - # - ### - if(verbose) { cat("\n") cat("Converting xls file\n") cat(" ", dQuote.ascii(xls), "\n") - cat("to csv file \n") - cat(" ", dQuote.ascii(csv), "\n") + cat("to", method, " file \n") + cat(" ", dQuote.ascii(targetFile), "\n") cat("... \n\n") } - else - cat("Converting xls file to csv file... ") + else + cat("Converting xls file to", method, "file... ") - ### - # do the translation - if(verbose) cat("Executing ", cmd, "... \n\n") - # - results <- system(cmd, intern=!verbose) - # - if (verbose) cat("Done.\n\n") - # - ### + ## + ## do the translation + if(verbose) cat("Executing ", cmd, "... \n\n") + # + results <- system(cmd, intern=!verbose) + # + if (verbose) cat("Done.\n\n") + # + ## - if(file.access(csv, 4)!=0) - stop("Unable to read translated csv file '", csv, "'." ) + if(file.access(targetFile, 4)!=0) + stop("Unable to read translated", method, "file '", targetFile, "'." ) - cat("Done.\n") + cat("Done.\n") - # prepare for cleanup now, in case of error reading file - file(csv) -} + ## prepare for cleanup now, in case of error reading file + file(targetFile) + } -read.xls <- function(xls, sheet = 1, verbose=FALSE, pattern, ..., perl="perl") { - con <- tfn <- NULL - on.exit({ - if (inherits(con, "connection") && isOpen(con)) close(con) - if (file.exists(tfn)) file.remove(tfn) - }) +read.xls <- function(xls, sheet = 1, verbose=FALSE, pattern, ..., + method=c("csv","tab"), perl="perl") +{ + con <- tfn <- NULL + on.exit({ + if (inherits(con, "connection") && isOpen(con)) close(con) + if (file.exists(tfn)) file.remove(tfn) + }) - ## expand file path, translating ~ to user's home directory, etc. - xls <- path.expand(xls) + method <- match.arg(method) + + ## expand file path, translating ~ to user's home directory, etc. + xls <- path.expand(xls) - ## translate from xls to csv format (returns csv file name) - con <- xls2csv(xls, sheet, verbose=verbose, ..., perl = perl) + ## translate from xls to csv/tab format (returns csv file name) + if(method=="csv") + con <- xls2csv(xls, sheet, verbose=verbose, ..., perl = perl) + else if(method=="tab") + con <- xls2tab(xls, sheet, verbose=verbose, ..., perl = perl) + else + stop("Unknown method", method) - ## load the csv file - open(con) - tfn <- summary(con)$description - if (missing(pattern)) - { - if(verbose) - cat("Reading csv file ", dQuote.ascii(tfn), "...\n") - else - cat("Reading csv file... ") - retval <- read.csv(con, ...) - cat("Done.\n") - } - else { - cat("Searching for lines containing pattern ", pattern, "... ") - idx <- grep(pattern, readLines(con)) - if (length(idx) == 0) { - warning("pattern not found") - return(NULL) - } - cat("Done.\n") - - seek(con, 0) + ## load the csv file + open(con) + tfn <- summary(con)$description + if (missing(pattern)) + { + if(verbose) + cat("Reading", method, "file ", dQuote.ascii(tfn), "...\n") + else + cat("Reading", method, "file... ") + + if(method=="csv") + retval <- read.csv(con, ...) + else if (method=="tab") + retval <- read.delim(con, ...) + else + stop("Unknown method", method) + + cat("Done.\n") + } + else { + cat("Searching for lines containing pattern ", pattern, "... ") + idx <- grep(pattern, readLines(con)) + if (length(idx) == 0) { + warning("pattern not found") + return(NULL) + } + cat("Done.\n") + + seek(con, 0) - if(verbose) - cat("Reading csv file ", dQuote.ascii(tfn), "...\n") - else - cat("Reading csv file... ") - retval <- read.csv(con, skip = idx[1]-1, ...) - cat("Done.\n") - } - retval + if(verbose) + cat("Reading", method, "file ", dQuote.ascii(tfn), "...\n") + else + cat("Reading", method, "file... ") + + if(method=="csv") + retval <- read.csv(con, skip = idx[1]-1, ...) + else if (method=="tab") + retval <- read.delim(con, skip = idx[1]-1, ...) + else + stop("Unknown method", method) + + cat("Done.\n") + } + retval } Copied: trunk/gdata/inst/perl/xls2tab.pl (from rev 1341, trunk/gdata/inst/perl/xls2csv.pl) =================================================================== --- trunk/gdata/inst/perl/xls2tab.pl (rev 0) +++ trunk/gdata/inst/perl/xls2tab.pl 2009-07-16 02:49:11 UTC (rev 1342) @@ -0,0 +1,166 @@ +#!/bin/env perl + +BEGIN { +use File::Basename; +unshift(@INC, dirname $0); +} + +use strict; +use Spreadsheet::ParseExcel; + +# declare some varibles local +my($row, $col, $sheet, $cell, $usage, $basename, $sheetnumber, $filename); + +## +## Usage information +## +$usage = <<EOF; + +xls2tab.pl <excel file> [<output file>] [<worksheet number>] + +Translate the Microsoft Excel spreadsheet file contained in +<excel file> into tab separated value format (TAB) and store +in <output file>. + +If <output file> is not specified, the output file will have the +same name as the input file with '.xls' or '.XLS' (if any) +removed and '.tab' appended. + +If no worksheet number is given, each worksheet will be written to +a separate file with the name '<output file>_<worksheet name>.tab'. + +EOF + +## +## parse arguments +## + +if(!defined($ARGV[0])) + { + print $usage; + exit 1; + } + +$basename = $ARGV[1]; +$basename =~ s/.tab//; +if ($basename eq "") + { + my @path; + @path = split(/[\/\\]/, $ARGV[0]); # split on file separator + $basename = $path[$#path]; + $basename =~ s/.xls//i; + } + +if(defined($ARGV[2]) ) + { + $sheetnumber = $ARGV[2]; + die "Sheetnumber must be an integer larger than 0." if $sheetnumber < 1; + } + +## +## open spreadsheet +## + +my $oExcel = new Spreadsheet::ParseExcel; + +print "Loading $ARGV[0] ...\n"; + +open(FH, "<$ARGV[0]") or die "Unable to open file '$ARGV[0]'.\n"; +close(FH); + +my $oBook = $oExcel->Parse($ARGV[0]); + +print "\n"; +print "Orignal Filename :", $oBook->{File} , "\n"; +print "Number of Sheets :", $oBook->{SheetCount} , "\n"; +print "Author :", $oBook->{Author} , "\n"; +print "\n"; + +my @sheetlist = (@{$oBook->{Worksheet}}); +if (defined($sheetnumber)) + { + @sheetlist=($sheetlist[$sheetnumber-1]); + } + +## +## iterate across each worksheet, writing out a separat tab file +## + +my $i=0; +foreach my $sheet (@sheetlist) +{ + $i++; + + my $sheetname = $sheet->{Name}; + if(defined($sheetnumber)) + { + $filename = "${basename}.tab"; + } + else + { + $filename = "${basename}_${sheetname}.tab"; + } + + print "Writing Sheet number $i ('$sheetname') to file '$filename'\n"; + + open(OutFile,">$filename"); + + my $cumulativeBlankLines=0; + + my $minrow = $sheet->{MinRow}; + my $maxrow = $sheet->{MaxRow}; + my $mincol = $sheet->{MinCol}; + my $maxcol = $sheet->{MaxCol}; + + print "Minrow=$minrow Maxrow=$maxrow Mincol=$mincol Maxcol=$maxcol\n"; + + for(my $row = $minrow; $row <= $maxrow; $row++) + { + my $outputLine = ""; + + for(my $col = $mincol; $col <= $maxcol; $col++) + { + my $cell = $sheet->{Cells}[$row][$col]; + if( defined($cell) ) + { + $_=$cell->Value; #{Val}; + + # convert '#NUM!' strings to missing (empty) values + s/#NUM!//; + + # escape double-quote characters in the data since + # they are used as field delimiters + s/\"/\\\"/g; + } + else + { + $_ = ''; + } + + $outputLine .= "\"" . $_ . "\"" if(length($_)>0); + + # separate cells with tabs + $outputLine .= "\t" if( $col != $maxcol) ; + + } + + #$outputLine =~ s/[\t ]+$//g; ## strip off trailing blanks and tabs + + # skip blank/empty lines + if( $outputLine =~ /^[\t ]*$/ ) + { + $cumulativeBlankLines++ + } + else + { + print OutFile "$outputLine \n" + } + } + + close OutFile; + + print " (Ignored $cumulativeBlankLines blank lines.)\n" + if ($cumulativeBlankLines); + print "\n"; +} + Property changes on: trunk/gdata/inst/perl/xls2tab.pl ___________________________________________________________________ Added: svn:executable + * Added: svn:keywords + Author Date Id Revision Added: svn:mergeinfo + Added: svn:eol-style + native Modified: trunk/gdata/man/read.xls.Rd =================================================================== --- trunk/gdata/man/read.xls.Rd 2009-05-09 05:45:13 UTC (rev 1341) +++ trunk/gdata/man/read.xls.Rd 2009-07-16 02:49:11 UTC (rev 1342) @@ -1,41 +1,53 @@ \name{read.xls} \alias{read.xls} \alias{xls2csv} +\alias{xls2tab} +\alias{xls2sep} \title{Read Excel files} \description{Reads a Microsoft Excel file into a data frame} \usage{ -read.xls(xls, sheet=1, verbose=FALSE, pattern, ..., perl="perl") +read.xls(xls, sheet=1, verbose=FALSE, pattern, ..., + method=c("csv","tab"), perl="perl") + xls2csv(xls, sheet=1, verbose=FALSE, ..., perl="perl") +xls2tab(xls, sheet=1, verbose=FALSE, ..., perl="perl") +xls2sep(xls, sheet=1, verbose=FALSE, ..., method=c("csv","tab"), + perl="perl") } \arguments{ - \item{xls}{name of the Microsoft Excel file. If on internet it - should begin with code{"http://"}.} - \item{sheet}{number of sheet within the Excel file from which data are - to be read} + \item{xls}{path to the Microsoft Excel file. To access a file on the + internet, start the path with "http://".} + \item{sheet}{number of the sheet within the Excel file from which data + are to be read} \item{verbose}{logical flag indicating whether details should be printed as the file is processed.} \item{pattern}{if specified, them skip all lines before the first - containing this string} + containing this string} \item{perl}{name of the perl executable to be called.} + \item{method}{intermediate file format, "csv" for comma-separated and + "tab" for tab-separated} \item{...}{additional arguments to read.table. The defaults of read.csv are used.} } \value{ - \code{"read.xls"} returns a data frame. \code{"xls2csv"} returns a - connection to a temporary file in csv format. + \code{"read.xls"} returns a data frame. + + \code{"xls2sep"} returns a temporary file in the specified format. + \code{"xls2csv"} and \code{"xls2tab"} are simply wrappers for + \code{"xls2sep"} specifying method as "csv" or "tab", respectively. } \details{ This function works translating the named Microsoft Excel file into a - temporary .csv file, using Greg Warnes' xls2csv Perl script (installed - as part of the gregmisc package). + temporary .csv or .tab file, using the xls2csv or xls2tab + Perl script installed as part of this (gdata) package. Caution: In the conversion to csv, strings will be quoted. This can be problem if you are trying to use the \code{comment.char} option of \code{read.table} since the first character of all lines (including comment lines) will be "\"" after conversion. - Caution: With \code{"xls2csv"} it is the responsibility of the user - to close and delete the file after using it. + Caution: If you call \code{"xls2csv"} directly, is your responsibility + to close and delete the file after using it. } \references{http://www.analytics.washington.edu/statcomp/downloads/xls2csv} \note{ Either a working version of Perl must be present in the executable @@ -48,9 +60,22 @@ xlsfile <- file.path(.path.package('gdata'),'xls','iris.xls') xlsfile - iris <- read.xls(xlsfile) + iris <- read.xls(xlsfile) # defaults to csv format + iris <- read.xls(xlsfile,method="csv") # specify csv format + iris <- read.xls(xlsfile,method="tab") # specify tab format + head(iris) # look at the top few rows + \dontshow{ + iris.1 <- read.xls(xlsfile) # defaults to csv format + iris.2 <- read.xls(xlsfile,method="csv") # specify csv format + iris.3 <- read.xls(xlsfile,method="tab") # specify tab format + + stopifnot(all.equal(iris.1, iris.2)) + stopifnot(all.equal(iris.1, iris.3)) + } + + \dontrun{ # Example specifying exact Perl path for default MS-Windows install of # ActiveState perl This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |