[R-gregmisc-users] SF.net SVN: r-gregmisc:[1342] trunk/gdata
Brought to you by:
warnes
|
From: <wa...@us...> - 2009-07-16 02:49:24
|
Revision: 1342
http://r-gregmisc.svn.sourceforge.net/r-gregmisc/?rev=1342&view=rev
Author: warnes
Date: 2009-07-16 02:49:11 +0000 (Thu, 16 Jul 2009)
Log Message:
-----------
Add support for using tab for field separator during translation from xls format in read.xls
Modified Paths:
--------------
trunk/gdata/NAMESPACE
trunk/gdata/R/read.xls.R
trunk/gdata/man/read.xls.Rd
Added Paths:
-----------
trunk/gdata/inst/perl/xls2tab.pl
Modified: trunk/gdata/NAMESPACE
===================================================================
--- trunk/gdata/NAMESPACE 2009-05-09 05:45:13 UTC (rev 1341)
+++ trunk/gdata/NAMESPACE 2009-07-16 02:49:11 UTC (rev 1342)
@@ -32,6 +32,8 @@
wideByFactor,
write.fwf,
xls2csv,
+ xls2tab,
+ xls2sep,
## Object size stuff
object.size, as.object_size, is.object_size, humanReadable,
Modified: trunk/gdata/R/read.xls.R
===================================================================
--- trunk/gdata/R/read.xls.R 2009-05-09 05:45:13 UTC (rev 1341)
+++ trunk/gdata/R/read.xls.R 2009-07-16 02:49:11 UTC (rev 1342)
@@ -4,126 +4,170 @@
dQuote.ascii <- function(x) paste('"',x,'"',sep='')
-xls2csv <- function(xls, sheet = 1, verbose=FALSE, ..., perl="perl")
- {
+xls2csv <- function(xls, sheet=1, verbose=FALSE, ..., perl="perl")
+ xls2sep(xls=xls, sheet=sheet, verbose=verbose, ..., method="csv",
+ perl="perl")
+xls2tab <- function(xls, sheet=1, verbose=FALSE, ..., perl="perl")
+ xls2sep(xls=xls, sheet=sheet, verbose=verbose, ..., method="tab",
+ perl="perl")
- ###
- # directories
- package.dir <- .path.package('gdata')
- perl.dir <- file.path(package.dir,'perl')
- #
- ###
+xls2sep <- function(xls, sheet = 1, verbose=FALSE, ...,
+ method=c("csv","tab"), perl="perl")
+ {
+
+ method <- match.arg(method)
+
+ ##
+ ## directories
+ package.dir <- .path.package('gdata')
+ perl.dir <- file.path(package.dir,'perl')
+ ##
- ###
- # files
+ ##
+ ## files
+ tf <- NULL
+ if (substring(xls, 1, 7) == "http://") {
+ tf <- paste(tempfile(), "xls", sep = ".")
+ if(verbose)
+ cat("Downloading",
+ dQuote.ascii(xls), " to ",
+ dQuote.ascii(tf), "...\n")
+ else
+ cat("Downloading...\n")
+ download.file(xls, tf, mode = "wb")
+ cat("Done.\n")
+ xls <- tf
+ }
- tf <- NULL
- if (substring(xls, 1, 7) == "http://") {
- tf <- paste(tempfile(), "xls", sep = ".")
- if(verbose)
- cat("Downloading",
- dQuote.ascii(xls), " to ",
- dQuote.ascii(tf), "...\n")
- else
- cat("Downloading...\n")
- download.file(xls, tf, mode = "wb")
- cat("Done.\n")
- xls <- tf
- }
+ if(file.access(xls, 4)!=0)
+ stop("Unable to read xls file '", xls, "'." )
- if(file.access(xls, 4)!=0)
- stop("Unable to read xls file '", xls, "'." )
+ if(method=="csv")
+ {
+ script <- file.path(perl.dir,'xls2csv.pl')
+ targetFile <- paste(tempfile(), "csv", sep = ".")
+ }
+ else if(method=="tab")
+ {
+ script <- file.path(perl.dir,'xls2tab.pl')
+ targetFile <- paste(tempfile(), "tab", sep = ".")
+ }
+ else
+ {
+ stop("Unknown method", method)
+ }
+
+ ##
+ ##
- xls2csv <- file.path(perl.dir,'xls2csv.pl')
- csv <- paste(tempfile(), "csv", sep = ".")
- #
- ###
+ ##
+ ## execution command
+ cmd <- paste(perl, script, dQuote.ascii(xls), dQuote.ascii(targetFile),
+ sheet, sep=" ")
+ ##
+ ##
- ###
- # execution command
- cmd <- paste(perl, xls2csv, dQuote.ascii(xls), dQuote.ascii(csv),
- sheet, sep=" ")
- #
- ###
-
if(verbose)
{
cat("\n")
cat("Converting xls file\n")
cat(" ", dQuote.ascii(xls), "\n")
- cat("to csv file \n")
- cat(" ", dQuote.ascii(csv), "\n")
+ cat("to", method, " file \n")
+ cat(" ", dQuote.ascii(targetFile), "\n")
cat("... \n\n")
}
- else
- cat("Converting xls file to csv file... ")
+ else
+ cat("Converting xls file to", method, "file... ")
- ###
- # do the translation
- if(verbose) cat("Executing ", cmd, "... \n\n")
- #
- results <- system(cmd, intern=!verbose)
- #
- if (verbose) cat("Done.\n\n")
- #
- ###
+ ##
+ ## do the translation
+ if(verbose) cat("Executing ", cmd, "... \n\n")
+ #
+ results <- system(cmd, intern=!verbose)
+ #
+ if (verbose) cat("Done.\n\n")
+ #
+ ##
- if(file.access(csv, 4)!=0)
- stop("Unable to read translated csv file '", csv, "'." )
+ if(file.access(targetFile, 4)!=0)
+ stop("Unable to read translated", method, "file '", targetFile, "'." )
- cat("Done.\n")
+ cat("Done.\n")
- # prepare for cleanup now, in case of error reading file
- file(csv)
-}
+ ## prepare for cleanup now, in case of error reading file
+ file(targetFile)
+ }
-read.xls <- function(xls, sheet = 1, verbose=FALSE, pattern, ..., perl="perl") {
- con <- tfn <- NULL
- on.exit({
- if (inherits(con, "connection") && isOpen(con)) close(con)
- if (file.exists(tfn)) file.remove(tfn)
- })
+read.xls <- function(xls, sheet = 1, verbose=FALSE, pattern, ...,
+ method=c("csv","tab"), perl="perl")
+{
+ con <- tfn <- NULL
+ on.exit({
+ if (inherits(con, "connection") && isOpen(con)) close(con)
+ if (file.exists(tfn)) file.remove(tfn)
+ })
- ## expand file path, translating ~ to user's home directory, etc.
- xls <- path.expand(xls)
+ method <- match.arg(method)
+
+ ## expand file path, translating ~ to user's home directory, etc.
+ xls <- path.expand(xls)
- ## translate from xls to csv format (returns csv file name)
- con <- xls2csv(xls, sheet, verbose=verbose, ..., perl = perl)
+ ## translate from xls to csv/tab format (returns csv file name)
+ if(method=="csv")
+ con <- xls2csv(xls, sheet, verbose=verbose, ..., perl = perl)
+ else if(method=="tab")
+ con <- xls2tab(xls, sheet, verbose=verbose, ..., perl = perl)
+ else
+ stop("Unknown method", method)
- ## load the csv file
- open(con)
- tfn <- summary(con)$description
- if (missing(pattern))
- {
- if(verbose)
- cat("Reading csv file ", dQuote.ascii(tfn), "...\n")
- else
- cat("Reading csv file... ")
- retval <- read.csv(con, ...)
- cat("Done.\n")
- }
- else {
- cat("Searching for lines containing pattern ", pattern, "... ")
- idx <- grep(pattern, readLines(con))
- if (length(idx) == 0) {
- warning("pattern not found")
- return(NULL)
- }
- cat("Done.\n")
-
- seek(con, 0)
+ ## load the csv file
+ open(con)
+ tfn <- summary(con)$description
+ if (missing(pattern))
+ {
+ if(verbose)
+ cat("Reading", method, "file ", dQuote.ascii(tfn), "...\n")
+ else
+ cat("Reading", method, "file... ")
+
+ if(method=="csv")
+ retval <- read.csv(con, ...)
+ else if (method=="tab")
+ retval <- read.delim(con, ...)
+ else
+ stop("Unknown method", method)
+
+ cat("Done.\n")
+ }
+ else {
+ cat("Searching for lines containing pattern ", pattern, "... ")
+ idx <- grep(pattern, readLines(con))
+ if (length(idx) == 0) {
+ warning("pattern not found")
+ return(NULL)
+ }
+ cat("Done.\n")
+
+ seek(con, 0)
- if(verbose)
- cat("Reading csv file ", dQuote.ascii(tfn), "...\n")
- else
- cat("Reading csv file... ")
- retval <- read.csv(con, skip = idx[1]-1, ...)
- cat("Done.\n")
- }
- retval
+ if(verbose)
+ cat("Reading", method, "file ", dQuote.ascii(tfn), "...\n")
+ else
+ cat("Reading", method, "file... ")
+
+ if(method=="csv")
+ retval <- read.csv(con, skip = idx[1]-1, ...)
+ else if (method=="tab")
+ retval <- read.delim(con, skip = idx[1]-1, ...)
+ else
+ stop("Unknown method", method)
+
+ cat("Done.\n")
+ }
+ retval
}
Copied: trunk/gdata/inst/perl/xls2tab.pl (from rev 1341, trunk/gdata/inst/perl/xls2csv.pl)
===================================================================
--- trunk/gdata/inst/perl/xls2tab.pl (rev 0)
+++ trunk/gdata/inst/perl/xls2tab.pl 2009-07-16 02:49:11 UTC (rev 1342)
@@ -0,0 +1,166 @@
+#!/bin/env perl
+
+BEGIN {
+use File::Basename;
+unshift(@INC, dirname $0);
+}
+
+use strict;
+use Spreadsheet::ParseExcel;
+
+# declare some varibles local
+my($row, $col, $sheet, $cell, $usage, $basename, $sheetnumber, $filename);
+
+##
+## Usage information
+##
+$usage = <<EOF;
+
+xls2tab.pl <excel file> [<output file>] [<worksheet number>]
+
+Translate the Microsoft Excel spreadsheet file contained in
+<excel file> into tab separated value format (TAB) and store
+in <output file>.
+
+If <output file> is not specified, the output file will have the
+same name as the input file with '.xls' or '.XLS' (if any)
+removed and '.tab' appended.
+
+If no worksheet number is given, each worksheet will be written to
+a separate file with the name '<output file>_<worksheet name>.tab'.
+
+EOF
+
+##
+## parse arguments
+##
+
+if(!defined($ARGV[0]))
+ {
+ print $usage;
+ exit 1;
+ }
+
+$basename = $ARGV[1];
+$basename =~ s/.tab//;
+if ($basename eq "")
+ {
+ my @path;
+ @path = split(/[\/\\]/, $ARGV[0]); # split on file separator
+ $basename = $path[$#path];
+ $basename =~ s/.xls//i;
+ }
+
+if(defined($ARGV[2]) )
+ {
+ $sheetnumber = $ARGV[2];
+ die "Sheetnumber must be an integer larger than 0." if $sheetnumber < 1;
+ }
+
+##
+## open spreadsheet
+##
+
+my $oExcel = new Spreadsheet::ParseExcel;
+
+print "Loading $ARGV[0] ...\n";
+
+open(FH, "<$ARGV[0]") or die "Unable to open file '$ARGV[0]'.\n";
+close(FH);
+
+my $oBook = $oExcel->Parse($ARGV[0]);
+
+print "\n";
+print "Orignal Filename :", $oBook->{File} , "\n";
+print "Number of Sheets :", $oBook->{SheetCount} , "\n";
+print "Author :", $oBook->{Author} , "\n";
+print "\n";
+
+my @sheetlist = (@{$oBook->{Worksheet}});
+if (defined($sheetnumber))
+ {
+ @sheetlist=($sheetlist[$sheetnumber-1]);
+ }
+
+##
+## iterate across each worksheet, writing out a separat tab file
+##
+
+my $i=0;
+foreach my $sheet (@sheetlist)
+{
+ $i++;
+
+ my $sheetname = $sheet->{Name};
+ if(defined($sheetnumber))
+ {
+ $filename = "${basename}.tab";
+ }
+ else
+ {
+ $filename = "${basename}_${sheetname}.tab";
+ }
+
+ print "Writing Sheet number $i ('$sheetname') to file '$filename'\n";
+
+ open(OutFile,">$filename");
+
+ my $cumulativeBlankLines=0;
+
+ my $minrow = $sheet->{MinRow};
+ my $maxrow = $sheet->{MaxRow};
+ my $mincol = $sheet->{MinCol};
+ my $maxcol = $sheet->{MaxCol};
+
+ print "Minrow=$minrow Maxrow=$maxrow Mincol=$mincol Maxcol=$maxcol\n";
+
+ for(my $row = $minrow; $row <= $maxrow; $row++)
+ {
+ my $outputLine = "";
+
+ for(my $col = $mincol; $col <= $maxcol; $col++)
+ {
+ my $cell = $sheet->{Cells}[$row][$col];
+ if( defined($cell) )
+ {
+ $_=$cell->Value; #{Val};
+
+ # convert '#NUM!' strings to missing (empty) values
+ s/#NUM!//;
+
+ # escape double-quote characters in the data since
+ # they are used as field delimiters
+ s/\"/\\\"/g;
+ }
+ else
+ {
+ $_ = '';
+ }
+
+ $outputLine .= "\"" . $_ . "\"" if(length($_)>0);
+
+ # separate cells with tabs
+ $outputLine .= "\t" if( $col != $maxcol) ;
+
+ }
+
+ #$outputLine =~ s/[\t ]+$//g; ## strip off trailing blanks and tabs
+
+ # skip blank/empty lines
+ if( $outputLine =~ /^[\t ]*$/ )
+ {
+ $cumulativeBlankLines++
+ }
+ else
+ {
+ print OutFile "$outputLine \n"
+ }
+ }
+
+ close OutFile;
+
+ print " (Ignored $cumulativeBlankLines blank lines.)\n"
+ if ($cumulativeBlankLines);
+ print "\n";
+}
+
Property changes on: trunk/gdata/inst/perl/xls2tab.pl
___________________________________________________________________
Added: svn:executable
+ *
Added: svn:keywords
+ Author Date Id Revision
Added: svn:mergeinfo
+
Added: svn:eol-style
+ native
Modified: trunk/gdata/man/read.xls.Rd
===================================================================
--- trunk/gdata/man/read.xls.Rd 2009-05-09 05:45:13 UTC (rev 1341)
+++ trunk/gdata/man/read.xls.Rd 2009-07-16 02:49:11 UTC (rev 1342)
@@ -1,41 +1,53 @@
\name{read.xls}
\alias{read.xls}
\alias{xls2csv}
+\alias{xls2tab}
+\alias{xls2sep}
\title{Read Excel files}
\description{Reads a Microsoft Excel file into a data frame}
\usage{
-read.xls(xls, sheet=1, verbose=FALSE, pattern, ..., perl="perl")
+read.xls(xls, sheet=1, verbose=FALSE, pattern, ...,
+ method=c("csv","tab"), perl="perl")
+
xls2csv(xls, sheet=1, verbose=FALSE, ..., perl="perl")
+xls2tab(xls, sheet=1, verbose=FALSE, ..., perl="perl")
+xls2sep(xls, sheet=1, verbose=FALSE, ..., method=c("csv","tab"),
+ perl="perl")
}
\arguments{
- \item{xls}{name of the Microsoft Excel file. If on internet it
- should begin with code{"http://"}.}
- \item{sheet}{number of sheet within the Excel file from which data are
- to be read}
+ \item{xls}{path to the Microsoft Excel file. To access a file on the
+ internet, start the path with "http://".}
+ \item{sheet}{number of the sheet within the Excel file from which data
+ are to be read}
\item{verbose}{logical flag indicating whether details should be
printed as the file is processed.}
\item{pattern}{if specified, them skip all lines before the first
- containing this string}
+ containing this string}
\item{perl}{name of the perl executable to be called.}
+ \item{method}{intermediate file format, "csv" for comma-separated and
+ "tab" for tab-separated}
\item{...}{additional arguments to read.table. The defaults of
read.csv are used.}
}
\value{
- \code{"read.xls"} returns a data frame. \code{"xls2csv"} returns a
- connection to a temporary file in csv format.
+ \code{"read.xls"} returns a data frame.
+
+ \code{"xls2sep"} returns a temporary file in the specified format.
+ \code{"xls2csv"} and \code{"xls2tab"} are simply wrappers for
+ \code{"xls2sep"} specifying method as "csv" or "tab", respectively.
}
\details{
This function works translating the named Microsoft Excel file into a
- temporary .csv file, using Greg Warnes' xls2csv Perl script (installed
- as part of the gregmisc package).
+ temporary .csv or .tab file, using the xls2csv or xls2tab
+ Perl script installed as part of this (gdata) package.
Caution: In the conversion to csv, strings will be quoted. This can be
problem if you are trying to use the \code{comment.char} option of
\code{read.table} since the first character of all lines (including
comment lines) will be "\"" after conversion.
- Caution: With \code{"xls2csv"} it is the responsibility of the user
- to close and delete the file after using it.
+ Caution: If you call \code{"xls2csv"} directly, is your responsibility
+ to close and delete the file after using it.
}
\references{http://www.analytics.washington.edu/statcomp/downloads/xls2csv}
\note{ Either a working version of Perl must be present in the executable
@@ -48,9 +60,22 @@
xlsfile <- file.path(.path.package('gdata'),'xls','iris.xls')
xlsfile
- iris <- read.xls(xlsfile)
+ iris <- read.xls(xlsfile) # defaults to csv format
+ iris <- read.xls(xlsfile,method="csv") # specify csv format
+ iris <- read.xls(xlsfile,method="tab") # specify tab format
+
head(iris) # look at the top few rows
+ \dontshow{
+ iris.1 <- read.xls(xlsfile) # defaults to csv format
+ iris.2 <- read.xls(xlsfile,method="csv") # specify csv format
+ iris.3 <- read.xls(xlsfile,method="tab") # specify tab format
+
+ stopifnot(all.equal(iris.1, iris.2))
+ stopifnot(all.equal(iris.1, iris.3))
+ }
+
+
\dontrun{
# Example specifying exact Perl path for default MS-Windows install of
# ActiveState perl
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|