[R-gregmisc-users] SF.net SVN: r-gregmisc:[1580] trunk/SASxport

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 1580
          http://r-gregmisc.svn.sourceforge.net/r-gregmisc/?rev=1580&view=rev
Author:   warnes
Date:     2012-06-28 01:42:52 +0000 (Thu, 28 Jun 2012)
Log Message:
-----------
New makeSASNames() function creates valid SAS names from a list of R
object or column names, properly handling case conversion (all
converted to upper-case), length restructions (8 characters), and
making duplicate names unique.  This is not as obvious as it appears,
since the process of shortening names can render cause name to become
duplicates, and adding digits to the end of names to make them unique
can make them too long, so a few iterations may be requireed to ensure
valid and unique names.

Modified Paths:
--------------
    trunk/SASxport/R/write.xport.R

Added Paths:
-----------
    trunk/SASxport/R/makeSASNames.R
    trunk/SASxport/tests/testDuplicateNames.R
    trunk/SASxport/tests/testDuplicateNames.Rout.save
    trunk/SASxport/tests/testManyNames.R
    trunk/SASxport/tests/testManyNames.Rout.save

Added: trunk/SASxport/R/makeSASNames.R
===================================================================

--- trunk/SASxport/R/makeSASNames.R	                        (rev 0)
+++ trunk/SASxport/R/makeSASNames.R	2012-06-28 01:42:52 UTC (rev 1580)
@@ -0,0 +1,46 @@
+makeSASNames <- function(names, nchar=8, maxPasses=10)
+  {
+    ## This function takes a vector of potential SAS dataset or
+    ## variable names and converts them into *unique* 8-character
+    ## names.
+
+    # Step 0: converce to uppercase
+    names <- toupper(names)
+    
+    # Step 1: expand/truncate to 8 characters
+    tooLong <- nchar(names)>8
+    if (any(tooLong))
+      {
+        shortNames <- substr(as.character(names), 1, nchar)
+        warning("Truncated ", sum(tooLong), " long names to 8 characters.")
+      }
+    else
+      shortNames <- names
+
+    # concievably, this could take a couple of iterations, because
+    # shortening the names to add digits may create new duplicates...
+    varNames <- shortNames
+    passes <- 0
+    dups <- FALSE
+    while ( any(duplicated(varNames)) && passes<maxPasses )
+      {
+        passes <- passes+1
+        dups <- duplicated(varNames)
+        repeatCount <- table(varNames[dups])
+        digitChars <- nchar(as.character(repeatCount))+1
+        names(digitChars) <- names(repeatCount)
+        newNames <- make.names(substr(varNames, 1, nchar-digitChars[varNames]), unique=TRUE)
+        changed <- newNames != names
+        
+        ##newNames[changed] <- gsub("\\.([0-9]+)$","\\1", newNames[changed])
+        varNames <- newNames
+      }
+
+    if(any(duplicated(varNames)))
+      stop("Unable to make all names unique after ", passes, " passes.")
+    
+    if(any(dups))
+      warning("Made ",sum(dups)," duplicate names unique.")
+
+    varNames
+  }

Modified: trunk/SASxport/R/write.xport.R
===================================================================
--- trunk/SASxport/R/write.xport.R	2012-06-28 01:37:19 UTC (rev 1579)
+++ trunk/SASxport/R/write.xport.R	2012-06-28 01:42:52 UTC (rev 1580)
@@ -88,33 +88,8 @@
 
     #######
     ##
-    scat("Check length of object names...\n")
-    long.names <- which(nchar(dfNames)>8)
-    if(length(long.names)>0)
-      {
-        old.names <- dfNames[long.names]
-        new.names <- substr(old.names, 1, 8 )
-        
-        warning("Truncating object names with more than 8 characters. ",
-                paste(long.names,
-                      ":'",
-                      old.names,
-                      "' --> '",
-                      new.names,
-                      "'",
-                      sep="",
-                      collapse=", " ))
-        
-        dfNames[long.names] <- new.names
-      }
-
-    #######
-    ##
     scat("Ensure object names are valid and unique...\n")
-    dfNames <- substr(make.names(dfNames, unique=TRUE),1,8)
-    if( all(names(dfList)!=dfNames))
-      warning("Data frame names modified to obey SAS rules")
-    names(dfList) <- dfNames
+    names(dfList) <- dfNames <- makeSASNames(dfNames)
     ##
     #######
 
@@ -174,14 +149,9 @@
             dfList[[i]] <- df
           }
 
-        varNames <- substr(make.names(colnames(df), unique=TRUE),1,8)
-        if( any(colnames(df)!=varNames))
-          {
-            warning("Variable names modified to obey SAS rules")
-            colnames(df) <- varNames
-            dfList[[i]] <- df
-          }
-
+        
+        colnames(dfList[[i]]) <- colnames(df) <- varNames <- makeSASNames(colnames(df))
+        
         offsetTable <- data.frame("name"=varNames, "len"=NA, "offset"=NA )
         rownames(offsetTable) <- offsetTable[,"name"]
 

Added: trunk/SASxport/tests/testDuplicateNames.R
===================================================================
--- trunk/SASxport/tests/testDuplicateNames.R	                        (rev 0)
+++ trunk/SASxport/tests/testDuplicateNames.R	2012-06-28 01:42:52 UTC (rev 1580)
@@ -0,0 +1,16 @@
+library(SASxport)
+
+
+##tests
+example(read.xport)
+
+# Duplicate df names
+write.xport("AA"=w$test,"Aa"=w$test,"aA"=w$test,"aa"=w$test, file="dn.a.xpt") #1.a
+read.xport("dn.a.xpt")
+
+# Duplicate column names
+a = w$test
+b = w$test
+colnames(b) <- tolower(colnames(b))
+write.xport("combined"=cbind(a, b), file="dn.b.xpt")
+read.xport("dn.b.xpt")

Added: trunk/SASxport/tests/testDuplicateNames.Rout.save
===================================================================
--- trunk/SASxport/tests/testDuplicateNames.Rout.save	                        (rev 0)
+++ trunk/SASxport/tests/testDuplicateNames.Rout.save	2012-06-28 01:42:52 UTC (rev 1580)
@@ -0,0 +1,159 @@
+
+R version 2.15.0 (2012-03-30)
+Copyright (C) 2012 The R Foundation for Statistical Computing
+ISBN 3-900051-07-0
+Platform: i386-apple-darwin9.8.0/i386 (32-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> library(SASxport)
+
+Loaded SASxport version 1.3.0 (2012-06-21).
+
+  Type `?SASxport' for usage information.
+
+> 
+> 
+> ##tests
+> example(read.xport)
+
+rd.xpr> # -------
+rd.xpr> # SAS code to generate test dataset:
+rd.xpr> # -------
+rd.xpr> # libname y SASV5XPT "test2.xpt";
+rd.xpr> #
+rd.xpr> # PROC FORMAT; VALUE race 1=green 2=blue 3=purple; RUN;
+rd.xpr> # PROC FORMAT CNTLOUT=format;RUN;  * Name, e.g. 'format', unimportant;
+rd.xpr> # data test;
+rd.xpr> # LENGTH race 3 age 4;
+rd.xpr> # age=30; label age="Age at Beginning of Study";
+rd.xpr> # race=2;
+rd.xpr> # d1='3mar2002'd ;
+rd.xpr> # dt1='3mar2002 9:31:02'dt;
+rd.xpr> # t1='11:13:45't;
+rd.xpr> # output;
+rd.xpr> #
+rd.xpr> # age=31;
+rd.xpr> # race=4;
+rd.xpr> # d1='3jun2002'd ;
+rd.xpr> # dt1='3jun2002 9:42:07'dt;
+rd.xpr> # t1='11:14:13't;
+rd.xpr> # output;
+rd.xpr> # format d1 mmddyy10. dt1 datetime. t1 time. race race.;
+rd.xpr> # run;
+rd.xpr> # data z; LENGTH x3 3 x4 4 x5 5 x6 6 x7 7 x8 8;
+rd.xpr> #    DO i=1 TO 100;
+rd.xpr> #        x3=ranuni(3);
+rd.xpr> #        x4=ranuni(5);
+rd.xpr> #        x5=ranuni(7);
+rd.xpr> #        x6=ranuni(9);
+rd.xpr> #        x7=ranuni(11);
+rd.xpr> #        x8=ranuni(13);
+rd.xpr> #        output;
+rd.xpr> #        END;
+rd.xpr> #    DROP i;
+rd.xpr> #    RUN;
+rd.xpr> # PROC MEANS; RUN;
+rd.xpr> # PROC COPY IN=work OUT=y;SELECT test format z;RUN; *Creates test2.xpt;
+rd.xpr> # ------
+rd.xpr> 
+rd.xpr> # Read this dataset from a local file:
+rd.xpr> ## Not run: 
+rd.xpr> ##D w <- read.xport('test2.xpt')
+rd.xpr> ## End(Not run)
+rd.xpr> 
+rd.xpr> # Or read a copy of test2.xpt available on the web:
+rd.xpr> host <- 'http://biostat.mc.vanderbilt.edu'
+
+rd.xpr> path <- '/cgi-bin/viewvc.cgi/*checkout*/Hmisc/trunk/tests/test2.xpt'
+
+rd.xpr> url <- paste(host,path,sep="")
+
+rd.xpr> w <- read.xport(url)
+
+rd.xpr> # We can also get the dataset wrapped in a list
+rd.xpr> w <- read.xport(url, as.list=TRUE)
+
+rd.xpr> # And we can ask for the format information to be included as well.
+rd.xpr> w <- read.xport(url, as.list=TRUE, include.formats=TRUE)
+
+rd.xpr> ## Don't show: 
+rd.xpr> SASxport:::assert( is.data.frame(w)==FALSE && is.list(w)==TRUE )
+
+rd.xpr> ## End Don't show
+rd.xpr> 
+rd.xpr> 
+rd.xpr> ## Not run: 
+rd.xpr> ##D ## The Hmisc library provides many useful functions for interacting with
+rd.xpr> ##D ## data imported from SAS via read.xport()
+rd.xpr> ##D library(Hmisc)
+rd.xpr> ##D 
+rd.xpr> ##D describe(w$test)   # see labels, format names for dataset test
+rd.xpr> ##D lapply(w, describe)# see descriptive stats in more detaiil for each variable
+rd.xpr> ##D 
+rd.xpr> ##D contents(w$test)   # another way to see variable attributes
+rd.xpr> ##D lapply(w, contents)# show contents of individual items in more detail
+rd.xpr> ##D 
+rd.xpr> ##D options(digits=7)  # compare the following matrix with PROC MEANS output
+rd.xpr> ##D t(sapply(w$z, function(x)
+rd.xpr> ##D  c(Mean=mean(x),SD=sqrt(var(x)),Min=min(x),Max=max(x))))
+rd.xpr> ## End(Not run)
+rd.xpr> 
+rd.xpr> 
+rd.xpr> 
+rd.xpr> 
+> 
+> # Duplicate df names
+> write.xport("AA"=w$test,"Aa"=w$test,"aA"=w$test,"aa"=w$test, file="dn.a.xpt") #1.a
+Warning message:
+In makeSASNames(dfNames) : Made 3 duplicate names unique.
+> read.xport("dn.a.xpt")
+$aa
+  RACE AGE         D1                    DT1       T1
+1 blue  30 2002-03-03 (03 Mar 2002 04:31:02) 11:13:45
+2       31 2002-06-03 (03 Jun 2002 04:42:07) 11:14:13
+
+$aa.1
+  RACE AGE         D1                    DT1       T1
+1 blue  30 2002-03-03 (03 Mar 2002 04:31:02) 11:13:45
+2       31 2002-06-03 (03 Jun 2002 04:42:07) 11:14:13
+
+$aa.2
+  RACE AGE         D1                    DT1       T1
+1 blue  30 2002-03-03 (03 Mar 2002 04:31:02) 11:13:45
+2       31 2002-06-03 (03 Jun 2002 04:42:07) 11:14:13
+
+$aa.3
+  RACE AGE         D1                    DT1       T1
+1 blue  30 2002-03-03 (03 Mar 2002 04:31:02) 11:13:45
+2       31 2002-06-03 (03 Jun 2002 04:42:07) 11:14:13
+
+> 
+> # Duplicate column names
+> a = w$test
+> b = w$test
+> colnames(b) <- tolower(colnames(b))
+> write.xport("combined"=cbind(a, b), file="dn.b.xpt")
+Warning message:
+In makeSASNames(colnames(df)) : Made 5 duplicate names unique.
+> read.xport("dn.b.xpt")
+  RACE AGE         D1                    DT1       T1 RACE.1 AGE.1       D1.1
+1 blue  30 2002-03-03 (03 Mar 2002 04:31:02) 11:13:45   blue    30 2002-03-03
+2       31 2002-06-03 (03 Jun 2002 04:42:07) 11:14:13           31 2002-06-03
+                   DT1.1     T1.1
+1 (03 Mar 2002 04:31:02) 11:13:45
+2 (03 Jun 2002 04:42:07) 11:14:13
+> 
+> proc.time()
+   user  system elapsed 
+  0.430   0.031   2.232 

Added: trunk/SASxport/tests/testManyNames.R
===================================================================
--- trunk/SASxport/tests/testManyNames.R	                        (rev 0)
+++ trunk/SASxport/tests/testManyNames.R	2012-06-28 01:42:52 UTC (rev 1580)
@@ -0,0 +1,30 @@
+library(SASxport)
+
+data(iris)
+
+# to 'stress test' use 200 instead of 20
+ncopies <- 20
+
+## create a data file containing ncopies separate copies of the iris *dataframe*
+manyDF.out <- rep(list(iris),ncopies)
+names(manyDF.out) <- rep("iris", ncopies)
+manyDF.out$file <- "manyDF.xport"
+
+do.call(write.xport, manyDF.out)
+manyDF.in <- read.xport(file="manyDF.xport")
+names(manyDF.in)
+head(manyDF.in[[ncopies]])
+tail(manyDF.in[[ncopies]])
+stopifnot( all( sapply(manyDF.in, dim)==c(150,5) ) )
+stopifnot( all( sapply(manyDF.in, colnames) == colnames(manyDF.in[[ncopies]]) ) )
+
+## create a data file containing a single dataframe that holds ncopies copies of
+## the *columns* of the iris dataframe
+manyCols.out <- do.call(cbind, rep(list(iris),ncopies))
+dim(manyCols.out)
+write.xport(manyCols.out, file="manyCols.xport")
+
+manyCols.in <- read.xport("manyCols.xport")
+names(manyCols.in)
+stopifnot(dim(manyCols.in)==dim(manyCols.out))
+

Added: trunk/SASxport/tests/testManyNames.Rout.save
===================================================================
--- trunk/SASxport/tests/testManyNames.Rout.save	                        (rev 0)
+++ trunk/SASxport/tests/testManyNames.Rout.save	2012-06-28 01:42:52 UTC (rev 1580)
@@ -0,0 +1,98 @@
+
+R version 2.15.0 (2012-03-30)
+Copyright (C) 2012 The R Foundation for Statistical Computing
+ISBN 3-900051-07-0
+Platform: i386-apple-darwin9.8.0/i386 (32-bit)
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+> library(SASxport)
+
+Loaded SASxport version 1.3.0 (2012-06-21).
+
+  Type `?SASxport' for usage information.
+
+> 
+> data(iris)
+> 
+> # to 'stress test' use 200 instead of 20
+> ncopies <- 20
+> 
+> ## create a data file containing ncopies separate copies of the iris *dataframe*
+> manyDF.out <- rep(list(iris),ncopies)
+> names(manyDF.out) <- rep("iris", ncopies)
+> manyDF.out$file <- "manyDF.xport"
+> 
+> do.call(write.xport, manyDF.out)
+There were 21 warnings (use warnings() to see them)
+> manyDF.in <- read.xport(file="manyDF.xport")
+> names(manyDF.in)
+ [1] "iris"    "iris.1"  "iris.2"  "iris.3"  "iris.4"  "iris.5"  "iris.6" 
+ [8] "iris.7"  "iris.8"  "iris.9"  "iris.10" "iris.11" "iris.12" "iris.13"
+[15] "iris.14" "iris.15" "iris.16" "iris.17" "iris.18" "iris.19"
+> head(manyDF.in[[ncopies]])
+  SEPAL.LE SEPAL.WI PETAL.LE PETAL.WI SPECIES
+1      5.1      3.5      1.4      0.2  setosa
+2      4.9      3.0      1.4      0.2  setosa
+3      4.7      3.2      1.3      0.2  setosa
+4      4.6      3.1      1.5      0.2  setosa
+5      5.0      3.6      1.4      0.2  setosa
+6      5.4      3.9      1.7      0.4  setosa
+> tail(manyDF.in[[ncopies]])
+    SEPAL.LE SEPAL.WI PETAL.LE PETAL.WI   SPECIES
+145      6.7      3.3      5.7      2.5 virginica
+146      6.7      3.0      5.2      2.3 virginica
+147      6.3      2.5      5.0      1.9 virginica
+148      6.5      3.0      5.2      2.0 virginica
+149      6.2      3.4      5.4      2.3 virginica
+150      5.9      3.0      5.1      1.8 virginica
+> stopifnot( all( sapply(manyDF.in, dim)==c(150,5) ) )
+> stopifnot( all( sapply(manyDF.in, colnames) == colnames(manyDF.in[[ncopies]]) ) )
+> 
+> ## create a data file containing a single dataframe that holds ncopies copies of
+> ## the *columns* of the iris dataframe
+> manyCols.out <- do.call(cbind, rep(list(iris),ncopies))
+> dim(manyCols.out)
+[1] 150 100
+> write.xport(manyCols.out, file="manyCols.xport")
+Warning messages:
+1: In makeSASNames(dfNames) : Truncated 1 long names to 8 characters.
+2: In makeSASNames(colnames(df)) :
+  Truncated 80 long names to 8 characters.
+3: In makeSASNames(colnames(df)) : Made 95 duplicate names unique.
+> 
+> manyCols.in <- read.xport("manyCols.xport")
+> names(manyCols.in)
+  [1] "SEPAL"    "SEPAL.1"  "PETAL"    "PETAL.1"  "SPECI"    "SEPAL.2" 
+  [7] "SEPAL.3"  "PETAL.2"  "PETAL.3"  "SPECI.1"  "SEPAL.4"  "SEPAL.5" 
+ [13] "PETAL.4"  "PETAL.5"  "SPECI.2"  "SEPAL.6"  "SEPAL.7"  "PETAL.6" 
+ [19] "PETAL.7"  "SPECI.3"  "SEPAL.8"  "SEPAL.9"  "PETAL.8"  "PETAL.9" 
+ [25] "SPECI.4"  "SEPAL.10" "SEPAL.11" "PETAL.10" "PETAL.11" "SPECI.5" 
+ [31] "SEPAL.12" "SEPAL.13" "PETAL.12" "PETAL.13" "SPECI.6"  "SEPAL.14"
+ [37] "SEPAL.15" "PETAL.14" "PETAL.15" "SPECI.7"  "SEPAL.16" "SEPAL.17"
+ [43] "PETAL.16" "PETAL.17" "SPECI.8"  "SEPAL.18" "SEPAL.19" "PETAL.18"
+ [49] "PETAL.19" "SPECI.9"  "SEPAL.20" "SEPAL.21" "PETAL.20" "PETAL.21"
+ [55] "SPECI.10" "SEPAL.22" "SEPAL.23" "PETAL.22" "PETAL.23" "SPECI.11"
+ [61] "SEPAL.24" "SEPAL.25" "PETAL.24" "PETAL.25" "SPECI.12" "SEPAL.26"
+ [67] "SEPAL.27" "PETAL.26" "PETAL.27" "SPECI.13" "SEPAL.28" "SEPAL.29"
+ [73] "PETAL.28" "PETAL.29" "SPECI.14" "SEPAL.30" "SEPAL.31" "PETAL.30"
+ [79] "PETAL.31" "SPECI.15" "SEPAL.32" "SEPAL.33" "PETAL.32" "PETAL.33"
+ [85] "SPECI.16" "SEPAL.34" "SEPAL.35" "PETAL.34" "PETAL.35" "SPECI.17"
+ [91] "SEPAL.36" "SEPAL.37" "PETAL.36" "PETAL.37" "SPECI.18" "SEPAL.38"
+ [97] "SEPAL.39" "PETAL.38" "PETAL.39" "SPECI.19"
+> stopifnot(dim(manyCols.in)==dim(manyCols.out))
+> 
+> 
+> proc.time()
+   user  system elapsed 
+  6.985   0.068   7.099 

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.