|
From: <jas...@us...> - 2003-04-08 20:21:55
|
Update of /cvsroot/genex/genex-server/rcluster-1.0/lib/r
In directory sc8-pr-cvs1:/tmp/cvs-serv16754
Modified Files:
ChangeLog permcluster.r
Log Message:
* permcluster.r (Repository):
Added new version from kschlauch
Index: ChangeLog
===================================================================
RCS file: /cvsroot/genex/genex-server/rcluster-1.0/lib/r/ChangeLog,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** ChangeLog 18 Nov 2002 04:42:58 -0000 1.1
--- ChangeLog 8 Apr 2003 20:21:44 -0000 1.2
***************
*** 1,2 ****
--- 1,7 ----
+ 2003-04-08 Jason E. Stewart <ja...@op...>
+
+ * permcluster.r (Repository):
+ Added new version from kschlauch
+
2002-11-17 Harry Mangalam <hj...@ta...>
* committed Karen's changes to permcluster.r
Index: permcluster.r
===================================================================
RCS file: /cvsroot/genex/genex-server/rcluster-1.0/lib/r/permcluster.r,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** permcluster.r 18 Nov 2002 04:42:58 -0000 1.5
--- permcluster.r 8 Apr 2003 20:21:45 -0000 1.6
***************
*** 1,14 ****
! #All rights reserved #
!
! ##########################################
! ## Main program call
! ## This program does a breadth-first search of the clustering tree generated by Murtagh's
! ## hclust procedure to store the clusters for use of the permutation routine called in index.
! ## INPUT: data.txt is the user's tab-delimited "txt" data file
! ## dist.method is the user-defined distance method
! ## stop.level is the user-defined level at which the clustering should stop
! ## OUTPUT: Two files written to the user directory in "txt" format
! ## as "User.Data" and "Graph.Data".
! #######################################################################################
perm.program<-function(data.txt,log.flag,dist.method,agglom.method,stop.level,num.perms,thresh)
{options(digits=10)
--- 1,19 ----
! ###############################################################
! ## Copyright 2000-2003 ##
! ## National Center for Genome Resources ##
! ## George Mason University ##
! ## All rights reserved ##
! ## permcluster.2003.r, written by Karen Schlauch ##
! ###############################################################
! ################################################################################################
! ## Main program call ##
! ## This program does a breadth-first search of the clustering tree generated by Murtagh's ##
! ## hclust procedure to store the clusters for use of the permutation routine called in index. ##
! ## INPUT: data.txt is the user's tab-delimited "txt" data file ##
! ## dist.method is the user-defined distance method ##
! ## stop.level is the user-defined level at which the clustering should stop ##
! ## OUTPUT: Two files written to the user directory in "txt" format ##
! ## as "User.Data" and "Graph.Data". ##
! ################################################################################################
perm.program<-function(data.txt,log.flag,dist.method,agglom.method,stop.level,num.perms,thresh)
{options(digits=10)
***************
*** 22,43 ****
stuff<-index(data.file,gene.names,dist.method,agglom.method,stop.level,num.perms,thresh)
return(stuff)}
!
! #############################################################################################
! ## The scan.data function reads in tab-delimited "txt" file,
! ## which is in the following form:
! ##
! ## genename ratio1 ratio2 ratio3 ratio4 ....
! ## gene1 .1 .2 .3 .4
! ##
! ## Only the FIRST row should contain column labels or other text. The program
! ## thus skips only the FIRST row, regardless of what it contains. To change this,
! ## simply change the "skip" variable which is now set to 1.
! ## INPUT: data.txt is the user's tab-delimited "txt" data file
! ## OUTPUT: Data: the data file in matrix form with the gene name replaced by gene number,
! ## where the gene number is the position of the gene in the data file
! ## num.exp: the number of experiments (or replicates) of the data file
! ## num.genes: the number of genes in the data file
! ## gene.names: a vector of the gene names in the data file
! #############################################################################################
scan.data<-function(data.txt,log.flag)
{num.exp<-length(scan(data.txt,sep="\t",what="",skip=2,nlines=1))-1
--- 27,47 ----
stuff<-index(data.file,gene.names,dist.method,agglom.method,stop.level,num.perms,thresh)
return(stuff)}
! ###############################################################################################
! ## The scan.data function reads in tab-delimited "txt" file, ##
! ## which is in the following form: ##
! ## ##
! ## genename ratio1 ratio2 ratio3 ratio4 .... ##
! ## gene1 .1 .2 .3 .4 ##
! ## ##
! ## Only the FIRST row should contain column labels or other text. The program ##
! ## thus skips only the FIRST row, regardless of what it contains. To change this, ##
! ## simply change the "skip" variable which is now set to 1. ##
! ## INPUT: data.txt is the user's tab-delimited "txt" data file ##
! ## OUTPUT: Data: the data file in matrix form with the gene name replaced by gene number, ##
! ## where the gene number is the position of the gene in the data file ##
! ## num.exp: the number of experiments (or replicates) of the data file ##
! ## num.genes: the number of genes in the data file ##
! ## gene.names: a vector of the gene names in the data file ##
! ###############################################################################################
scan.data<-function(data.txt,log.flag)
{num.exp<-length(scan(data.txt,sep="\t",what="",skip=2,nlines=1))-1
***************
*** 56,94 ****
if(log.flag=="F") return(Data,num.exp,num.genes,gene.names)}
! ##############################################################################
! ## The get.clust.list function first calls the hclust function to generate the
! ## clustering info and clustering tree.
! ## The clustering info given by the hclust function is parsed.
! ## The first loop counts the total number of clusters, the second loop stores
! ## necessary information to build the View.Matrix in the following function.
! ## (See details below in comments)
! ## INPUT: data.file: the data file output of scan.data
! ## dist.method: the user-defined distance method
! ## OUTPUT: View.Matrix: a somewhat empty shell of View.Matrix to be completed
! ## in the index function here it contains the gene
! ## expression profiles and gene numbers of each cluster
! ## and the cluster dimension
! ## I.list: a list storing indeces of the clusters of View.Matrix, height, dimension
! ## m: the merge matrix produced by the hclust function
! ## d: the number of rows of the merge matrix
! ## num.exp: the number of experiments
! ## dim.sum: the sum of the dimensions of all clusters
! ## Left: the Left child of the cluster being considered
! ## Right: the right child of the cluster being considered
! ## Orig: the cluster being considered
! ## num.clusters:the number of total clusters considered
! #####################################################################
get.clust.list<- function(data.file,dist.method,agglom.method)
{library(mva)
num.exp<-dim(data.file)[2]-1
!
! if((dist.method =="cos") || (dist.method=="pcos") || (dist.method=="cor") || (dist.method=="eucna"))
dist.matrix<-other.dist.metrics(data.file,dist.method)$A
! if((dist.method !="cos") && (dist.method!="pcos") && (dist.method!="cor") && (dist.method!="eucna"))
dist.matrix<-dist(data.file[,2:(num.exp+1)],method=dist.method)
clust.info <- hclust(dist.matrix,agglom.method)
-
m <- clust.info$merge
h <- clust.info$height
--- 60,100 ----
if(log.flag=="F") return(Data,num.exp,num.genes,gene.names)}
! #################################################################################################
! ## The get.clust.list function first calls the hclust function to generate the ##
! ## clustering info and clustering tree. ##
! ## The clustering info given by the hclust function is parsed. ##
! ## The first loop counts the total number of clusters, the second loop stores ##
! ## necessary information to build the View.Matrix in the following function. ##
! ## (See details below in comments) ##
! ## INPUT: data.file: the data file output of scan.data ##
! ## dist.method: the user-defined distance method ##
! ## "euc": euclidean distance, in which NA's are excluded and no scaling occurs ##
! ## "cor": correlation coefficient ##
! ## "cos": cosine of the angle between two gene profile vectors ##
! ## "pcos": Munneke's penalized cosine distance metric
! ## OUTPUT: View.Matrix: a somewhat empty shell of View.Matrix to be completed ##
! ## in the index function here it contains the gene ##
! ## expression profiles and gene numbers of each cluster ##
! ## and the cluster dimension ##
! ## I.list: a list storing indeces of the clusters of View.Matrix,height,dimension ##
! ## m: the merge matrix produced by the hclust function ##
! ## d: the number of rows of the merge matrix ##
! ## num.exp: the number of experiments ##
! ## dim.sum: the sum of the dimensions of all clusters ##
! ## Left: the Left child of the cluster being considered ##
! ## Right: the right child of the cluster being considered ##
! ## Orig: the cluster being considered ##
! ## num.clusters:the number of total clusters considered ##
! #################################################################################################
get.clust.list<- function(data.file,dist.method,agglom.method)
{library(mva)
num.exp<-dim(data.file)[2]-1
! if((dist.method =="cos") || (dist.method=="pcos") || (dist.method=="cor") || (dist.method=="euc"))
dist.matrix<-other.dist.metrics(data.file,dist.method)$A
! if((dist.method !="cos") && (dist.method!="pcos") && (dist.method!="cor") && (dist.method!="euc"))
dist.matrix<-dist(data.file[,2:(num.exp+1)],method=dist.method)
clust.info <- hclust(dist.matrix,agglom.method)
m <- clust.info$merge
h <- clust.info$height
***************
*** 98,120 ****
either<-0
neither<-0
!
! ## The loop below counts the total number clusters appearing in the clustering tree as follows.
! ## The merge matrix m generated by hclust consists of d rows.
! ## Each row i represents a parent cluster; each of the two column values of row i
! ## represents one of the two child clusters of cluster i. Therefore, the number of
! ## total clusters *should be* 2*d. There are two obstructions to this deduction.
! ## First, clusters of dimension two (and thus having singletons as both child clusters)
! ## are counted twice: once as a whole cluster and once as two singleton clusters.
! ## As we are not clustering any clusters of dimension two, it is enough to count these
! ## clusters once, as an entire cluster of dimension 2.
! ## Secondly, the original cluster (consisting of the entire data
! ## set) is not counted. Thus the total number of clusters is 2*(d-both)+1, where both
! ## is the number of clusters consisting of two singleton subclusters.
! ## D is the vector which counts the dimension of each cluster.
! ## dim.sum is the sum of dimensions of all clusters we consider:
! ## D is the dimension of all clusters but the original,
! ## d+1 is the dimension of the original,
! ## 2*both represents all pairs of singleton subclusters counted twice.
!
for(i in 1:d)
{if(m[i,1]<0 && m[i,2]<0)
--- 104,126 ----
either<-0
neither<-0
! ###################################################################################################
! ## The loop below counts the total number clusters appearing in the clustering tree as follows. ##
! ## The merge matrix m generated by hclust consists of d rows. ##
! ## Each row i represents a parent cluster; each of the two column values of row i ##
! ## represents one of the two child clusters of cluster i. Therefore, the number of ##
! ## total clusters *should be* 2*d. There are two obstructions to this deduction. ##
! ## First, clusters of dimension two (and thus having singletons as both child clusters) ##
! ## are counted twice: once as a whole cluster and once as two singleton clusters. ##
! ## As we are not clustering any clusters of dimension two, it is enough to count these ##
! ## clusters once, as an entire cluster of dimension 2. ##
! ## Secondly, the original cluster (consisting of the entire data ##
! ## set) is not counted. Thus the total number of clusters is 2*(d-both)+1, where both ##
! ## is the number of clusters consisting of two singleton subclusters. ##
! ## D is the vector which counts the dimension of each cluster. ##
! ## dim.sum is the sum of dimensions of all clusters we consider: ##
! ## D is the dimension of all clusters but the original, ##
! ## d+1 is the dimension of the original, ##
! ## 2*both represents all pairs of singleton subclusters counted twice. ##
! ###################################################################################################
for(i in 1:d)
{if(m[i,1]<0 && m[i,2]<0)
***************
*** 136,149 ****
a<-dim.sum+1
! ## The loop below determines which genes belong to each cluster by unraveling
! ## the hclust merge matrix m. View.Matrix is being set up to store the
! ## cluster name, gene number of each gene within the cluster and the dimension
! ## of the cluster. Genes are identified by their numerical placement within
! ## the data file.
! ## I.list[[i]][1] stores the starting row number of View.Matrix for cluster i
! ## I.list[[i]][2] stores the ending row number of View.Matrix for cluster i
! ## I.list[[i]][3] stores the dimension of cluster i
! ## I.list[[i]][4] stores the height of cluster i
!
for(i in 1:d)
{I.list[[i]]<-vector(mode="numeric",length=4)
--- 142,156 ----
a<-dim.sum+1
! #################################################################################
! ## The loop below determines which genes belong to each cluster by unraveling ##
! ## the hclust merge matrix m. View.Matrix is being set up to store the ##
! ## cluster name, gene number of each gene within the cluster and the dimension ##
! ## of the cluster. Genes are identified by their numerical placement within ##
! ## the data file. ##
! ## I.list[[i]][1] stores the starting row number of View.Matrix for cluster i ##
! ## I.list[[i]][2] stores the ending row number of View.Matrix for cluster i ##
! ## I.list[[i]][3] stores the dimension of cluster i ##
! ## I.list[[i]][4] stores the height of cluster i ##
! #################################################################################
for(i in 1:d)
{I.list[[i]]<-vector(mode="numeric",length=4)
***************
*** 169,173 ****
View.Matrix[1:(d+1),4:(4+num.exp)]<-data.file
-
Orig <-View.Matrix[I.list[[d]][1]:I.list[[d]][2],4:(4+num.exp)]
--- 176,179 ----
***************
*** 191,219 ****
return(View.Matrix,I.list,m,d,h,num.exp,dim.sum,Left,Right,Orig,num.clusters)}
! ###################################################################################################
! ## This function sets up the rest of View.Matrix, which stores and returns the CLUSTERLABEL,
! ## GENENUM, and CLUSTERDIMENSION. View.Matrix stores the clusters in the order generated
! ## by traversing the clustering tree from top down, one level at a time, left to right.
! ## This traversal is accomplished by setting up and using v.stack,
! ## which acts as a top-down breadth-first searching stack.
! ## The columns of v.stack contain the level of the clustering tree, the node number, the number
! ## of the parent of that node, the cluster name associated to the node, the cluster number (with respect to
! ## the merge matrix m found in the function get.clust.list, the dimension of the cluster, the height of the cluster,
! ## and how many times the cluster has been visited. [PLEASE NOTE: Cluster Name corresponds to the variables
! ## z and k, and an old numbering system I use to check whether the clustering is correct. These can be ignored.]
! ## The algorithm works as follows. Beginning with the top cluster (entire data set) of the tree, each
! ## cluster is visited. If the cluster has dimension
! ## larger than two, its two subclusters will be determined first. Then the left child cluster will be visited
! ## first. If it has dimension greater than two, we continue to follow the chain of left subclusters until
! ## a subcluster has dimension two or less. At this point, we begin the traversal up to the parent node of the
! ## cluster just visited, then to its right subcluster. If the dimension of this right subcluster is more than
! ## two, we begin to traverse the chain of left subclusters until we reach one with dimension less than or
! ## equal to two. ETC. A node can only be visited twice.
! ## INPUT: data.file, gene.names generated by scan.data
! ## dist.method, stop.level are user inputs
! ## num.perms: the number of permutations defined by the user
! ## thresh: the confidence threshold defined by the user as a percentage
! ## OUTPUT: View.Matrix, User.Matrix, which are written out in txt file format as Graph.Data, User.Data
! ###############################################################################################################
index<-function(data.file,gene.names,dist.metric,agglom.method,stop.level,num.perms,thresh)
--- 197,227 ----
return(View.Matrix,I.list,m,d,h,num.exp,dim.sum,Left,Right,Orig,num.clusters)}
! ##########################################################################################################
! ## This function sets up the rest of View.Matrix, which stores and returns the CLUSTERLABEL, ##
! ## GENENUM, and CLUSTERDIMENSION. View.Matrix stores the clusters in the order generated ##
! ## by traversing the clustering tree from top down, one level at a time, left to right. ##
! ## This traversal is accomplished by setting up and using v.stack, which acts as a top-down ##
! ## breadth-first searching stack. The columns of v.stack contain the level of the clustering ##
! ## tree, the node number, the number of the parent of that node, the cluster name associated ##
! ## to the node, the cluster number (with respect to the merge matrix m found in the function ##
! ## get.clust.list, the dimension of the cluster, the height of the cluster, and how many times ##
! ## the cluster has been visited. [PLEASE NOTE: Cluster Name corresponds to the variables ##
! ## z and k, and an old numbering system I use to check whether the clustering is correct. ##
! ## These can be ignored.] ##
! ## The algorithm works as follows. Beginning with the top cluster (entire data set) of the tree, ##
! ## each cluster is visited. If the cluster has dimension larger than two, its two subclusters ##
! ## will be determined first. Then the left child cluster will be visited first. If it has ##
! ## dimension greater than two, we continue to follow the chain of left subclusters until ##
! ## a subcluster has dimension two or less. At this point, we begin the traversal up to the ##
! ## parent node of the cluster just visited, then to its right subcluster. If the dimension of ##
! ## this right subcluster is more than two, we begin to traverse the chain of left subclusters ##
! ## until we reach one with dimension less than or equal to two. ETC. ##
! ## A node can only be visited twice. ##
! ## INPUT: data.file, gene.names generated by scan.data ##
! ## dist.method, stop.level are user inputs ##
! ## num.perms: the number of permutations defined by the user ##
! ## thresh: the confidence threshold defined by the user as a percentage ##
! ## OUTPUT: View.Matrix, User.Matrix, which are written out in txt file format as Graph.Data, User.Data ##
! ##########################################################################################################
index<-function(data.file,gene.names,dist.metric,agglom.method,stop.level,num.perms,thresh)
***************
*** 318,323 ****
ret.pos<-ret.pos+1 ## go to next cluster on stack
tmpT<-v.stack[ret.pos,6]
! if(v.stack[ret.pos,9]==-1) tmp<-0 ## if the cluster hasn't been visited by now, it's the last (stop) line on v.stack
! if(v.stack[ret.pos,9]!=-1) ## if the cluster has been visited, continue loop
{if(tmpT<0) tmp<-data.file[abs(tmpT),]
if(tmpT>0) tmp<-View.Matrix[I.list[[tmpT]][1]:I.list[[tmpT]][2],4:(num.exp+4)]}} ##no more clstrs on stack
--- 326,331 ----
ret.pos<-ret.pos+1 ## go to next cluster on stack
tmpT<-v.stack[ret.pos,6]
! if(v.stack[ret.pos,9]==-1) tmp<-0 ## if the cluster hasn't been visited by now, it's the last (stop) line on v.stack
! if(v.stack[ret.pos,9]!=-1) ## if the cluster has been visited, continue loop
{if(tmpT<0) tmp<-data.file[abs(tmpT),]
if(tmpT>0) tmp<-View.Matrix[I.list[[tmpT]][1]:I.list[[tmpT]][2],4:(num.exp+4)]}} ##no more clstrs on stack
***************
*** 325,329 ****
rm(I.list)
! v.stack<-v.stack[!is.na(v.stack[,1]),] ## get rid of dummy (stop) row at bottom ##
## !is.na(View.Matrix[,1])] is the first subclustering which is listed twice but only counted once
## We are deleting the second listing here
--- 333,337 ----
rm(I.list)
! v.stack<-v.stack[!is.na(v.stack[,1]),] ## delete dummy (stop) row at bottom ##
## !is.na(View.Matrix[,1])] is the first subclustering which is listed twice but only counted once
## We are deleting the second listing here
***************
*** 361,365 ****
GRAPH.LABEL[1:(d+1)]<-rep(paste(0,0,"ORIG"),d+1)
last.level<-v.stack[dim(v.stack)[1],1]
! print(paste("The Last TREE LEVEL is: ",last.level))
acc.vec<-c(rep(1,d+1),rep(-1,dim(View.Matrix)[1]-(d+1))) ## this is the ACC column of View.Matrix
--- 369,373 ----
GRAPH.LABEL[1:(d+1)]<-rep(paste(0,0,"ORIG"),d+1)
last.level<-v.stack[dim(v.stack)[1],1]
! print(paste("The Clustering Tree has ",last.level, " levels"))
acc.vec<-c(rep(1,d+1),rep(-1,dim(View.Matrix)[1]-(d+1))) ## this is the ACC column of View.Matrix
***************
*** 372,379 ****
## OLD V.dimnames<-list(NULL,c( "CLUSTER","NAME","DIM","GeneNum",c(1:num.exp))
## NEW V.dimnames<-list(NULL,c("ACC","CLUSTER","NAME","DIM","GeneNum",c(1:num.exp))) ## column names of View.Matrix
-
## In the following loop, v.stack is traversed from top to bottom.
## Note: v.stack[i,1]==1 means the cluster has been perused
! ## View.Matrix[i,1]==1 means the cluster's subclusters have been accepted
b<-0
for(i in 1:length(v.stack[,8]))
--- 380,386 ----
## OLD V.dimnames<-list(NULL,c( "CLUSTER","NAME","DIM","GeneNum",c(1:num.exp))
## NEW V.dimnames<-list(NULL,c("ACC","CLUSTER","NAME","DIM","GeneNum",c(1:num.exp))) ## column names of View.Matrix
## In the following loop, v.stack is traversed from top to bottom.
## Note: v.stack[i,1]==1 means the cluster has been perused
! ## View.Matrix[i,1]==1 means the cluster's subclusters have been accepted
b<-0
for(i in 1:length(v.stack[,8]))
***************
*** 445,452 ****
{D1<-H0
single.flag<-1
! tstat.matrix[3*clust.count,(p+2)]<-abs(T1)
! ##write.table(tmp.data,file=paste(v.stack[i,5],v.stack[i,6],p,
! ## ".perm.txt",sep=""),quote=F,sep="\t",col.names=F,row.names=F)
! }
rm(tmp.data)
if(T2<0) D2<-H0
--- 452,456 ----
{D1<-H0
single.flag<-1
! tstat.matrix[3*clust.count,(p+2)]<-abs(T1)}
rm(tmp.data)
if(T2<0) D2<-H0
***************
*** 454,464 ****
{D1<-H0-tmp.h[T1]
single.flag<-0
! tstat.matrix[3*clust.count,(p+2)]<-0}
! if(T2>0) D2<-H0-tmp.h[T2]
! t.stat<-D1+D2
! t.seq[p]<-t.stat
! single.flag.seq[p]<-single.flag} ##end of permutation loop
! rm(C0)
! t.seq[1:num.perms]<-t.seq[order(t.seq[1:num.perms])] ## order the num.perms test statistics
tstat.matrix[(3*clust.count-2),3:(num.perms+3)]<-round(t.seq,2)
tstat.matrix[(3*clust.count-1),3:(num.perms+3)]<-single.flag.seq
--- 458,468 ----
{D1<-H0-tmp.h[T1]
single.flag<-0
! tstat.matrix[3*clust.count,(p+2)]<-0}
! if(T2>0) D2<-H0-tmp.h[T2]
! t.stat<-D1+D2
! t.seq[p]<-t.stat
! single.flag.seq[p]<-single.flag} ##end of permutation loop
! rm(C0)
! t.seq[1:num.perms]<-t.seq[order(t.seq[1:num.perms])] ## order the num.perms test statistics
tstat.matrix[(3*clust.count-2),3:(num.perms+3)]<-round(t.seq,2)
tstat.matrix[(3*clust.count-1),3:(num.perms+3)]<-single.flag.seq
***************
*** 480,487 ****
if(accept)
! {print("accepting the clusters of")
! #print(View.Matrix[a,2:3])
! #print("GRAPH LABEL")
! #print(GRAPH.LABEL[a])
v.stack[!is.na(M),1]<-1
## the children are accepted, so set their acc var to 1
--- 484,489 ----
if(accept)
! {print("accepting the subclusters of")
! print(GRAPH.LABEL[a])
v.stack[!is.na(M),1]<-1
## the children are accepted, so set their acc var to 1
***************
*** 489,509 ****
if(!accept)
! {print("not accepting the subclusters of cluster")
! #print(View.Matrix[a,2:3])
! #print("GRAPH LABEL")
! #print(GRAPH.LABEL[a])
v.stack[!is.na(M),1]<-0
## the children are not accepted, so set their acc var to 0
View.Matrix[!is.na(N),1]<-0}}}}
!
## v.stack has been completely traversed
- ##return(View.Matrix)
View.Matrix<-View.Matrix[,-3]
! ## deleting archaic numbering scheme and putting in labels
!
View.Matrix[,2]<-GRAPH.LABEL
View.Matrix[1:(d+1),2]<-rep(paste(0,0,"ORIG", sep=","),d+1)
-
Graph.Matrix<-data.frame(ACC=View.Matrix[!duplicated(View.Matrix[,2]),1],
GRAPHLABEL=View.Matrix[!duplicated(View.Matrix[,2]),2],
--- 491,506 ----
if(!accept)
! {print("not accepting the subclusters of cluster")
! print(GRAPH.LABEL[a])
v.stack[!is.na(M),1]<-0
## the children are not accepted, so set their acc var to 0
View.Matrix[!is.na(N),1]<-0}}}}
!
## v.stack has been completely traversed
View.Matrix<-View.Matrix[,-3]
! ## deleting archaic numbering scheme and putting in labels
View.Matrix[,2]<-GRAPH.LABEL
View.Matrix[1:(d+1),2]<-rep(paste(0,0,"ORIG", sep=","),d+1)
Graph.Matrix<-data.frame(ACC=View.Matrix[!duplicated(View.Matrix[,2]),1],
GRAPHLABEL=View.Matrix[!duplicated(View.Matrix[,2]),2],
***************
*** 513,530 ****
write.table(Graph.Matrix,file="Graph.Data",quote=F,sep="\t",col.names=TRUE,row.names=F)
-
View.Matrix<-data.frame(ACC=View.Matrix[,1],
CLUSTERLABEL=view.single.LABEL,
- #GENENAME=gene.names[as.numeric(View.Matrix[,4])],
CLUSTERDIMENSION=View.Matrix[,3],
GENENUM=as.numeric(View.Matrix[,4]))
write.table(View.Matrix, file="User.Data", quote=F,sep="\t",col.names=TRUE,row.names=F)
!
! rm(View.Matrix,Graph.Matrix)
! #return(v.stack,View.Matrix,Graph.Matrix)
! tmp.tstat.matrix<-tstat.matrix[!is.na(tstat.matrix[,1]),]
! return(tmp.tstat.matrix)}
!
!
####################################################################################
## The permute function permutes the data file generated by scan.data by permuting
--- 510,519 ----
write.table(Graph.Matrix,file="Graph.Data",quote=F,sep="\t",col.names=TRUE,row.names=F)
View.Matrix<-data.frame(ACC=View.Matrix[,1],
CLUSTERLABEL=view.single.LABEL,
CLUSTERDIMENSION=View.Matrix[,3],
GENENUM=as.numeric(View.Matrix[,4]))
write.table(View.Matrix, file="User.Data", quote=F,sep="\t",col.names=TRUE,row.names=F)
! rm(View.Matrix,Graph.Matrix)}
####################################################################################
## The permute function permutes the data file generated by scan.data by permuting
***************
*** 546,624 ****
check.vector<-vector(mode="character",length=(num.cols-1))
for(c in 2:num.cols)
! {check.vector[c]<-round(mean(perm.data[,c]),5)==round(mean(data.file[,c]),5)
if(check.vector[c]!="TRUE")
{print("permute isn't working right")
return(c)}}
! return(perm.data,check.vector)}
!
! ## This function creates a "dist" matrix using the given
! ## gene expression matrix and code for distance metric
! ## "cos" ==> 1-cosine of the angle between the two vectors
! ## "pcos" ==> penalized cosine metric (Brian's)
! ## "cor" ==> abs(cor-1) Pearson's correlation coefficient
other.dist.metrics<-function(expression.matrix,dist.metric)
{library(mva)
EM<-expression.matrix
! ##EM does not have a label row!##
! EM<-EM[-1,]
EM<-EM[,-1] ##delete gene.name column
- print(dim(EM))
r<-dim(EM)[1]
c<-dim(EM)[2]
DM<-matrix(ncol=r,nrow=r)
! if(dist.metric=="eucna")
! {for(j in 1:r)
! {for(i in j:r)
! {##print(paste(j,i))
! x<-EM[j,]
! y<-EM[i,]
! na.posn<-vector(mode="numeric") ##records posn of na's##
! na.posn<-NULL
! for(k in 1:length(x))
! {if(is.na(x[k]) || is.na(y[k]))
! na.posn<-c(na.posn,k)}
! if(length(na.posn)==0)
! {tmp<-rbind(x,y)
! distxy<-dist(tmp,method="euclidean")}
! if(length(na.posn)>0)
! {tmp<-rbind(x,y)
! nona.tmp<-tmp[,-(na.posn)]
! distxy<-dist(nona.tmp,method="euclidean")}
! ##col.dist<-(tmp.dist^2)/(dim(tmp)[2]-length(na.posn)) ## num of non-NA cols ##
! ##distxy<-sqrt(tmp.dist^2+(col.dist*length(na.posn)))
! DM[i,j]<-distxy}}}
!
!
! if(dist.metric=="pcos")
! {for(j in 1:r)
! {for(i in j:r)
! {x<-EM[j,]
! y<-EM[i,]
! distxy<-munneke.dist(x,y)
! DM[i,j]<-distxy}}}
!
! if(dist.metric=="cos")
! {for(j in 1:r)
! {for(i in j:r)
! {#print(paste("j=",j,"i=",i))
! x<-EM[j,]
! y<-EM[i,]
! distxy<-cos.dist(x,y)
! DM[i,j]<-distxy}}}
!
!
! if(dist.metric=="cor")
! {for(j in 1:r)
! {for(i in j:r)
! {print(paste("j=",j,"i=",i))
! x<-EM[j,]
! y<-EM[i,]
! distxy<-abs(1-cor(x,y,use="pairwise.complete.obs"))
! DM[i,j]<-distxy}}}
A<-as.dist(DM)
- #print("finished getting distance matrix")
return(DM,A)}
--- 535,587 ----
check.vector<-vector(mode="character",length=(num.cols-1))
for(c in 2:num.cols)
! {check.vector[c]<-round(mean(perm.data[,c],na.rm=T),5)==round(mean(data.file[,c],na.rm=T),5)
if(check.vector[c]!="TRUE")
{print("permute isn't working right")
return(c)}}
! return(perm.data,check.vector)}
+ ##########################################################################
+ ## The other.dist.metrics function creates a "dist" matrix ##
+ ## using the given gene expression matrix and code for distance metric ##
+ ## "cos" ==> 1-cosine of the angle between the two vectors ##
+ ## "pcos" ==> penalized cosine metric (Brian Munneke) ##
+ ## "cor" ==> abs(cor-1) Pearson's correlation coefficient ##
+ ##########################################################################
other.dist.metrics<-function(expression.matrix,dist.metric)
{library(mva)
EM<-expression.matrix
! ##labels<-key[EM[,1],2]
! ##labels<-EM[,1]
EM<-EM[,-1] ##delete gene.name column
r<-dim(EM)[1]
c<-dim(EM)[2]
DM<-matrix(ncol=r,nrow=r)
! for(j in 1:r)
! {for(i in j:r)
! {x<-EM[j,]
! y<-EM[i,]
! na.posn<-vector(mode="numeric") ##record positions of NA's##
! na.posn<-NULL
! for(k in 1:length(x))
! {if(is.na(x[k]) || is.na(y[k]))
! na.posn<-c(na.posn,k)}
! if(length(na.posn)==0)
! tmpd<-rbind(x,y)
! if(length(na.posn)>0)
! {tmp1<-rbind(x,y)
! nona.tmp<-tmp1[,-(na.posn)] ##do not include NA's##
! tmpd<-nona.tmp}
! if(dist.metric=="euc")
! distxy<-dist(tmpd,method="euclidean")
! if(dist.metric=="pcos")
! distxy<-munneke.dist(tmpd[1,],tmpd[2,])
! if(dist.metric=="cos")
! distxy<-cos.dist(tmpd[1,],tmpd[2,])
! if(dist.metric=="cor")
! distxy<-abs(1-cor(x,y,use="pairwise.complete.obs"))
! DM[i,j]<-distxy}}
A<-as.dist(DM)
return(DM,A)}
***************
*** 627,644 ****
########################################################################
munneke.dist<-function(x,y)
! {sumxy<-0
! sumx<-0
! sumy<-0
! diffxy<-0
! for(i in 1:length(x))
! {sumxy<-sumxy+(x[i]*y[i])
! sumx<-sumx+x[i]^2
! sumy<-sumy+y[i]^2
! diffxy<-diffxy+abs(x[i]-y[i])}
! cosxy<-sumxy/(sqrt(sumx)*sqrt(sumy))
! p<-diffxy/length(x)
! p<-2*(p/(1+p))
! distxy<-1-cosxy+p
! return(distxy)}
########################################################################
--- 590,599 ----
########################################################################
munneke.dist<-function(x,y)
! {cosxy<-sum(x*y)/sqrt(sum(x^2)*sum(y^2))
! diffxy<-sum(abs(x-y))
! p<-diffxy/length(x)
! p<-2*(p/(1+p))
! pcosxy<-1-cosxy+p
! return(pcosxy)}
########################################################################
***************
*** 646,672 ****
########################################################################
cos.dist<-function(x,y)
! {sumxy<-0
! sumx<-0
! sumy<-0
! diffxy<-0
! for(i in 1:length(x))
! {sumxy<-sumxy+(x[i]*y[i])
! sumx<-sumx+x[i]^2
! sumy<-sumy+y[i]^2
! diffxy<-diffxy+abs(x[i]-y[i])}
! tmpcosxy<-sumxy/(sqrt(sumx)*sqrt(sumy))
! cosxy<-1-tmpcosxy
! return(cosxy)}
!
!
!
!
!
!
!
!
!
!
!
!
!
--- 601,604 ----
########################################################################
cos.dist<-function(x,y)
! {cosxy<-sum(x*y)/sqrt(sum(x^2)*sum(y^2))
! return(1-cosxy)}
|