[codeface] [PATCH 1/7] Change rountines for random graph test for community significance

From: Mitchell Joblin <joblin.m@xxxxxxxxx>
To: codeface@xxxxxxxxxxxxx
Date: Thu, 21 Aug 2014 22:52:31 +0200
- use faster implementation of rewire procedure

- change plotting of results to use ggplot2 package

- add the ability to choose other quality metrics for the significance
  test

Signed-off-by: Mitchell Joblin <mitchell.joblin.ext@xxxxxxxxxxx>
---
 codeface/R/cluster/community_metrics.r | 118 ++++++++++++++++++---------------
 1 file changed, 65 insertions(+), 53 deletions(-)

diff --git a/codeface/R/cluster/community_metrics.r 
b/codeface/R/cluster/community_metrics.r
index fa209a1..c628306 100644
--- a/codeface/R/cluster/community_metrics.r
+++ b/codeface/R/cluster/community_metrics.r
@@ -17,6 +17,7 @@
 ## File Description:
 ##  Various measures and test for the significance and quality of a community
 ##  or cluster in a graph structure
+suppressMessages(library(BiRewire))
 
 edge.weight.to.multi <- function(g) {
   ## Converters an edge weight to multiple edges between the connected nodes
@@ -157,7 +158,7 @@ community.quality.modularity <- function(graph, 
community.vertices) {
   m = sum(graph.strength(graph, mode="all")) / 2
 
   ## Measure the degree for intra-community edges
-  intra.degree <- degree(subgraph)
+  intra.degree <- igraph::degree(subgraph)
 
   ## Degree of vertices in community
   community.vertices.degree <- graph.strength(graph, community.vertices, 
mode="all")
@@ -182,60 +183,68 @@ community.quality.modularity <- function(graph, 
community.vertices) {
 ## - Output -
 ## p-value: the result of the statistical significance test
 ############################################################################
-community.stat.significance <- function(graph, cluster.algo) {
-  ## extract clusters
+community.stat.significance <- function(graph, cluster.algo, metric, niter) {
+  ## Extract clusters
   graph.clusters <- community.detection.disconnected(graph, cluster.algo)
 
-  ## compute cluster conductance values
-  cluster.conductance <- community.metric(graph, graph.clusters,
-                                          "conductance")
-  cluster.conductance <- cluster.conductance[!is.na(cluster.conductance)]
-  ## compute randomized conductance samples
-  niter <- 2
-  rand.samps <- randomised.conductance.samples(graph, niter, cluster.algo)
+  ## Compute cluster quality
+  cluster.quality <- community.metric(graph, graph.clusters,
+                                      metric)
+  cluster.quality <- cluster.quality[!is.na(cluster.quality)]
+ 
+  ## Compute randomized conductance samples
+  rand.samps <- rewired.graph.samples(graph, cluster.algo, metric, niter)
 
-  ## test for normality
+  ## Test for normality
   ## check if values are same because the test while fail if they are 
-  if( length(unique(rand.samps) == 1)){
-    normality.test <- c()
-    normality.test$p.value <- -1
+  if(length(unique(rand.samps)) == 1) {
+    shapiro.test.result <- c()
+    shapiro.test.result$p.value <- -1
   } else {
-    normality.test <- shapiro.test(rand.samps)
+    shapiro.test.result <- shapiro.test(rand.samps)
   }
 
-  ## compute normal distribution
-  mean.conductance <- mean(rand.samps)
-  sd.conductance   <- sd  (rand.samps)
+  ## Perform T-test on test statistic
+  t.test.result <- t.test(rand.samps,  mu=mean(cluster.quality))
 
-  ## perform t-test on test statistic
-  t.test.result <- t.test(rand.samps,  mu=mean(cluster.conductance))
+  ## Output result
+  res <- list(random.simulation.quality=rand.samps,
+              original.quality=cluster.quality,
+              t.test.result=t.test.result,
+              shapiro.test.result=shapiro.test.result)
 
-  ## output result
-  save.comm.sig.test(rand.samps, cluster.conductance,t.test.result,
-                     normality.test, outfile, format="png")
+  return(res)
 }
 
 
 ## write the significance test results to pdf
-save.comm.sig.test <- function(rand.samps, cluster.conductance,t.result, 
-                               shap.result, outfile, size=7, format="png") {
-       m.c = sum(cluster.conductance)
-       m.r = mean(rand.samps)
-       s.r = sd(rand.samps)
-       x.low.lim <- 0
-       x.up.lim  <- 1
-       x.lim = c(x.low.lim, x.up.lim)
-  select.graphics.dev(filename=outfile, size=size, format=format)
-       plot(density(rand.samps), main="Community Significance Test", 
ylab="Probability Density", xlab="Conductance", xlim=x.lim)
-       points(x=mean(cluster.conductance),y=0, col="green", pch=21, cex=2.5, 
bg="green")
-       abline(v=t.result$conf.int[1], col='red')
-       #abline(v=t.result$estimate, col='black')
-       abline(v=t.result$conf.int[2], col='red')
-       legendData <- character(2)
-       legendData[1] <- sprintf("T-test p-value: %e", t.result$p.value)
-       legendData[2] <- sprintf("Shapiro-Wilk p-value: %e", 
shap.result$p.value)
-       legend("topleft", legend=legendData, bty="n")
-       dev.off()
+plot.t.test <- function(random.samples, test.values, t.test, shapiro.test, 
title, metric.name) {
+  test.value.mean <- mean(test.values)
+  conf.intervals <- c(t.test$conf.int[1], t.test$conf.int[2]) 
+  x.lower.lim <- 0
+  x.upper.lim  <- 1
+  x.lim = c(x.lower.lim, x.upper.lim)
+  t.test.annotation <- paste("T-test p-value =", 
as.character(signif(t.test$p.value,3))) 
+  shapiro.test.annotation <- paste("Shapiro-test p-value =", 
as.character(signif(shapiro.test$p.value,3)))
+  total.annotation <- paste(t.test.annotation, shapiro.test.annotation, 
sep="\n")
+  y.upper.lim <- max(density(random.samples)$y)
+
+  t.test.plot <- ggplot(data.frame(random.samples), aes(x=random.samples)) +
+                 geom_histogram(aes(y=..density..), color="black", 
fill="white") +
+                 geom_density(alpha=.2, fill="grey") +
+                 geom_vline(xintercept=conf.intervals, linetype="dotted") +
+                 geom_point(data=data.frame(x=test.value.mean, y=0), aes(x=x, 
y=y), size=5, color='black') +
+                 scale_x_continuous(limits = x.lim) +
+                 ggtitle(title) +
+                 xlab(metric.name) +
+                 ylab("Density") +
+                 theme(plot.title = element_text(lineheight=.8, face="bold"),
+                       axis.title.x = element_text(size=rel(1.3)),
+                       axis.title.y = element_text(size=rel(1.3))) +
+                 annotate("text", label=total.annotation, x=x.upper.lim, 
y=y.upper.lim*1.15) +
+                 theme_bw(base_size=16)
+
+  return(t.test.plot)
 }
 
 
@@ -250,7 +259,7 @@ save.comm.sig.test <- function(rand.samps, 
cluster.conductance,t.result,
 ## - Ouput -
 ## conduct.vec: the conductance for each trial
 ############################################################################
-randomised.conductance.samples <- function(graph, niter, cluster.algo) {
+rewired.graph.samples <- function(graph, cluster.algo, metric, niter) {
   ## Check if loops exist in the original graph, this information is necessary
   ## to choose the appropriate rewiring strategy
   loops.exist <- any(is.loop(graph))
@@ -264,15 +273,16 @@ randomised.conductance.samples <- function(graph, niter, 
cluster.algo) {
   ## Perform iterations
   conduct.vec <- vector()
   graph.multi <- edge.weight.to.multi(graph)
+  graph.multi <- remove.vertex.attribute(graph.multi, 'name')
   pb <- txtProgressBar(min = 0, max = niter, style = 3)
   for (i in 1:niter) {
     ## Update progress bar
     setTxtProgressBar(pb, i)
-
     ## Rewire graph, randomize the graph while maintaining the degree 
distribution
+    rw.graph <- birewire.rewire(graph.multi, MAXITER_MUL=1000, exact=T, 
verbose=F)
     #rw.graph <- rewire(graph.multi, mode = rewire.mode,
     #                              niter = 1)#10*ecount(graph.multi))
-    rw.graph <- degree.sequence.game(igraph::degree(graph.multi, mode="all"))
+    #rw.graph <- degree.sequence.game(igraph::degree(graph.multi, mode="all"))
 
     E(rw.graph)$weight <- 1
     rw.graph <- simplify(rw.graph, remove.loops=FALSE)
@@ -280,12 +290,13 @@ randomised.conductance.samples <- function(graph, niter, 
cluster.algo) {
     ## Find clusters
     rw.graph.clusters <- community.detection.disconnected(rw.graph, 
cluster.algo)
 
-    ## Only analyze clusters that are large than 10 vertices
-    ##rw.graph.clusters.more <- select.communities.more(rw.graph.clusters, 10)
+    ## Only analyze clusters that are large than 4 vertices
+    min.community <- minCommGraph(rw.graph, rw.graph.clusters, 5)
+    rw.graph.min <- min.community$graph
+    rw.graph.clusters.min <- min.community$community
 
     ## Compute conductance
-    rw.cluster.conductance <- community.metric(rw.graph, rw.graph.clusters,
-                                               "conductance")
+    rw.cluster.conductance <- community.metric(rw.graph.min, 
rw.graph.clusters.min, metric)
     conduct.vec <- append(conduct.vec, mean(rw.cluster.conductance, 
na.rm=TRUE))
   }
   ## Close progress bar
@@ -323,10 +334,11 @@ community.metric <- function(graph, community, test) {
   }
 
   if(test == "modularity") {
-    metric.vec <- sapply(community.id,
-                          function(x) {
-                            return(community.quality.modularity(graph, 
members[[x]]))
-                          })
+    modularity.vec <- sapply(community.id,
+                             function(x) {
+                               return(community.quality.modularity(graph, 
members[[x]]))
+                            })
+    metric.vec <- sum(modularity.vec)
   }
   else if (test == "wilcox") {
     metric.vec <- sapply(community.id,
-- 
1.9.1
Follow-Ups:
- [codeface] [PATCH 2/7] Set db connection session timeout to 24 instead of the default 8 hours
  - From: Mitchell Joblin
- [codeface] [PATCH 3/7] Add parallel computation for rewire procedure
  - From: Mitchell Joblin
- [codeface] [PATCH 4/7] Based on myslqdb documentation this parameter should be a tuple
  - From: Mitchell Joblin
- [codeface] [PATCH 5/7] Add ability to perform bulk inserts
  - From: Mitchell Joblin
- [codeface] [PATCH 6/7] Shorten transaction times
  - From: Mitchell Joblin
- [codeface] [PATCH 7/7] Remove hack to fix page rank numerical stability issues
  - From: Mitchell Joblin
[codeface] [PATCH 1/7] Change rountines for random graph test for community significance

Other related posts: