[codeface] [PATCH] Fix mismatch between dev ids and adjacency matrix columns

  • From: Mitchell Joblin <joblin.m@xxxxxxxxx>
  • To: codeface@xxxxxxxxxxxxx
  • Date: Wed, 5 Feb 2014 21:56:49 +0100

From: Mitchell Joblin <mitchell.joblin.ext@xxxxxxxxxxx>

- When querying the database for developer ids, the order of the
  results does not necessarily match the order of ids in the
  adjacency matrix, yet we make this assumption in later code

- We now encode the adjacency matrix header with developer ids
  instead of names, then we compare the ids we get from the
  database with the ids we parse from the adjacency matrix
  storage file

- If a mismatch exists, we remap the database ids to the order used by
  the adjacency matrix

Signed-off-by: Mitchell Joblin <mitchell.joblin.ext@xxxxxxxxxxx>
---
 codeface/R/cluster/persons.r | 17 ++++++++++++++---
 codeface/cluster/cluster.py  |  4 ++--
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/codeface/R/cluster/persons.r b/codeface/R/cluster/persons.r
index e30b0a9..c1fd1b1 100755
--- a/codeface/R/cluster/persons.r
+++ b/codeface/R/cluster/persons.r
@@ -974,15 +974,26 @@ writePageRankData <- function(conf, outdir, .iddb, 
devs.by.pr, devs.by.pr.tr) {
 performAnalysis <- function(outdir, conf) {
   ################## Process the data #################
   logdevinfo("Reading files", logger="cluster.persons")
-  adjMatrix <- read.table(file=paste(outdir, "/adjacencyMatrix.txt", sep=""),
-                     sep="\t", header=FALSE)
+  mat.file <- paste(outdir, "/adjacencyMatrix.txt", sep="")
+  adjMatrix <- read.table(mat.file, sep="\t", header=TRUE)
+  adjMatrix.ids <- unlist(strsplit(readLines(mat.file, n=1), "\t"))
+
   colnames(adjMatrix) <- rownames(adjMatrix)
 
   ## The adjacency matrix file format uses a different convention for edge
   ## direction than GNU R, so we need to transpose the matrix
   adjMatrix <- t(adjMatrix)
 
-  ids <- get.range.stats(conf$con, conf$range.id)
+  ids.db <- get.range.stats(conf$con, conf$range.id)
+
+  ## Check that ids are in correct order, the ids queried from the
+  ## db are not necessarily in the same order as the adjacency matrix
+  ## columns. Here we remap ids to the correct order.
+  remapping <- unlist(lapply(adjMatrix.ids, function(id) {which(id == 
ids.db$ID)}))
+  ids <- ids.db[remapping,]
+  if(!all(ids$ID==adjMatrix.ids)) {
+      logerror("Id mismatch", logger="cluster.persons")
+  }
 
   id.subsys <- read.csv(file=paste(outdir, "/id_subsys.txt", sep=""),
                        sep="\t", header=TRUE)
diff --git a/codeface/cluster/cluster.py b/codeface/cluster/cluster.py
index ecc58e2..d52d92e 100755
--- a/codeface/cluster/cluster.py
+++ b/codeface/cluster/cluster.py
@@ -985,8 +985,8 @@ def writeAdjMatrix2File(id_mgr, outdir, conf):
     out = open(os.path.join(outdir, "adjacencyMatrix.txt"), 'wb')
     idlist = sorted(id_mgr.getPersons().keys())
     # Header
-    out.write("# " +
-              "\t".join([str(id_mgr.getPI(elem).getName()) for elem in 
idlist]) +
+    out.write("" +
+              "\t".join([str(elem) for elem in idlist]) +
               "\n")
 
     # Matrix. The sum of all elements in row N describes how many
-- 
1.8.1.2


Other related posts: