From: Mitchell Joblin <mitchell.joblin.ext@xxxxxxxxxxx> - When querying the database for developer ids, the results do not necessarily match the order of ids in the adjacency matrix, yet we make this assumption in later code - We now encode the adjacency matrix header with developer ids instead of names, then we compare the ids we get from the database with the ids we parse from the adjacency matrix storage file - If a mismatch exists we remap the order correctly Signed-off-by: Mitchell Joblin <mitchell.joblin.ext@xxxxxxxxxxx> --- codeface/R/cluster/persons.r | 17 ++++++++++++++--- codeface/cluster/cluster.py | 4 ++-- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/codeface/R/cluster/persons.r b/codeface/R/cluster/persons.r index e30b0a9..c1fd1b1 100755 --- a/codeface/R/cluster/persons.r +++ b/codeface/R/cluster/persons.r @@ -974,15 +974,26 @@ writePageRankData <- function(conf, outdir, .iddb, devs.by.pr, devs.by.pr.tr) { performAnalysis <- function(outdir, conf) { ################## Process the data ################# logdevinfo("Reading files", logger="cluster.persons") - adjMatrix <- read.table(file=paste(outdir, "/adjacencyMatrix.txt", sep=""), - sep="\t", header=FALSE) + mat.file <- paste(outdir, "/adjacencyMatrix.txt", sep="") + adjMatrix <- read.table(mat.file, sep="\t", header=TRUE) + adjMatrix.ids <- unlist(strsplit(readLines(mat.file, n=1), "\t")) + colnames(adjMatrix) <- rownames(adjMatrix) ## The adjacency matrix file format uses a different convention for edge ## direction than GNU R, so we need to transpose the matrix adjMatrix <- t(adjMatrix) - ids <- get.range.stats(conf$con, conf$range.id) + ids.db <- get.range.stats(conf$con, conf$range.id) + + ## Check that ids are in correct order, the ids queried from the + ## db are not necessarily in the same order as the adjacency matrix + ## columns. Here we remap ids to the correct order.
+ remapping <- unlist(lapply(adjMatrix.ids, function(id) {which(id == ids.db$ID)})) + ids <- ids.db[remapping,] + if(!all(ids$ID==adjMatrix.ids)) { + logerror("Id mismatch", logger="cluster.persons") + } id.subsys <- read.csv(file=paste(outdir, "/id_subsys.txt", sep=""), sep="\t", header=TRUE) diff --git a/codeface/cluster/cluster.py b/codeface/cluster/cluster.py index ecc58e2..d52d92e 100755 --- a/codeface/cluster/cluster.py +++ b/codeface/cluster/cluster.py @@ -985,8 +985,8 @@ def writeAdjMatrix2File(id_mgr, outdir, conf): out = open(os.path.join(outdir, "adjacencyMatrix.txt"), 'wb') idlist = sorted(id_mgr.getPersons().keys()) # Header - out.write("# " + - "\t".join([str(id_mgr.getPI(elem).getName()) for elem in idlist]) + + out.write("" + + "\t".join([str(elem) for elem in idlist]) + "\n") # Matrix. The sum of all elements in row N describes how many -- 1.8.1.2