We now support tracing commits, features and functions through the strength of a relation. This allows us to include additional information in the output. Currently we write a new file 'adjacencyMatrix_max_weight.txt', which contains the feature or function with the most influence instead of the collaboration strength ('adjacencyMatrix.txt'). This would work for files as well, but we would have to change 'File_Level' to something like '$filename' for the output to be useful. Signed-off-by: Matthias Dittrich <matthi.d@xxxxxxxxx> --- codeface/cluster/PersonInfo.py | 80 +++++++++++++++++++++++++++----- codeface/cluster/cluster.py | 103 ++++++++++++++++++++++++++++++++++------- codeface/cluster/codeBlock.py | 11 ++++- 3 files changed, 163 insertions(+), 31 deletions(-) diff --git a/codeface/cluster/PersonInfo.py b/codeface/cluster/PersonInfo.py index 8069b78..ff38bd2 100644 --- a/codeface/cluster/PersonInfo.py +++ b/codeface/cluster/PersonInfo.py @@ -16,8 +16,65 @@ # All Rights Reserved. from logging import getLogger; log = getLogger(__name__) -from codeface.commit_analysis import tag_types, active_tag_types, proximity_relation \ -, file_relation, committer2author_relation, all_link_types +from codeface.commit_analysis import tag_types, active_tag_types, proximity_relation, \ + file_relation, committer2author_relation, all_link_types + + +# Readonly class +class RelationWeight: + def __init__(self, weight, group_name, commit_ids1, commit_ids2): + self.weight = weight + self.groupName = group_name + self.commitIds1 = commit_ids1 + self.commitIds2 = commit_ids2 + + def get_weight(self): + return self.weight + + def get_commit_ids1(self): + return self.commitIds1 + + def get_commit_ids2(self): + return self.commitIds2 + + def get_group_name(self): + return self.groupName + + +class RelationWeights: + def __init__(self, init_weight=None): + if init_weight is not None: + self.weightSum = init_weight.get_weight() + self.weights = [init_weight] + self.maxWeight = init_weight + 
else: + self.weightSum = 0 + self.weights = [] + self.maxWeight = None + + def get_weight(self): + return self.weightSum + + def get_max_weight(self): + return self.maxWeight + + def add_weight(self, new_weight): + weight = new_weight.get_weight() + if (self.maxWeight is None) or (weight > self.maxWeight.get_weight()): + self.maxWeight = new_weight + self.weightSum += weight + self.weights.append(new_weight) + + def add_weights(self, weights): + for weight in weights: + self.add_weight(weight) + + def copy(self): + new = RelationWeights() + new.weightSum = self.weightSum + new.weights = list(self.weights) + new.maxWeight = self.maxWeight + return new class PersonInfo: """ Information about a commiter, and his relation to other commiters""" @@ -118,7 +175,7 @@ class PersonInfo: if ID in link_hash.keys(): return link_hash[ID] else: - return 0 + return RelationWeights() def getActiveTagsReceivedByID(self, ID): return self._getLinksReceivedByID(self.active_tags_received_by_id, ID) @@ -133,18 +190,18 @@ class PersonInfo: def getAllTagsReceivedByID(self, ID): return self._getTagsReceivedByID(self.all_tags_received_by_id, ID) - def addRelation(self, relation_type, ID, assoc, weight=1): + def addRelation(self, relation_type, ID, assoc, weight): """State that the person has received or given a tag from/to ID. The distinction between taking and giving is made in other functions.""" if (ID in assoc[relation_type]): - assoc[relation_type][ID] += weight + assoc[relation_type][ID].add_weight(weight) else: - assoc[relation_type][ID] = weight + assoc[relation_type][ID] = RelationWeights(weight) - def addReceiveRelation(self, relation_type, ID, weight=1): + def addReceiveRelation(self, relation_type, ID, weight): ''' add a one directional relation from the person identified by ID and this person instance (ie. 
self) @@ -154,7 +211,7 @@ class PersonInfo: self.addRelation(relation_type, ID, self.associations, weight) - def addSendRelation(self, relation_type, ID, cmt, weight=1): + def addSendRelation(self, relation_type, ID, cmt, weight): ''' add a one directional relation from the person instance (ie. self) and the person identified by ID @@ -162,7 +219,7 @@ class PersonInfo: the weight parameter specified the edge strength ''' - self.addRelation(relation_type, ID, self.inv_associations) + self.addRelation(relation_type, ID, self.inv_associations, weight) if relation_type in tag_types: self.tagged_commits[relation_type].append(cmt.id) @@ -187,10 +244,11 @@ class PersonInfo: # Helper for computeTagStats, see below def _sum_relations(self, relation_type, rcv_by_id_hash): for ID in self.associations[relation_type]: + weights = self.associations[relation_type][ID] if ID in rcv_by_id_hash: - rcv_by_id_hash[ID] += self.associations[relation_type][ID] + rcv_by_id_hash[ID].add_weights(weights) else: - rcv_by_id_hash[ID] = self.associations[relation_type][ID] + rcv_by_id_hash[ID] = weights.copy() def computeStats(self, link_type): diff --git a/codeface/cluster/cluster.py b/codeface/cluster/cluster.py index 976c85c..7fcbaf8 100755 --- a/codeface/cluster/cluster.py +++ b/codeface/cluster/cluster.py @@ -35,6 +35,7 @@ from logging import getLogger; log = getLogger(__name__) from codeface import kerninfo from codeface.commit_analysis import (getSignoffCount, getSignoffEtcCount, getInvolvedPersons, tag_types) +from codeface.cluster.PersonInfo import RelationWeight from codeface.VCS import gitVCS from codeface.dbmanager import DBManager, tstamp_to_sql from .PersonInfo import PersonInfo @@ -296,7 +297,7 @@ def groupFuncLines(file_commit, file_state, cmtList): append(codeBlock.codeBlock(blk_start, blk_end, cmtList[str(curr_cmt_id)].getAuthorPI().getID(), cmtList[str(curr_cmt_id)].getCommitterPI().getID(), - curr_cmt_id)) + curr_cmt_id, curr_func_id)) blk_start = next_line blk_end = 
blk_start @@ -304,7 +305,7 @@ def groupFuncLines(file_commit, file_state, cmtList): func_blks[next_func_indx].append(codeBlock.codeBlock(blk_start, blk_end, cmtList[str(next_cmt_id)].getAuthorPI().getID(), cmtList[str(next_cmt_id)].getCommitterPI().getID(), - next_cmt_id)) + next_cmt_id, curr_func_id)) return func_blks @@ -350,7 +351,7 @@ def group_feature_lines(file_commit, file_state, cmt_list): .getID(), cmt_list[str(curr_cmt_id)].getCommitterPI() .getID(), - curr_cmt_id)) + curr_cmt_id, feature)) blk_start[feature] = next_line blk_end[feature] = next_line @@ -362,7 +363,7 @@ def group_feature_lines(file_commit, file_state, cmt_list): blk_start[feature], blk_end[feature], cmt_list[str(next_cmt_id)].getAuthorPI().getID(), cmt_list[str(next_cmt_id)].getCommitterPI().getID(), - next_cmt_id)) + next_cmt_id, feature)) return feature_blks @@ -486,7 +487,7 @@ def computeCommitCollaboration(codeBlks, cmt, id_mgr, link_type, maxDist, # collaboration strength is seen as the sum of the newly contributed # lines of code and previously committed code by the other person - collaboration_strength = computeBlksSize(revCmtBlks, oldRevBlks) + collaboration_strength = compute_block_weight(revCmtBlks, oldRevBlks) #store result if author: @@ -563,6 +564,13 @@ def computeBlksSize(blks1, blks2): return size_total +def compute_block_weight(blocks1, blocks2): + commit_ids1 = [blk.cmtHash for blk in blocks1] + commit_ids2 = [blk.cmtHash for blk in blocks2] + size = computeBlksSize(blocks1, blocks2) + return RelationWeight(size, blocks1[0].get_group_name(), commit_ids1, commit_ids2) + + def computeEdgeStrength(blk1, blk2, maxDist): ''' Calculates a value that indicates how strongly the two @@ -1229,13 +1237,13 @@ def writeAdjMatrix2File(id_mgr, outdir, conf): if link_type == LinkType.tag: for id_receiver in idlist: out.write("\t".join( - [str(id_mgr.getPI(id_receiver).getActiveTagsReceivedByID(id_sender)) + [str(id_mgr.getPI(id_receiver).getActiveTagsReceivedByID(id_sender).get_weight()) 
for id_sender in idlist]) + "\n") else: for id_receiver in idlist: out.write("\t".join( - [str(id_mgr.getPI(id_receiver).getLinksReceivedByID(id_sender, link_type)) + [str(id_mgr.getPI(id_receiver).getLinksReceivedByID(id_sender, link_type).get_weight()) for id_sender in idlist]) + "\n") @@ -1243,6 +1251,59 @@ def writeAdjMatrix2File(id_mgr, outdir, conf): out.close() +def writeAdjMatrixMaxWeight2File(id_mgr, outdir, conf): + ''' + Connections between the developers are written to the outdir location + in adjacency matrix format + ''' + + # Store the adjacency matrix for developer network, i.e., create + # a NxN matrix in which the entry a_{i,j} denotes how strongly + # developer j was associated with developer i + # NOTE: This produces a sparse matrix, but since the number + # of developers is only a few thousand, it will likely not pay + # off to utilise this fact for more efficient storage. + + link_type = conf["tagging"] + out = open(os.path.join(outdir, "adjacencyMatrix_max_weight.txt"), 'wb') + idlist = sorted(id_mgr.getPersons().keys()) + # Header + out.write("" + + "\t".join([str(elem) for elem in idlist]) + + "\n") + + # Matrix. The sum of all elements in row N describes how many + # tags id N has received. The sum of column N states how many + # tags were given by id N to other developers. 
+ def get_tags_received_by_id_max_group_name(id_receiver, id_sender): + max_weight = id_mgr.getPI(id_receiver).getActiveTagsReceivedByID(id_sender).get_max_weight() + if max_weight is None: + return "None" + else: + return str(max_weight.get_group_name()) + + def get_links_received_by_id_max_group_name(id_receiver, id_sender): + max_weight = id_mgr.getPI(id_receiver).getLinksReceivedByID(id_sender, link_type).get_max_weight() + if max_weight is None: + return "None" + else: + return str(max_weight.get_group_name()) + + if link_type == LinkType.tag: + for id_receiver in idlist: + out.write("\t".join( + [get_tags_received_by_id_max_group_name(id_receiver, id_sender) + for id_sender in idlist]) + "\n") + + else: + for id_receiver in idlist: + out.write("\t".join( + [get_links_received_by_id_max_group_name(id_receiver, id_sender) + for id_sender in idlist]) + "\n") + + out.close() + + def emitStatisticalData(cmtlist, id_mgr, logical_depends, outdir, releaseRangeID, dbm, conf, fileCommitDict): """Save the available information for a release interval for further statistical processing. 
@@ -1264,6 +1325,8 @@ def emitStatisticalData(cmtlist, id_mgr, logical_depends, outdir, releaseRangeID writeAdjMatrix2File(id_mgr, outdir, conf) + writeAdjMatrixMaxWeight2File(id_mgr, outdir, conf) + if logical_depends is not None: writeDependsToDB(logical_depends, fileCommitDict, cmtlist, releaseRangeID, dbm, conf) @@ -1439,17 +1502,17 @@ def compute_feature_proximity_links(file_commit_list, cmt_list, id_mgr, link_typ #of code if len(file_state_mod) > 1: # identify code line clustering using feature location information - feature_clusters = group_feature_lines(file_commit, file_state_mod, cmt_list) + feature_groups = group_feature_lines(file_commit, file_state_mod, cmt_list) # We now have a 'feature -> codeblock list' mapping - for feature in feature_clusters: + for feature in feature_groups: if feature not in author_feature_changes: author_feature_changes[feature] = {} author_changes = author_feature_changes[feature] - codeBlks = feature_clusters[feature] + feature_group = feature_groups[feature] #get all blocks contributed by the revision commit we are looking at - revCmtBlks = [blk for blk in codeBlks if blk.cmtHash == cmt.id] + revCmtBlks = [blk for blk in feature_group if blk.cmtHash == cmt.id] if revCmtBlks: #get the person responsible for this revision if author: @@ -1458,9 +1521,9 @@ def compute_feature_proximity_links(file_commit_list, cmt_list, id_mgr, link_typ revPerson = id_mgr.getPI( revCmtBlks[0].committerId ) if revPerson not in author_changes: - author_changes[revPerson] = 0 - - author_changes[revPerson] += computeBlksSize(revCmtBlks, []) + author_changes[revPerson] = list(revCmtBlks) + else: + author_changes[revPerson].extend(revCmtBlks) # Now we calculate the collaboration strength between authors as # (SUM(MIN(line-changes of author1 on feature, line-changes of author2 on feature) FOR feature IN features)) @@ -1469,9 +1532,13 @@ def compute_feature_proximity_links(file_commit_list, cmt_list, id_mgr, link_typ for author1 in author_changes: for 
author2 in author_changes: if author1 is not author2: - strength = min(author_changes[author1], author_changes[author2]) - author1.addSendRelation(link_type, author2.getID(), cmt, strength) - author2.addReceiveRelation(link_type, author1.getID(), strength) + weight = compute_block_weight(author_changes[author1], author_changes[author2]) + size1 = computeBlksSize(author_changes[author1], []) + size2 = computeBlksSize(author_changes[author2], []) + weight = RelationWeight( + min(size1, size2), feature, weight.get_commit_ids1(), weight.get_commit_ids2()) + author1.addSendRelation(link_type, author2.getID(), cmt, weight) + author2.addReceiveRelation(link_type, author1.getID(), weight) #del author_feature_changes[author1] #raise Exception("feature proximity links is not implemented!") @@ -1542,7 +1609,7 @@ def computeTagLinks(cmtlist, id_mgr): # With every person, we can associate statistical information into # which subsystems he/she typically commits, with whom he collaborates, # and so on. From this, we can infer further information for each - # commit, for instance how many people working on different subststems + # commit, for instance how many people working on different subsystems # have signed off the commit, of how important the people who sign off # the commit are. 
diff --git a/codeface/cluster/codeBlock.py b/codeface/cluster/codeBlock.py index 2096ca6..eb26437 100644 --- a/codeface/cluster/codeBlock.py +++ b/codeface/cluster/codeBlock.py @@ -27,20 +27,27 @@ class codeBlock: # Constructors #=========================== - def __init__(self, start=None, end=None, authorId=None, committerId=None, cmtHash=None): + def __init__(self, start=None, end=None, authorId=None, committerId=None, cmtHash=None, groupName=None): self.start = start #start of the code block self.end = end #end of the code block self.authorId = authorId self.committerId = committerId self.cmtHash = cmtHash + # specified the name of this block, + # this is saved so we can trace where collaborations come from + # ie which function/feature/file is responsible for a specific + # collaboration. + self.groupName = groupName #=========================== # Accessors and Modifiers #=========================== + def get_group_name(self): + return self.groupName + def get_codeLines(self): return self.codeLines - def add_codeLine(self, lineNum, cmtHash, authorId, committerId): self.codeLines.append( codeLine.codeLine(lineNum, cmtHash, authorId, committerId) ) \ No newline at end of file -- 1.8.5.5