[codeface] [PATCH 11/12] Trace features and functions in the collaboration analysis.

  • From: Matthias Dittrich <matthi.d@xxxxxxxxxxxxxx>
  • To: codeface@xxxxxxxxxxxxx
  • Date: Wed, 19 Nov 2014 21:40:30 +0100

We now support tracing commits, features and functions through the strength of 
a relation.
This allows us to include several new pieces of information in the output.
Currently we write a new file 'adjacencyMatrix_max_weight.txt' which contains 
the feature or
function with the most influence instead of the collaboration strength 
(which is still written to 'adjacencyMatrix.txt').
This would work for files as well, but we would have to change 'File_Level' to 
something like '$filename' for the output to be useful.

Signed-off-by: Matthias Dittrich <matthi.d@xxxxxxxxx>
---
 codeface/cluster/PersonInfo.py |  80 +++++++++++++++++++++++++++-----
 codeface/cluster/cluster.py    | 103 ++++++++++++++++++++++++++++++++++-------
 codeface/cluster/codeBlock.py  |  11 ++++-
 3 files changed, 163 insertions(+), 31 deletions(-)

diff --git a/codeface/cluster/PersonInfo.py b/codeface/cluster/PersonInfo.py
index 8069b78..ff38bd2 100644
--- a/codeface/cluster/PersonInfo.py
+++ b/codeface/cluster/PersonInfo.py
@@ -16,8 +16,65 @@
 # All Rights Reserved.
 
 from logging import getLogger; log = getLogger(__name__)
-from codeface.commit_analysis import tag_types, active_tag_types, 
proximity_relation \
-, file_relation, committer2author_relation, all_link_types
+from codeface.commit_analysis import tag_types, active_tag_types, 
proximity_relation, \
+    file_relation, committer2author_relation, all_link_types
+
+
+# Readonly class
+class RelationWeight:
+    def __init__(self, weight, group_name, commit_ids1, commit_ids2):
+        self.weight = weight
+        self.groupName = group_name
+        self.commitIds1 = commit_ids1
+        self.commitIds2 = commit_ids2
+
+    def get_weight(self):
+        return self.weight
+
+    def get_commit_ids1(self):
+        return self.commitIds1
+
+    def get_commit_ids2(self):
+        return self.commitIds2
+
+    def get_group_name(self):
+        return self.groupName
+
+
+class RelationWeights:
+    def __init__(self, init_weight=None):
+        if init_weight is not None:
+            self.weightSum = init_weight.get_weight()
+            self.weights = [init_weight]
+            self.maxWeight = init_weight
+        else:
+            self.weightSum = 0
+            self.weights = []
+            self.maxWeight = None
+
+    def get_weight(self):
+        return self.weightSum
+
+    def get_max_weight(self):
+        return self.maxWeight
+
+    def add_weight(self, new_weight):
+        weight = new_weight.get_weight()
+        if (self.maxWeight is None) or (weight > self.maxWeight.get_weight()):
+            self.maxWeight = new_weight
+        self.weightSum += weight
+        self.weights.append(new_weight)
+
+    def add_weights(self, weights):
+        for weight in weights:
+            self.add_weight(weight)
+
+    def copy(self):
+        new = RelationWeights()
+        new.weightSum = self.weightSum
+        new.weights = list(self.weights)
+        new.maxWeight = self.maxWeight
+        return new
 
 class PersonInfo:
     """ Information about a commiter, and his relation to other commiters"""
@@ -118,7 +175,7 @@ class PersonInfo:
         if ID in link_hash.keys():
             return link_hash[ID]
         else:
-            return 0
+            return RelationWeights()
 
     def getActiveTagsReceivedByID(self, ID):
         return self._getLinksReceivedByID(self.active_tags_received_by_id, ID)
@@ -133,18 +190,18 @@ class PersonInfo:
     def getAllTagsReceivedByID(self, ID):
         return self._getTagsReceivedByID(self.all_tags_received_by_id, ID)
 
-    def addRelation(self, relation_type, ID, assoc, weight=1):
+    def addRelation(self, relation_type, ID, assoc, weight):
         """State that the person has received or given a tag from/to ID.
 
         The distinction between taking and giving is made in other
         functions."""
 
         if (ID in assoc[relation_type]):
-            assoc[relation_type][ID] += weight
+            assoc[relation_type][ID].add_weight(weight)
         else:
-            assoc[relation_type][ID] = weight
+            assoc[relation_type][ID] = RelationWeights(weight)
 
-    def addReceiveRelation(self, relation_type, ID, weight=1):
+    def addReceiveRelation(self, relation_type, ID, weight):
         '''
         add a one directional relation from the person identified by
         ID and this person instance (ie. self)
@@ -154,7 +211,7 @@ class PersonInfo:
 
         self.addRelation(relation_type, ID, self.associations, weight)
 
-    def addSendRelation(self, relation_type, ID, cmt, weight=1):
+    def addSendRelation(self, relation_type, ID, cmt, weight):
         '''
         add a one directional relation from the person instance
         (ie. self) and the person identified by ID
@@ -162,7 +219,7 @@ class PersonInfo:
         the weight parameter specified the edge strength
         '''
 
-        self.addRelation(relation_type, ID, self.inv_associations)
+        self.addRelation(relation_type, ID, self.inv_associations, weight)
 
         if relation_type in tag_types:
             self.tagged_commits[relation_type].append(cmt.id)
@@ -187,10 +244,11 @@ class PersonInfo:
     # Helper for computeTagStats, see below
     def _sum_relations(self, relation_type, rcv_by_id_hash):
         for ID in self.associations[relation_type]:
+            weights = self.associations[relation_type][ID]
             if ID in rcv_by_id_hash:
-                rcv_by_id_hash[ID] += self.associations[relation_type][ID]
+                rcv_by_id_hash[ID].add_weights(weights)
             else:
-                rcv_by_id_hash[ID] = self.associations[relation_type][ID]
+                rcv_by_id_hash[ID] = weights.copy()
 
     def computeStats(self, link_type):
 
diff --git a/codeface/cluster/cluster.py b/codeface/cluster/cluster.py
index 976c85c..7fcbaf8 100755
--- a/codeface/cluster/cluster.py
+++ b/codeface/cluster/cluster.py
@@ -35,6 +35,7 @@ from logging import getLogger; log = getLogger(__name__)
 from codeface import kerninfo
 from codeface.commit_analysis import (getSignoffCount, getSignoffEtcCount,
         getInvolvedPersons, tag_types)
+from codeface.cluster.PersonInfo import RelationWeight
 from codeface.VCS import gitVCS
 from codeface.dbmanager import DBManager, tstamp_to_sql
 from .PersonInfo import PersonInfo
@@ -296,7 +297,7 @@ def groupFuncLines(file_commit, file_state, cmtList):
             append(codeBlock.codeBlock(blk_start, blk_end,
                    cmtList[str(curr_cmt_id)].getAuthorPI().getID(),
                    cmtList[str(curr_cmt_id)].getCommitterPI().getID(),
-                   curr_cmt_id))
+                   curr_cmt_id, curr_func_id))
             blk_start = next_line
             blk_end   = blk_start
 
@@ -304,7 +305,7 @@ def groupFuncLines(file_commit, file_state, cmtList):
     func_blks[next_func_indx].append(codeBlock.codeBlock(blk_start, blk_end,
                             cmtList[str(next_cmt_id)].getAuthorPI().getID(),
                             cmtList[str(next_cmt_id)].getCommitterPI().getID(),
-                            next_cmt_id))
+                            next_cmt_id, curr_func_id))
 
     return func_blks
 
@@ -350,7 +351,7 @@ def group_feature_lines(file_commit, file_state, cmt_list):
                                 .getID(),
                                 cmt_list[str(curr_cmt_id)].getCommitterPI()
                                 .getID(),
-                                curr_cmt_id))
+                                curr_cmt_id, feature))
                 blk_start[feature] = next_line
                 blk_end[feature] = next_line
 
@@ -362,7 +363,7 @@ def group_feature_lines(file_commit, file_state, cmt_list):
                     blk_start[feature], blk_end[feature],
                     cmt_list[str(next_cmt_id)].getAuthorPI().getID(),
                     cmt_list[str(next_cmt_id)].getCommitterPI().getID(),
-                    next_cmt_id))
+                    next_cmt_id, feature))
 
     return feature_blks
 
@@ -486,7 +487,7 @@ def computeCommitCollaboration(codeBlks, cmt, id_mgr, 
link_type, maxDist,
 
         # collaboration strength is seen as the sum of the newly contributed
         # lines of code and previously committed code by the other person
-        collaboration_strength = computeBlksSize(revCmtBlks, oldRevBlks)
+        collaboration_strength = compute_block_weight(revCmtBlks, oldRevBlks)
 
         #store result
         if author:
@@ -563,6 +564,13 @@ def computeBlksSize(blks1, blks2):
     return size_total
 
 
+def compute_block_weight(blocks1, blocks2):
+    commit_ids1 = [blk.cmtHash for blk in blocks1]
+    commit_ids2 = [blk.cmtHash for blk in blocks2]
+    size = computeBlksSize(blocks1, blocks2)
+    return RelationWeight(size, blocks1[0].get_group_name(), commit_ids1, 
commit_ids2)
+
+
 def computeEdgeStrength(blk1, blk2, maxDist):
     '''
     Calculates a value that indicates how strongly the two
@@ -1229,13 +1237,13 @@ def writeAdjMatrix2File(id_mgr, outdir, conf):
     if link_type == LinkType.tag:
         for id_receiver in idlist:
             out.write("\t".join(
-                
[str(id_mgr.getPI(id_receiver).getActiveTagsReceivedByID(id_sender))
+                
[str(id_mgr.getPI(id_receiver).getActiveTagsReceivedByID(id_sender).get_weight())
                    for id_sender in idlist]) + "\n")
 
     else:
         for id_receiver in idlist:
             out.write("\t".join(
-                [str(id_mgr.getPI(id_receiver).getLinksReceivedByID(id_sender, 
link_type))
+                [str(id_mgr.getPI(id_receiver).getLinksReceivedByID(id_sender, 
link_type).get_weight())
                    for id_sender in idlist]) + "\n")
 
 
@@ -1243,6 +1251,59 @@ def writeAdjMatrix2File(id_mgr, outdir, conf):
     out.close()
 
 
+def writeAdjMatrixMaxWeight2File(id_mgr, outdir, conf):
+    '''
+    Connections between the developers are written to the outdir location
+    in adjacency matrix format
+    '''
+
+    # Store the adjacency matrix for developer network, i.e., create
+    # a NxN matrix in which the entry a_{i,j} denotes how strongly
+    # developer j was associated with developer i
+    # NOTE: This produces a sparse matrix, but since the number
+    # of developers is only a few thousand, it will likely not pay
+    # off to utilise this fact for more efficient storage.
+
+    link_type = conf["tagging"]
+    out = open(os.path.join(outdir, "adjacencyMatrix_max_weight.txt"), 'wb')
+    idlist = sorted(id_mgr.getPersons().keys())
+    # Header
+    out.write("" +
+              "\t".join([str(elem) for elem in idlist]) +
+              "\n")
+
+    # Matrix. The sum of all elements in row N describes how many
+    # tags id N has received. The sum of column N states how many
+    # tags were given by id N to other developers.
+    def get_tags_received_by_id_max_group_name(id_receiver, id_sender):
+        max_weight = 
id_mgr.getPI(id_receiver).getActiveTagsReceivedByID(id_sender).get_max_weight()
+        if max_weight is None:
+            return "None"
+        else:
+            return str(max_weight.get_group_name())
+
+    def get_links_received_by_id_max_group_name(id_receiver, id_sender):
+        max_weight = id_mgr.getPI(id_receiver).getLinksReceivedByID(id_sender, 
link_type).get_max_weight()
+        if max_weight is None:
+            return "None"
+        else:
+            return str(max_weight.get_group_name())
+
+    if link_type == LinkType.tag:
+        for id_receiver in idlist:
+            out.write("\t".join(
+                [get_tags_received_by_id_max_group_name(id_receiver, id_sender)
+                   for id_sender in idlist]) + "\n")
+
+    else:
+        for id_receiver in idlist:
+            out.write("\t".join(
+                [get_links_received_by_id_max_group_name(id_receiver, 
id_sender)
+                   for id_sender in idlist]) + "\n")
+
+    out.close()
+
+
 def emitStatisticalData(cmtlist, id_mgr, logical_depends, outdir, 
releaseRangeID, dbm, conf, 
                         fileCommitDict):
     """Save the available information for a release interval for further 
statistical processing.
@@ -1264,6 +1325,8 @@ def emitStatisticalData(cmtlist, id_mgr, logical_depends, 
outdir, releaseRangeID
 
     writeAdjMatrix2File(id_mgr, outdir, conf)
 
+    writeAdjMatrixMaxWeight2File(id_mgr, outdir, conf)
+
     if logical_depends is not None:
         writeDependsToDB(logical_depends, fileCommitDict, cmtlist, 
releaseRangeID, dbm, conf)
 
@@ -1439,17 +1502,17 @@ def compute_feature_proximity_links(file_commit_list, 
cmt_list, id_mgr, link_typ
             #of code
             if len(file_state_mod) > 1:
                 # identify code line clustering using feature location 
information
-                feature_clusters = group_feature_lines(file_commit, 
file_state_mod, cmt_list)
+                feature_groups = group_feature_lines(file_commit, 
file_state_mod, cmt_list)
                 # We now have a 'feature -> codeblock list' mapping
-                for feature in feature_clusters:
+                for feature in feature_groups:
                     if feature not in author_feature_changes:
                         author_feature_changes[feature] = {}
                     author_changes = author_feature_changes[feature]
 
-                    codeBlks = feature_clusters[feature]
+                    feature_group = feature_groups[feature]
 
                     #get all blocks contributed by the revision commit we are 
looking at
-                    revCmtBlks = [blk for blk in codeBlks if blk.cmtHash == 
cmt.id]
+                    revCmtBlks = [blk for blk in feature_group if blk.cmtHash 
== cmt.id]
                     if revCmtBlks:
                         #get the person responsible for this revision
                         if author:
@@ -1458,9 +1521,9 @@ def compute_feature_proximity_links(file_commit_list, 
cmt_list, id_mgr, link_typ
                             revPerson = id_mgr.getPI( 
revCmtBlks[0].committerId )
 
                         if revPerson not in author_changes:
-                            author_changes[revPerson] = 0
-
-                        author_changes[revPerson] += 
computeBlksSize(revCmtBlks, [])
+                            author_changes[revPerson] = list(revCmtBlks)
+                        else:
+                            author_changes[revPerson].extend(revCmtBlks)
 
     # Now we calculate the collaboration strength between authors as
     # (SUM(MIN(line-changes of author1 on feature, line-changes of author2 on 
feature) FOR feature IN features))
@@ -1469,9 +1532,13 @@ def compute_feature_proximity_links(file_commit_list, 
cmt_list, id_mgr, link_typ
         for author1 in author_changes:
             for author2 in author_changes:
                 if author1 is not author2:
-                    strength = min(author_changes[author1], 
author_changes[author2])
-                    author1.addSendRelation(link_type, author2.getID(), cmt, 
strength)
-                    author2.addReceiveRelation(link_type, author1.getID(), 
strength)
+                    weight = compute_block_weight(author_changes[author1], 
author_changes[author2])
+                    size1 = computeBlksSize(author_changes[author1], [])
+                    size2 = computeBlksSize(author_changes[author2], [])
+                    weight = RelationWeight(
+                        min(size1, size2), feature, weight.get_commit_ids1(), 
weight.get_commit_ids2())
+                    author1.addSendRelation(link_type, author2.getID(), cmt, 
weight)
+                    author2.addReceiveRelation(link_type, author1.getID(), 
weight)
         #del author_feature_changes[author1]
 
     #raise Exception("feature proximity links is not implemented!")
@@ -1542,7 +1609,7 @@ def computeTagLinks(cmtlist, id_mgr):
     # With every person, we can associate statistical information into
     # which subsystems he/she typically commits, with whom he collaborates,
     # and so on. From this, we can infer further information for each
-    # commit, for instance how many people working on different subststems
+    # commit, for instance how many people working on different subsystems
     # have signed off the commit, of how important the people who sign off
     # the commit are.
 
diff --git a/codeface/cluster/codeBlock.py b/codeface/cluster/codeBlock.py
index 2096ca6..eb26437 100644
--- a/codeface/cluster/codeBlock.py
+++ b/codeface/cluster/codeBlock.py
@@ -27,20 +27,27 @@ class codeBlock:
     # Constructors
     #===========================
 
-    def __init__(self, start=None, end=None, authorId=None, committerId=None, 
cmtHash=None):
+    def __init__(self, start=None, end=None, authorId=None, committerId=None, 
cmtHash=None, groupName=None):
 
         self.start       = start #start of the code block
         self.end         = end   #end of the code block
         self.authorId    = authorId
         self.committerId = committerId
         self.cmtHash     = cmtHash
+        # specified the name of this block,
+        # this is saved so we can trace where collaborations come from
+        # ie which function/feature/file is responsible for a specific
+        # collaboration.
+        self.groupName     = groupName
     #===========================
     # Accessors and Modifiers
     #===========================
 
+    def get_group_name(self):
+        return self.groupName
+
     def get_codeLines(self):
         return self.codeLines
 
-
     def add_codeLine(self, lineNum, cmtHash, authorId, committerId):
         self.codeLines.append( codeLine.codeLine(lineNum, cmtHash, authorId, 
committerId) )
\ No newline at end of file
-- 
1.8.5.5


Other related posts:

  • » [codeface] [PATCH 11/12] Trace features and functions in the collaboration analysis. - Matthias Dittrich