[codeface] [PATCH 07/12] Added some functions to calculate the collaboration between developers with feature instead of function blocks.

From: Matthias Dittrich <matthi.d@xxxxxxxxxxxxxx>
To: codeface@xxxxxxxxxxxxx
Date: Wed, 19 Nov 2014 21:40:26 +0100
Added some functions similar to the proximity analysis functions, but for 
features.
- computeProximityLinks -> compute_feature_proximity_links_per_file (the 
per-file feature collaboration analysis).
- computeSnapshotCollaboration -> compute_snapshot_collaboration_features 
(compute the collaboration for a file snapshot).
- linesOfInterest -> lines_of_interest_features (filter out uninteresting 
lines, i.e. lines not relevant for the current snapshot).
- groupFuncLines -> group_feature_lines (we group feature instead of function 
blocks).
- computeCommitCollaboration was reused, as we produce the same abstract 
code blocks for features.
- computeProximityLinks -> compute_feature_proximity_links (the project wide 
feature collaboration analysis).

Signed-off-by: Matthias Dittrich <matthi.d@xxxxxxxxx>
Reviewed-by: Wolfgang Mauerer <wolfgang.mauerer@xxxxxxxxxxx>
---
 codeface/cluster/cluster.py | 301 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 301 insertions(+)

diff --git a/codeface/cluster/cluster.py b/codeface/cluster/cluster.py
index 3589d5f..542275c 100755
--- a/codeface/cluster/cluster.py
+++ b/codeface/cluster/cluster.py
@@ -197,6 +197,79 @@ def computeSnapshotCollaboration(file_commit, cmtList, 
id_mgr, link_type,
                                         maxDist, author) for cluster in 
clusters if cluster]
 
 
+def compute_snapshot_collaboration_features(
+        file_commit, cmt_list, id_mgr, link_type, start_date=None,
+        random=False):
+    """
+    Generates the collaboration data from a file snapshot at a particular
+    point in time, clustering code lines by feature.
+
+    This function is quite similar to computeSnapshotCollaboration, but
+    the interesting lines and the line groups are identified per feature
+    instead of per function.
+
+    - Input -
+    file_commit: object providing getFileSnapShot() and getrevCmts()
+    cmt_list:    mapping from commit id to commit object
+    id_mgr:      passed through to computeCommitCollaboration
+    link_type:   passed through to computeCommitCollaboration
+    start_date:  if not None, removePriorCommits() is applied with it
+    random:      if True, the lines_of_interest_features() filter is
+                 skipped
+    - Output -
+    None; any result is produced by computeCommitCollaboration (not
+    visible from here).
+    """
+    max_dist = 25
+    author = True
+    file_state = file_commit.getFileSnapShot()
+    rev_cmts = [cmt_list[rev_cmt_id]
+                for rev_cmt_id in file_commit.getrevCmts()]
+
+    # commit ids present in the snapshot, collected once so that the
+    # relevance check below does not rescan the snapshot for every commit
+    snapshot_cmt_ids = set(file_state.values())
+
+    for cmt in rev_cmts:
+        # a commit that is not in the current revision of the file is
+        # irrelevant, no need to process it any further
+        if cmt.id not in snapshot_cmt_ids:
+            continue
+
+        # work on a copy: every commit has to start from the original
+        # file state
+        file_state_mod = file_state.copy()
+
+        # keep only the lines of interest, i.e. the lines that share a
+        # feature with the lines contributed by cmt.id
+        if not random:
+            file_state_mod = lines_of_interest_features(
+                file_state_mod, cmt.id, cmt_list, file_commit)
+
+        # remove commits that occur prior to the specified start_date
+        if start_date is not None:
+            file_state_mod = removePriorCommits(
+                file_state_mod, cmt_list, start_date)
+
+        # collaboration is meaningless without more than one line of
+        # code, because we need another line as reference
+        if len(file_state_mod) <= 1:
+            continue
+
+        # identify code line clustering using feature location information
+        feature_clusters = group_feature_lines(
+            file_commit, file_state_mod, cmt_list)
+
+        for feature_cluster in feature_clusters.values():
+            # only clusters that contain a block of the current commit
+            # are relevant (any() is False for an empty cluster)
+            if any(blk.cmtHash == cmt.id for blk in feature_cluster):
+                # calculate the collaboration coefficient for each
+                # code block
+                computeCommitCollaboration(
+                    feature_cluster, cmt, id_mgr, link_type, max_dist,
+                    author)
+
+
 def groupFuncLines(file_commit, file_state, cmtList):
     '''
     cluster code lines that fall under the same function
@@ -242,6 +315,65 @@ def groupFuncLines(file_commit, file_state, cmtList):
 
     return func_blks
 
+
+def _make_feature_block(cmt_list, cmt_id, start, end):
+    """
+    build the codeBlock covering lines start..end for the commit cmt_id
+    """
+    cmt = cmt_list[str(cmt_id)]
+    return codeBlock.codeBlock(
+        start, end,
+        cmt.getAuthorPI().getID(),
+        cmt.getCommitterPI().getID(),
+        cmt_id)
+
+
+def group_feature_lines(file_commit, file_state, cmt_list):
+    """
+    cluster code lines that fall under the same feature
+
+    - Input -
+    file_commit: object providing feature_info (its values are iterables
+                 of features) and findFeatureList(line number)
+    file_state:  mapping from line number (as string) to commit id
+    cmt_list:    mapping from commit id to commit object
+    - Output -
+    feature_blks: mapping from feature to the list of codeBlocks of that
+                  feature. A block is a run of consecutive lines that
+                  share the same commit id and all belong to the feature.
+    """
+    feature_blks = {}
+    for features in file_commit.feature_info.values():
+        for feature in features:
+            feature_blks[feature] = []
+
+    lines = sorted(map(int, file_state.keys()))
+    if not lines:
+        # nothing to cluster
+        return feature_blks
+
+    blk_start = dict.fromkeys(feature_blks, lines[0])
+    blk_end = dict.fromkeys(feature_blks, lines[0])
+
+    curr_line = lines[0]
+    curr_cmt_id = file_state[str(curr_line)]
+    curr_features = file_commit.findFeatureList(curr_line)
+
+    for next_line in lines[1:]:
+        next_cmt_id = file_state[str(next_line)]
+        next_features = file_commit.findFeatureList(next_line)
+        same_run = (curr_cmt_id == next_cmt_id) and \
+            (curr_line + 1 == next_line)
+
+        for feature in feature_blks:
+            if same_run and (feature in curr_features) and \
+                    (feature in next_features):
+                # nothing changed for this feature, extend the open block
+                blk_end[feature] = next_line
+            else:
+                # block for this feature finished
+                if feature in curr_features:
+                    feature_blks[feature].append(
+                        _make_feature_block(
+                            cmt_list, curr_cmt_id,
+                            blk_start[feature], blk_end[feature]))
+                blk_start[feature] = next_line
+                blk_end[feature] = next_line
+
+        curr_line = next_line
+        curr_cmt_id = next_cmt_id
+        curr_features = next_features
+
+    # boundary case for open code-blocks: the open block always ends at the
+    # last line, so it belongs to exactly the features of the last line
+    # (curr_* refers to the last line here, also for a single-line state)
+    for feature in feature_blks:
+        if feature in curr_features:
+            feature_blks[feature].append(
+                _make_feature_block(
+                    cmt_list, curr_cmt_id,
+                    blk_start[feature], blk_end[feature]))
+
+    return feature_blks
+
+
 def randomizeCommitCollaboration(codeBlks, fileState):
     '''
     randomizes the location in the file where commits were made
@@ -708,6 +840,68 @@ def linesOfInterest(fileState, snapShotCommit, maxDist, 
cmtlist, file_commit):
     return modFileState
 
 
+def lines_of_interest_features(file_state, snapshot_commit, cmt_list,
+                               file_commit):
+    """
+    Finds the regions of interest for analyzing the file.
+    We only keep lines that share at least one feature with the lines
+    contributed by the commit of interest (snapshot_commit) and that were
+    not committed after it.
+
+    - Input -
+    file_state:      code line numbers together with commit hashes
+    snapshot_commit: the commit hash that marks when the file_state was
+                     acquired
+    cmt_list:        mapping from commit hash to commit object
+    file_commit:     a fileCommit instance
+    - Output -
+    mod_file_state: the file state after lines not of interest are removed
+    """
+    snapshot_cmt_date = cmt_list[snapshot_commit].getCdate()
+
+    # take a pass over the file_state to collect the features of all lines
+    # the snapshot commit contributed
+    snapshot_feature_set = set()
+    for line_num, cmt_id in file_state.items():
+        if cmt_id == snapshot_commit:
+            snapshot_feature_set.update(
+                file_commit.findFeatureList(int(line_num)))
+
+    mod_file_state = {}
+    for line_num, cmt_id in file_state.items():
+        if cmt_id not in cmt_list:
+            # must be an old commit that occurred in a prior release
+            continue
+
+        # only keep lines committed in the past with respect to the
+        # current snapshot commit, forget lines from future commits
+        if cmt_list[cmt_id].getCdate() <= snapshot_cmt_date:
+            # only keep lines that are in the same feature as the snapshot
+            # commit, ignore lines that belong to other features only
+            if any(feature in snapshot_feature_set
+                   for feature in
+                   file_commit.findFeatureList(int(line_num))):
+                mod_file_state[line_num] = cmt_id
+
+    return mod_file_state
+
+
 def blockDist(blk1, blk2):
     '''
     Finds the euclidean distance between two code blocks.
@@ -1182,6 +1376,113 @@ def computeProximityLinks(fileCommitList, cmtList, 
id_mgr, link_type, \
                                     in fileCommit.getFileSnapShots().items()]
 
 
+def compute_feature_proximity_links_per_file(file_commit_list, cmt_list,
+                                             id_mgr, link_type,
+                                             start_date=None, speed_up=True):
+    """
+    Constructs network based on commit proximity information
+
+    Two contributors are linked when they make a commit that is in
+    close proximity to each other (ie. same file AND same feature).
+    Collaboration is quantified by a single metric indicating the
+    strength of collaboration between two individuals.
+
+    - Input -
+    file_commit_list: mapping whose values are fileCommit objects
+    cmt_list, id_mgr, link_type, start_date:
+                      handed to compute_snapshot_collaboration_features
+    speed_up:         if True, analyse each file once; otherwise iterate
+                      over the items of getFileSnapShots()
+    """
+    for file_commit in file_commit_list.values():
+        if speed_up:
+            compute_snapshot_collaboration_features(
+                file_commit, cmt_list, id_mgr, link_type, start_date)
+            continue
+
+        # NOTE(review): the positional arguments below do not line up with
+        # the parameters (file_commit, cmt_list, id_mgr, link_type,
+        # start_date, random) of compute_snapshot_collaboration_features:
+        # [snapshot_key] is bound to cmt_list, cmt_list to id_mgr, and so
+        # on. The call is kept unchanged because the intended arguments
+        # are not clear from here -- TODO confirm and fix.
+        for snapshot_key, snapshot in file_commit.getFileSnapShots().items():
+            compute_snapshot_collaboration_features(
+                snapshot, [snapshot_key], cmt_list, id_mgr, link_type,
+                start_date)
+
+
+def compute_feature_proximity_links(file_commit_list, cmt_list, id_mgr,
+                                    link_type, start_date=None,
+                                    random=False):
+    """
+    Constructs network based on commit proximity information, same as
+    computeProximityLinks but for features instead of functions.
+
+    Because features (unlike functions) are split across files, we define
+    collaboration differently:
+
+    Two contributors are linked when they make a commit that is within the
+    same feature.
+    Collaboration between two contributors is quantified by the number of
+    lines they worked on in the same feature.
+    TODO!
+
+    - Input -
+    file_commit_list: mapping whose values are fileCommit objects
+    cmt_list:         mapping from commit id to commit object
+    id_mgr:           object providing getPI(person id)
+    link_type:        passed to addSendRelation / addReceiveRelation
+    start_date:       if not None, removePriorCommits() is applied with it
+    random:           if True, the lines_of_interest_features() filter is
+                      skipped
+    """
+    # code blocks are attributed to the author (not the committer)
+    author = True
+
+    # First we calculate how many lines each contributor changed in each
+    # feature: feature -> {person -> size of the person's code blocks}
+    author_feature_changes = {}
+
+    for file_commit in file_commit_list.values():
+        file_state = file_commit.getFileSnapShot()
+        rev_cmts = [cmt_list[rev_cmt_id]
+                    for rev_cmt_id in file_commit.getrevCmts()]
+
+        for cmt in rev_cmts:
+            # a commit that is not in the current revision of the file is
+            # irrelevant, no need to process it any further
+            if cmt.id not in file_state.values():
+                continue
+
+            # work on a copy: every commit has to start from the original
+            # file state
+            file_state_mod = file_state.copy()
+
+            # keep only the lines of interest, i.e. the lines that share a
+            # feature with the lines contributed by cmt.id
+            if not random:
+                file_state_mod = lines_of_interest_features(
+                    file_state_mod, cmt.id, cmt_list, file_commit)
+
+            # remove commits that occur prior to the specified start_date
+            if start_date is not None:
+                file_state_mod = removePriorCommits(
+                    file_state_mod, cmt_list, start_date)
+
+            # collaboration is meaningless without more than one line
+            # of code
+            if len(file_state_mod) <= 1:
+                continue
+
+            # identify code line clustering using feature location
+            # information; this gives a 'feature -> codeblock list' mapping
+            feature_clusters = group_feature_lines(
+                file_commit, file_state_mod, cmt_list)
+
+            for feature, code_blks in feature_clusters.items():
+                author_changes = \
+                    author_feature_changes.setdefault(feature, {})
+
+                # get all blocks contributed by the revision commit we are
+                # looking at
+                rev_cmt_blks = [blk for blk in code_blks
+                                if blk.cmtHash == cmt.id]
+                if not rev_cmt_blks:
+                    continue
+
+                # get the person responsible for this revision
+                if author:
+                    rev_person = id_mgr.getPI(rev_cmt_blks[0].authorId)
+                else:
+                    rev_person = id_mgr.getPI(rev_cmt_blks[0].committerId)
+
+                author_changes[rev_person] = \
+                    author_changes.get(rev_person, 0) + \
+                    computeBlksSize(rev_cmt_blks, [])
+
+    # Now we add, for every feature and every ordered pair of distinct
+    # contributors of that feature, a relation with the strength
+    # MIN(line-changes of author1 on feature, line-changes of author2 on
+    # feature). Presumably the relations accumulate these strengths over
+    # all features -- verify against addSendRelation/addReceiveRelation.
+    for author_changes in author_feature_changes.values():
+        for author1 in author_changes:
+            for author2 in author_changes:
+                if author1 is author2:
+                    continue
+                strength = min(author_changes[author1],
+                               author_changes[author2])
+                # NOTE(review): 'cmt' is the loop variable left over from
+                # the analysis above, i.e. the last commit that was
+                # iterated, and is not specific to this pair of authors.
+                # It is always bound here, because author_feature_changes
+                # is only filled inside that loop. Which commit should be
+                # recorded is not clear from here -- TODO confirm.
+                author1.addSendRelation(
+                    link_type, author2.getID(), cmt, strength)
+                author2.addReceiveRelation(
+                    link_type, author1.getID(), strength)
+
+
 def computeCommitterAuthorLinks(cmtlist, id_mgr):
     '''
     Constructs network based on the author and commiter of a commit
-- 
1.8.5.5
References:
- [codeface] [PATCH 00/12] Feature-aware Collaboration Analysis with Codeface (after first Review)
  - From: Matthias Dittrich
[codeface] [PATCH 07/12] Added some functions to calculate the collaboration between developers with feature instead of function blocks.

Other related posts: