def compute_snapshot_collaboration_features(file_commit, cmtList, id_mgr,
                                            link_type, startDate=None,
                                            random=False):
    """Generate collaboration data from a file snapshot at one point in time.

    The file snapshot is a representation of how a file looked at the time
    of a particular commit: a dictionary keyed by a commit hash whose value
    is another dictionary mapping a code line number to the commit hash that
    contributed that line.  The commit hashes are then used to reference the
    people involved.

    - Input -
    file_commit: a fileCommit instance providing the snapshot, revision
                 commits and per-line feature information
    cmtList:     dictionary mapping commit hash -> commit object
    id_mgr:      manager used to resolve person identities
    link_type:   the kind of collaboration link to record
    startDate:   if given, commits prior to this date are removed
    random:      if True, skip the locality filtering around each commit
    """
    # ------------------------
    # variable declarations
    # ------------------------
    maxDist = 25
    author = True
    fileState = file_commit.getFileSnapShot()
    revCmtIds = file_commit.getrevCmts()
    revCmts = [cmtList[revCmtId] for revCmtId in revCmtIds]

    for cmt in revCmts:
        # the fileState will be modified, so each iteration starts from a
        # copy of the original fileState
        fileState_mod = fileState.copy()

        # if the commit is not in the current revision of the file there is
        # no need to process further since the commit is now irrelevant
        if cmt.id not in fileState_mod.values():
            continue

        # find the code lines of interest: the lines that are localized
        # around the cmt.id hash; restrict the fileState to only those lines
        if not random:
            fileState_mod = lines_of_interest_features(fileState_mod, cmt.id,
                                                       cmtList, file_commit)

        # remove commits that occur prior to the specified startDate
        if startDate is not None:
            fileState_mod = removePriorCommits(fileState_mod, cmtList,
                                               startDate)

        # collaboration is meaningless without more than one line of code
        if len(fileState_mod) > 1:
            # identify code line clustering using feature location
            # information
            feature_clusters = group_feature_lines(file_commit,
                                                   fileState_mod, cmtList)
            for feature_cluster in feature_clusters.values():
                if feature_cluster:
                    # calculate the collaboration coefficient for each
                    # code block of this feature
                    computeCommitCollaboration(feature_cluster, cmt, id_mgr,
                                               link_type, maxDist, author)
def group_feature_lines(file_commit, file_state, cmtList):
    """Cluster code lines that fall under the same feature.

    - Input -
    file_commit: a fileCommit instance carrying per-line feature information
                 (feature_info, findFeatureList)
    file_state:  dictionary mapping line number (as string) -> commit hash
    cmtList:     dictionary mapping commit hash -> commit object
    - Output -
    dictionary mapping feature name -> list of codeBlock objects
    """
    feature_blks = {}
    lines = sorted(map(int, file_state.keys()))

    # nothing to cluster for an empty file state (the original code raised
    # IndexError on lines[0] here)
    if not lines:
        return feature_blks

    blk_start = {}
    blk_end = {}

    # every known feature opens a (potential) block at the first line
    for features in file_commit.feature_info.values():
        for feature in features:
            blk_start[feature] = lines[0]
            blk_end[feature] = lines[0]
            feature_blks[feature] = []

    for curr_line, next_line in zip(lines, lines[1:]):
        curr_cmt_id = file_state[str(curr_line)]
        next_cmt_id = file_state[str(next_line)]
        curr_features = file_commit.findFeatureList(curr_line)
        next_features = file_commit.findFeatureList(next_line)

        for feature in feature_blks:
            if (curr_cmt_id == next_cmt_id and
                    curr_line + 1 == next_line and
                    feature in curr_features and
                    feature in next_features):
                # same commit, contiguous lines, same feature: the current
                # block simply grows
                blk_end[feature] += 1
            else:
                # the block for this feature is finished
                feature_blks[feature].append(
                    codeBlock.codeBlock(
                        blk_start[feature], blk_end[feature],
                        cmtList[str(curr_cmt_id)].getAuthorPI().getID(),
                        cmtList[str(curr_cmt_id)].getCommitterPI().getID(),
                        curr_cmt_id))
                blk_start[feature] = next_line
                blk_end[feature] = next_line

    # boundary case: close the final block of every feature.  Use the commit
    # of the last line; for multi-line states this equals the next_cmt_id
    # the original loop left behind, and for a single-line state it avoids
    # the NameError the original code raised.
    last_cmt_id = file_state[str(lines[-1])]
    for feature in feature_blks:
        feature_blks[feature].append(
            codeBlock.codeBlock(
                blk_start[feature], blk_end[feature],
                cmtList[str(last_cmt_id)].getAuthorPI().getID(),
                cmtList[str(last_cmt_id)].getCommitterPI().getID(),
                last_cmt_id))

    return feature_blks
def lines_of_interest_features(file_state, snapshot_commit, cmt_list,
                               file_commit):
    """Find the regions of interest for analyzing the file.

    We want to look at localized regions around the commit of interest
    (snapshot_commit) and ignore code lines that belong to features the
    snapshot commit never touched, as well as lines from future commits.

    - Input -
    file_state:      code line numbers together with commit hashes
    snapshot_commit: the commit hash that marks when the file_state was
                     acquired
    cmt_list:        dictionary mapping commit hash -> commit object
    file_commit:     a fileCommit instance
    - Output -
    the file state after lines not of interest are removed
    """
    snapshot_cmt_date = cmt_list[snapshot_commit].getCdate()
    mod_file_state = {}
    snapshot_feature_set = set()

    # first pass over the file_state: collect the features of the lines the
    # snapshot commit contributed
    for line_num, cmt_id in file_state.items():
        if cmt_id == snapshot_commit:
            snapshot_feature_set.update(
                file_commit.findFeatureList(int(line_num)))

    # second pass: keep only lines committed in the past with respect to the
    # snapshot commit that fall under one of the features the snapshot
    # commit lines fall under
    for line_num, cmt_id in file_state.items():
        if cmt_id not in cmt_list:
            # must be an old commit that occurred in a prior release
            continue

        if cmt_list[cmt_id].getCdate() > snapshot_cmt_date:
            # forget the line because it was in a future commit
            continue

        if any(feature in snapshot_feature_set
               for feature in file_commit.findFeatureList(int(line_num))):
            mod_file_state[line_num] = cmt_id
        # else: ignore the line since it belongs to some feature outside of
        # the set of features we are interested in

    return mod_file_state
def compute_feature_proximity_links_perfile(fileCommitList, cmtList, id_mgr,
                                            link_type, startDate=None,
                                            speedUp=True):
    """Construct a network based on commit proximity information.

    Two contributors are linked when they make a commit that is in close
    proximity to each other (ie. same file AND nearby line numbers).
    Collaboration is quantified by a single metric indicating the strength
    of collaboration between two individuals.

    - Input -
    fileCommitList: dictionary of fileCommit instances
    cmtList:        dictionary mapping commit hash -> commit object
    id_mgr:         manager used to resolve person identities
    link_type:      the kind of collaboration link to record
    startDate:      if given, commits prior to this date are ignored
    speedUp:        if True, analyse each file once instead of once per
                    snapshot
    """
    for file_commit in fileCommitList.values():
        if speedUp:
            compute_snapshot_collaboration_features(file_commit, cmtList,
                                                    id_mgr, link_type,
                                                    startDate)
        else:
            # NOTE(review): this branch mirrors computeProximityLinks, but
            # the argument threading looks wrong for
            # compute_snapshot_collaboration_features, which expects
            # (file_commit, cmtList, id_mgr, link_type, startDate, random);
            # here a snapshot value and a one-element list are passed for
            # the first two parameters -- confirm the intended per-snapshot
            # API before relying on this path.
            for fileSnapShot in file_commit.getFileSnapShots().items():
                compute_snapshot_collaboration_features(
                    fileSnapShot[1], [fileSnapShot[0]], cmtList, id_mgr,
                    link_type, startDate)
def compute_feature_proximity_links(file_commit_list, cmt_list, id_mgr,
                                    link_type, start_date=None,
                                    speed_up=True):
    """Construct a network based on commit proximity information.

    Same as computeProximityLinks but for features instead of functions.
    Because features (unlike functions) are split across files, we define
    collaboration differently: two contributors are linked when they make a
    commit that is within the same feature.  Collaboration between two
    contributors is quantified by the number of lines on which they worked
    on the same feature.

    - Input -
    file_commit_list: dictionary of fileCommit instances
    cmt_list:         dictionary mapping commit hash -> commit object
    id_mgr:           manager used to resolve person identities
    link_type:        the kind of collaboration link to record
    start_date:       if given, commits prior to this date are ignored
    speed_up:         accepted for signature compatibility; currently unused
    """
    # First we calculate how many lines each contributor changed in each
    # feature: feature -> {person -> changed line count}
    author_feature_changes = {}

    for file_commit in file_commit_list.values():
        author = True
        file_state = file_commit.getFileSnapShot()
        rev_cmt_ids = file_commit.getrevCmts()
        rev_cmts = [cmt_list[rev_cmt_id] for rev_cmt_id in rev_cmt_ids]

        for cmt in rev_cmts:
            # the file state will be modified, so each iteration starts
            # from a copy of the original
            file_state_mod = file_state.copy()

            # if the commit is not in the current revision of the file
            # there is no need to process further
            if cmt.id not in file_state_mod.values():
                continue

            # restrict the file state to the lines localized around the
            # cmt.id hash.  (The original guarded this with
            # "if not random:", but no "random" parameter exists in this
            # function -- the name resolved to an unrelated global or
            # raised NameError, effectively breaking the filter.  Filter
            # unconditionally, matching the default behaviour of
            # compute_snapshot_collaboration_features.)
            file_state_mod = lines_of_interest_features(file_state_mod,
                                                        cmt.id, cmt_list,
                                                        file_commit)

            # remove commits that occur prior to the specified start_date
            if start_date is not None:
                file_state_mod = removePriorCommits(file_state_mod,
                                                    cmt_list, start_date)

            # collaboration is meaningless without more than one line of
            # code
            if len(file_state_mod) <= 1:
                continue

            # identify code line clustering using feature location
            # information: a 'feature -> codeBlock list' mapping
            feature_clusters = group_feature_lines(file_commit,
                                                   file_state_mod, cmt_list)
            for feature, code_blks in feature_clusters.items():
                author_changes = author_feature_changes.setdefault(feature,
                                                                   {})

                # all blocks contributed by the revision commit we are
                # looking at
                rev_cmt_blks = [blk for blk in code_blks
                                if blk.cmtHash == cmt.id]
                if not rev_cmt_blks:
                    # this feature holds no lines of the revision commit;
                    # the original code raised IndexError on [0] here
                    continue

                # the person responsible for this revision
                if author:
                    rev_person = id_mgr.getPI(rev_cmt_blks[0].authorId)
                else:
                    rev_person = id_mgr.getPI(rev_cmt_blks[0].committerId)

                author_changes[rev_person] = (
                    author_changes.get(rev_person, 0) +
                    computeBlksSize(rev_cmt_blks, []))

    # Now we calculate the collaboration strength between authors as
    # SUM over features of MIN(lines author1 changed on the feature,
    # lines author2 changed on the feature).
    for author_changes in author_feature_changes.values():
        for author1 in author_changes:
            for author2 in author_changes:
                if author1 is not author2:
                    strength = min(author_changes[author1],
                                   author_changes[author2])
                    # NOTE(review): 'cmt' leaks from the loops above, so
                    # the relation is attributed to whichever commit was
                    # processed last (kept as in the original) -- confirm
                    # which commit handle addSendRelation should receive.
                    author1.addSendRelation(link_type, author2.getID(),
                                            cmt, strength)
                    author2.addReceiveRelation(link_type, author1.getID(),
                                               strength)