Am 17/10/2014 15:14, schrieb Matthias Dittrich: > Added some clustering functions similar to the function analysis functions, > but for clustering features. can you please describe the change in more detail? Other users should get the general idea from the commit description; the details are in the code, then. > > Signed-off-by: Matthias Dittrich <matthi.d@xxxxxxxxx> > --- > codeface/cluster/cluster.py | 284 > ++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 284 insertions(+) > > diff --git a/codeface/cluster/cluster.py b/codeface/cluster/cluster.py > index ba4c970..45cfb59 100755 > --- a/codeface/cluster/cluster.py > +++ b/codeface/cluster/cluster.py > @@ -197,6 +197,64 @@ def computeSnapshotCollaboration(file_commit, cmtList, > id_mgr, link_type, > maxDist, author) for cluster in > clusters if cluster] > > > +def compute_snapshot_collaboration_features(file_commit, cmtList, id_mgr, > link_type, > + startDate=None, random=False): > + """Generates the collaboration data from a file snapshot at a particular > + point in time""" > + > + ''' > + Detailed description: the fileSnapShot is a representation of how a file > + looked at the time of a particular commit. The fileSnapshot is a > + dictionary with key = a particular commit hash and the value is the how > + the file looked at the time of that commit.How the file looked is commit.<SPACE>How > + represented by a another dictionary with key = a code line number and the > + value is a commit hash referencing the commit that contributed that > + particular line. The commit hashes are then used to reference the people > + involved. that's a bit hard to understand, in particular the statement "The fileSnapshot (...) is how the file looked at the time of that commit.How the file looked is represented by a another dictionary" Are we talking about two different states of the file (represented by two different commits), or is the representation split across two data structures? 
> + ''' > + > + #------------------------ > + #variable declarations > + #------------------------ > + maxDist = 25 > + author = True > + fileState = file_commit.getFileSnapShot() > + revCmtIds = file_commit.getrevCmts() > + revCmts = [cmtList[revCmtId] for revCmtId in revCmtIds] > + > + for cmt in revCmts: > + # the fileState will be modified but for each loop we should start > with > + # the original fileState > + fileState_mod = fileState.copy() > + > + # check if commit is in the current revision of the file, if it is > not > + # we no longer have a need to process further since the commit is now > + # irrelevant > + if not (cmt.id in fileState_mod.values()): > + continue > + > + #find code lines of interest, these are the lines that are localized > + #around the cmt.id hash, modify the fileState to include only the > + #lines of interest > + if (not (random)): > + fileState_mod = lines_of_interest_features(fileState_mod, > cmt.id, cmtList, file_commit) > + > + #remove commits that occur prior to the specified startDate > + if startDate != None: > + fileState_mod = removePriorCommits(fileState_mod, cmtList, > startDate) > + > + #collaboration is meaningless without more than one line why can people not collaborate on a single line? 
> + #of code > + if len(fileState_mod) > 1: > + # identify code line clustering using function location > information > + feature_clusters = group_feature_lines(file_commit, > fileState_mod, cmtList) > + for feature in feature_clusters: > + feature_cluster = feature_clusters[feature] > + if feature_cluster: > + #calculate the collaboration coefficient for each code > block > + computeCommitCollaboration(feature_cluster, cmt, id_mgr, > link_type, maxDist, author) > + > + > def groupFuncLines(file_commit, file_state, cmtList): > ''' > cluster code lines that fall under the same function > @@ -242,6 +300,69 @@ def groupFuncLines(file_commit, file_state, cmtList): > > return func_blks > > + > +def group_feature_lines(file_commit, file_state, cmtList): > + """ > + cluster code lines that fall under the same feature > + """ > + #feature_indx = {} > + #indx = 0 please remove code that's been commented out from the commit. > + feature_blks = {} > + lines = sorted(map(int, file_state.keys())) > + blk_start = {} > + blk_end = {} > + > + for features in file_commit.feature_info.values(): > + for feature in features: > + #feature_indx[feature] = indx ditto > + blk_start[feature] = lines[0] > + blk_end[feature] = lines[0] > + feature_blks[feature] = [] > + > + #if not indx in feature_blks: > + # feature_blks[indx] = [] > + #if not feature in feature_indx: > + # feature_indx[feature] = indx > + # blk_start[indx] = lines[0] > + # blk_end[indx] = lines[0] > + # feature_blks[indx] = [] > + # indx += 1 ditto > + > + for i in range(0, len(file_state) - 1): > + curr_line = lines[i] > + next_line = lines[i + 1] > + curr_cmt_id = file_state[str(curr_line)] > + next_cmt_id = file_state[str(next_line)] > + curr_features = file_commit.findFeatureList(curr_line) > + next_features = file_commit.findFeatureList(next_line) > + > + for feature in feature_blks: > + if (curr_cmt_id == next_cmt_id) and (curr_line + 1 == next_line) > and \ > + (feature in curr_features) and (feature in > 
next_features): > + # nothing changed for this feature > + blk_end[feature] += 1 > + else: > + # block for this feature finished > + feature_blks[feature]. \ > + append(codeBlock.codeBlock(blk_start[feature], > blk_end[feature], > + > cmtList[str(curr_cmt_id)].getAuthorPI().getID(), > + > cmtList[str(curr_cmt_id)].getCommitterPI().getID(), > + curr_cmt_id)) > + blk_start[feature] = next_line > + blk_end[feature] = next_line > + > + # boundary case could you please explain the boundary case? > + for feature in feature_blks: > + feature_blks[feature].append( > + codeBlock.codeBlock( > + blk_start[feature], blk_end[feature], > + cmtList[str(next_cmt_id)].getAuthorPI().getID(), > + cmtList[str(next_cmt_id)].getCommitterPI().getID(), > + next_cmt_id)) > + > + return feature_blks > + > + > def randomizeCommitCollaboration(codeBlks, fileState): > ''' > randomizes the location in the file where commits were made > @@ -708,6 +829,64 @@ def linesOfInterest(fileState, snapShotCommit, maxDist, > cmtlist, file_commit): > return modFileState > > > +def lines_of_interest_features(file_state, snapshot_commit, cmt_list, > file_commit): > + """ > + Finds the regions of interest for analyzing the file. > + We want to look at localized regions around the commit of > + interest (snapShotCommit) and ignore code lines that are > + located some far distance away. 
> + > + - Input - > + fileState: code line numbers together with commit hashes > + snapShotCommit: the commit hash that marks when the fileState was > acquired > + maxDist: indicates how large the area of interest should be > + file_commit: a fileCommit instance > + - Output - > + mod_filestate: the file state after line not of interest are removed > + """ > + #variable declarations > + snapshot_cmt_date = cmt_list[snapshot_commit].getCdate() > + mod_file_state = {} > + snapshot_feature_set = set() > + > + #take a pass over the fileState to identify where the snapShotCommit > + #made contributions to the fileState > + snapshot_cmt_lines = [] > + for lineNum in file_state.keys(): > + cmt_id = file_state[lineNum] > + > + if cmt_id == snapshot_commit: > + snapshot_cmt_lines.append(lineNum) > + # retrieve the function id that each line falls into > + > snapshot_feature_set.update(file_commit.findFeatureList(int(lineNum))) > + #end for line > + > + # remove lines that are from commits that occur after the snapShotCmt > + for lineNum, cmt_id in file_state.items(): > + if cmt_id in cmt_list: > + cmt_date = cmt_list[cmt_id].getCdate() > + else: > + #must be a old commit that occurred in a prior release > + continue > + > + # check to keep lines committed in the past with respect to the > current > + # snapshot commit > + if cmt_date <= snapshot_cmt_date: > + # check if the line will fall under one of the functions that the > + # snapshot commit lines fall under (ie. 
we only want to keep > lines > + # that are in the same functions as the snapshot commit > + > + if any(com in snapshot_feature_set for com in > file_commit.findFeatureList(int(lineNum))): > + mod_file_state[lineNum] = file_state[lineNum] > + > + # else: ignore line since it belongs to some function > outside of > + # the set of functions we are interested in > + > + #else: forget line because it was in a future commit > + > + return mod_file_state > + > + > def blockDist(blk1, blk2): > ''' > Finds the euclidean distance between two code blocks. > @@ -1181,6 +1360,111 @@ def computeProximityLinks(fileCommitList, cmtList, > id_mgr, link_type, \ > for fileSnapShot > in fileCommit.getFileSnapShots().items()] > > +def compute_feature_proximity_links_perfile(fileCommitList, cmtList, id_mgr, > link_type, \ > + startDate=None, speedUp=True): > + ''' > + Constructs network based on commit proximity information > + ''' > + > + ''' > + Two contributors are linked when they make a commit that is in > + close proximity to each other (ie. same file AND nearby line numbers). > + Collaboration is quantified by a single metric indicating the > + strength of collaboration between two individuals. > + ''' > + for file_commit in fileCommitList.values(): > + if speedUp: > + compute_snapshot_collaboration_features(file_commit, cmtList, > id_mgr, link_type, startDate) > + else: > + [compute_snapshot_collaboration_features( > + fileSnapShot[1], [fileSnapShot[0]], cmtList, id_mgr, > link_type, startDate) > + for fileSnapShot in file_commit.getFileSnapShots().items()] > + > +def compute_feature_proximity_links(file_commit_list, cmt_list, id_mgr, > link_type, \ > + start_date=None, speed_up=True): > + """ > + Constructs network based on commit proximity information, same as > computeProximityLinks but for features > + instead of functions. 
> + """ > + > + ''' > + Because features (unlike functions) are split across files, we define > collaboration differently: > + > + Two contributors are linked when they make a commit that is within the > same feature. > + Collaboration between to contributors is quantified by the number of > lines they worked on the same feature. > + TODO! what exactly remains to do? Or is this an old stale comment? > + ''' > + > + # First we calculate how many lines each contributor changed in each > feature > + author_feature_changes = {} > + > + for file_commit in file_commit_list.values(): > + author = True > + file_state = file_commit.getFileSnapShot() > + revCmtIds = file_commit.getrevCmts() > + revCmts = [cmt_list[revCmtId] for revCmtId in revCmtIds] > + > + for cmt in revCmts: > + # the fileState will be modified but for each loop we should > start with > + # the original fileState > + file_state_mod = file_state.copy() > + > + # check if commit is in the current revision of the file, if it > is not > + # we no longer have a need to process further since the commit > is now > + # irrelevant > + if not (cmt.id in file_state_mod.values()): > + continue > + > + #find code lines of interest, these are the lines that are > localized > + #around the cmt.id hash, modify the fileState to include only the > + #lines of interest > + if not random: > + file_state_mod = lines_of_interest_features(file_state_mod, > cmt.id, cmt_list, file_commit) > + > + #remove commits that occur prior to the specified startDate > + if start_date is not None: > + file_state_mod = removePriorCommits(file_state_mod, > cmt_list, start_date) > + > + #collaboration is meaningless without more than one line > + #of code > + if len(file_state_mod) > 1: > + # identify code line clustering using feature location > information > + feature_clusters = group_feature_lines(file_commit, > file_state_mod, cmt_list) > + # We now have a 'feature -> codeblock list' mapping > + for feature in feature_clusters: > + if 
feature not in author_feature_changes: > + author_feature_changes[feature] = {} > + author_changes = author_feature_changes[feature] > + > + codeBlks = feature_clusters[feature] > + > + #get all blocks contributed by the revision commit we > are looking at > + revCmtBlks = [blk for blk in codeBlks if blk.cmtHash == > cmt.id] > + > + #get the person responsible for this revision > + if author: > + revPerson = id_mgr.getPI( revCmtBlks[0].authorId ) > + else: > + revPerson = id_mgr.getPI( revCmtBlks[0].committerId ) > + > + if revPerson not in author_changes: > + author_changes[revPerson] = 0 > + author_changes[revPerson] += computeBlksSize(revCmtBlks, > []) > + > + # Now we calculate the collaboration strength between authors as > + # (SUM(MIN(line-changes of author1 on feature, line-changes of author2 > on feature) FOR feature IN features)) > + for feature in author_feature_changes: > + author_changes = author_feature_changes[feature] > + for author1 in author_changes: > + for author2 in author_changes: > + if author1 is not author2: > + strength = min(author_changes[author1], > author_changes[author2]) > + author1.addSendRelation(link_type, author2.getID(), cmt, > strength) > + author2.addReceiveRelation(link_type, author1.getID(), > strength) > + #del author_feature_changes[author1] > + > + #raise Exception("feature proximity links is not implemented!") > + > > def computeCommitterAuthorLinks(cmtlist, id_mgr): > ''' > this is a fairly large change -- it would be quite helpful to have some overview in the commit message to understand what you intend to do. Best regards, Wolfgang