- The _get_feature_lines function calculates the feature sets in a similar way to how _getFunctionLines calculates the current functions for all source code lines. - Added some fields to codeface/fileCommit.py to save those results. - Added parse_feature_line function to parse a single line of cppstats output. - Added FileDict class to encapsulate information about the features of a source code file. - Added get_feature_lines function to generate a FileDict instance from the parsed lines of cppstats output. - We expect cppstats to be in the PATH and check that the executable works before starting the analysis. - Update documentation. Signed-off-by: Matthias Dittrich <matthi.d@xxxxxxxxx> Reviewed-by: Wolfgang Mauerer <wolfgang.mauerer@xxxxxxxxxxx> --- README.md | 25 ++++++ codeface/VCS.py | 230 +++++++++++++++++++++++++++++++++++++++++++++++++ codeface/fileCommit.py | 70 +++++++++++++++ codeface/project.py | 24 +++++- codeface/util.py | 22 +++++ 5 files changed, 367 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 277a67a..eced236 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,31 @@ in the step "Database Setup", and modify codeface.conf accordingly. # Devel packages required for python packages sudo apt-get install libyaml-dev +* When using the feature or feature_file analysis you need to have a working + "cppstats" in your path. + One way to get it is: + + cd ~ + git clone https://github.com/clhunsen/cppstats.git + + Now create a file like ~/scripts/cppstats and add ~/scripts to your PATH. + It should have something along the lines of: + + #!/bin/bash + + cd ~/cppstats + PYTHONPATH="~/cppstats/lib" ~/cppstats/cppstats.py "$@" + + Note that the script has to be executable: + + chmod +x ~/scripts/cppstats + + and then add ~/scripts to your PATH. + (if it doesn't work, replace ~ with the full path (/home/$user); ~ is not expanded inside double quotes). 
+ + You can test this script by running "~/scripts/cppstats --help" and + validate that you get an help message + ## Preparing the R installation * Run `sudo R CMD javareconf`; make sure that the tool reports success in diff --git a/codeface/VCS.py b/codeface/VCS.py index c1060f2..079be83 100644 --- a/codeface/VCS.py +++ b/codeface/VCS.py @@ -34,15 +34,19 @@ # VCS-specific. # TODO: Unify range handling. Either a range is always a list, or always # represented by two parameters. +import itertools +import readline import commit import fileCommit import re import os +import bisect import ctags import tempfile import source_analysis import shutil +from fileCommit import FileDict from progressbar import ProgressBar, Percentage, Bar, ETA from ctags import CTags, TagEntry from logging import getLogger; log = getLogger(__name__) @@ -182,6 +186,181 @@ class VCS: return subsys=="__main__" or subsys in self.subsys_description.keys() +def parse_sep_line(line): + if not line.startswith("\"sep="): + raise ParseError( + ("expected that the csv file header starts with '\"sep=' " + "but it started with '{}'") + .format(line), 'CSVFile') + stripped = line.rstrip() + if not stripped.endswith("\""): + raise ParseError( + ("expected that the csv file header ends with '\"' " + "but the line was '{}'") + .format(line), 'CSVFile') + return stripped[5:-1] + + +def parse_line(sep, line): + """ + Parses a line from a csv file + :param sep: + :param line: + :return: + """ + # TODO: Handle escaping: sep is escaped with quotes, quotes are escaped with quotes + # 'test,test' will be '"test,test"' in the csv file + # 'test"this,"test' will be '"test""this,""test"' in the csv file + return [l.strip() for l in line.split(sep)] + + +class LineType: + IF = "#if" + ELSE = "#else" + ELIF = "#elif" + + +def parse_feature_line(sep, line): + """ + parse the current line which is something like: + FILENAME,LINE_START,LINE_END,TYPE,EXPRESSION,CONSTANTS + :param line: the line to parse + :return: start_line, 
end_line, line_type, feature_list + """ + parsed_line = parse_line(sep, line) + # FILENAME,LINE_START,LINE_END,TYPE,EXPRESSION,CONSTANTS + try: + start_line = int(parsed_line[1]) + end_line = int(parsed_line[2]) + line_type_raw = parsed_line[3] + if line_type_raw not in (LineType.IF, LineType.ELSE, LineType.ELIF): + raise ParseError( + ("could not parse feature line (because we could" + "not parse the line_type): \"{}\"") + .format(line), 'CSVFile') + line_type = line_type_raw + feature_list = parsed_line[5].split(';') + return start_line, end_line, line_type, feature_list + except ValueError: + raise ParseError( + ("could not parse feature line (most likely because we " + "could not parse the start- or end-line which should " + "be on index 2 and 3): \"{}\"") + .format(line), 'CSVFile') + + +def get_feature_lines(parsed_lines, filename): + """ + calculates an dictionary representing the feature sets for any line + of the given file. + :param parsed_lines: a list of tuples with + (start_line, end_line, line_type, feature_list) elements + :param filename: the name or the analysed files + (only used for descriptive error messages if the calculation fails) + :return: + feature_lines: a FileDict object to access the feature sets on any line + """ + # mapping line -> feature list, we only add changing elements + feature_lines = FileDict() + feature_lines.add_line(0, []) + + # we want a format like (is_start, features) for every line with an + # #ifdef (ie. line that changes the feature set) + annotated_lines = {} + + def check_line(line): + if line in annotated_lines: + raise ParseError( + ("every line index can be used at most once " + "(problematic line was {0} in file {1})") + .format(line, filename), filename) + + # We now transform the cppstats output in another output which will + # help to implement the algorithm below in a simple and fast way. + # The old format is a list of + # (start_line, end_line, line_type, feature_list) tuples for every + # #ifdef/#else. 
+ # The new format is a list of (is_start, feature_set) + # for every #ifdef(/#else)/#endif + # We try to ignore #else wherever possible or handle + # the #else like a nested #if. + for start_line, end_line, line_type, feature_list in parsed_lines: + if start_line >= end_line: + raise ParseError( + ("start_line can't be greater or equal to end_line " + "(problematic line was {0} in file {1})") + .format(start_line, filename), filename) + + if line_type == LineType.IF: + # ifs start on their own line, however the end_line could + # already be used by the start of an else/elif + # (#else is the end of the previous #if + # and the start of another '#if') + check_line(start_line) + if end_line in annotated_lines: + # in that case we just say the #else line belongs to the + # virtual starting '#if' + end_line -= 1 + # Now end_line should be unused + check_line(end_line) + annotated_lines[start_line] = (True, feature_list) + annotated_lines[end_line] = (False, feature_list) + else: + # we try to mostly ignore else and elif if the feature_ + # list doesn't change + is_start, old_feature_list = annotated_lines[start_line] + if (not is_start) and old_feature_list == feature_list: + # We are on an ELSE, however the feature list did not + # change so we just delete the current line and move the + # list to the new end + del annotated_lines[start_line] + annotated_lines[end_line] = is_start, old_feature_list + elif is_start: + raise ParseError( + ("line {0} appeared twice as start line " + "(problematic file was {1})") + .format(start_line, filename), filename) + else: + # So we have a elif with different features, + # so we start more features now end add them to the ending + # later + # (so we handle this as if there was a new #ifdef started) + del annotated_lines[start_line] + annotated_lines[start_line] = (True, feature_list) + annotated_lines[end_line] = \ + (False, old_feature_list + feature_list) + + # Now that we have calculated the annotated_lines we just calculate 
the + # feature sets on those lines and save them in a FileDict instance. + # We can always access the last feature_list with the FileDict + # (because we sorted the lines) + for line in sorted(annotated_lines): + is_start, features = annotated_lines[line] + # Get last info + last_feature_list = feature_lines.get_line_info_raw(line) + # Copy last list and create new list for current line + new_feature_list = list(last_feature_list) + if is_start: + # if the current line starts a new list of features, + # we just need to add those to + # the new list (note that order matters in this case). + for r in features: + new_feature_list.insert(0, r) + else: + # if the current line ends a list of features, + # we remove them from the list + # (reverse order as adding). + for r in reversed(features): + item = new_feature_list.pop(0) + assert(item == r) + # Remove in next line + # (because we want to count the current #endif line as well). + line += 1 + + feature_lines.add_line(line, new_feature_list) + return feature_lines + + class gitVCS (VCS): def __init__(self): VCS.__init__(self) # Python OOP braindamage @@ -1067,6 +1246,57 @@ class gitVCS (VCS): src_line_rmv = re.sub(rmv_char, ' ', src_line.strip()) file_commit.addFuncImplLine(line_num, src_line_rmv) + @staticmethod + def _get_feature_lines(file_layout_src, file_commit): + """ + similar to _getFunctionLines but computes the line numbers of each + feature in the file. + """ + ''' + - Input - + file_layout_src: + dictionary with 'key=line number' and 'value=line of code' + file_commit: fileCommit instance where the results will be stored + + - Description - + The file_layout is used to construct a source code file that can be + parsed by cppstats to generate a cppstats csv file. 
+ The cppstats csv file is then accessed to extract the feature sets + and line numbers to be saved in the fileCommit object + ''' + + # grab the file extension to determine the language of the file + fileExt = os.path.splitext(file_commit.filename)[1] + + # temporary file where we write transient data needed for ctags + srcFile = tempfile.NamedTemporaryFile(suffix=fileExt) + featurefile = tempfile.NamedTemporaryFile(suffix=".csv") + # generate a source code file from the file_layout_src dictionary + # and save it to a temporary location + for line in file_layout_src: + srcFile.write(line) + srcFile.flush() + + # run cppstats analysis on the file to get the feature locations + cmd = "/usr/bin/env cppstats --kind featurelocations --file {0} {1}"\ + .format(srcFile.name, featurefile.name).split() + output = execute_command(cmd).splitlines() + + results_file = open(featurefile.name, 'r') + sep = parse_sep_line(next(results_file)) + headlines = parse_line(sep, next(results_file)) + feature_lines = \ + get_feature_lines( + [parse_feature_line(sep, line) for line in results_file], + file_commit.filename) + + # clean up temporary files + srcFile.close() + featurefile.close() + + # save result to the file commit instance + file_commit.set_feature_infos(feature_lines) + def cmtHash2CmtObj(self, cmtHash): ''' input: cmtHash diff --git a/codeface/fileCommit.py b/codeface/fileCommit.py index aa99e84..6474669 100644 --- a/codeface/fileCommit.py +++ b/codeface/fileCommit.py @@ -23,6 +23,67 @@ single file.''' import commit import bisect + +class FileDict: + """ + A generic dictionary for saving per-line information. + We assume that this information is available on any line, + and that the information only changes on some lines. + So we only save the information on lines that change that info + and use bisect to retrieve that information (for any line). 
+ """ + def __init__(self, line_list, line_dict): + """ + :rtype : FileDict + """ + self.line_list = line_list + self.line_dict = line_dict + self.lastItem = line_list[-1] + + def __init__(self): + """ + :rtype : FileDict + """ + self.line_list = [] + self.line_dict = {} + self.lastItem = -1 + + def __iter__(self): + return self.line_dict.__iter__() + + def get_line_info_raw(self, line_nr): + """ + Returns the info for the given line + (if the line was never set, the info for the last set line + is returned) + :param line_nr: the line to retrieve the information for. + :return: the information for the given line. + """ + i = bisect.bisect_right(self.line_list, line_nr) + info_line = self.line_list[i-1] + return self.line_dict[info_line] + + def get_line_info(self, line_nr): + return set(self.get_line_info_raw(line_nr)) + + def add_line(self, line_nr, info): + """ + Add the given information to the current dictionary. + Note: while filling the dictionary the line_nr argument has to + be incremented (this is only to make sure the caller + gets the intended behavior)! 
+ :param line_nr: the line number of the information + :param info: the information for the current line + """ + if line_nr < self.lastItem: + raise ValueError("can only incrementally add items") + self.line_list.append(line_nr) + self.line_dict[line_nr] = info + + def values(self): + return self.line_dict.values() + + class FileCommit: def __init__(self): @@ -57,6 +118,9 @@ class FileCommit: # meta data self._src_elem_list = [] + # dictionary with key = line number, value = feature list + self.feature_info = FileDict() + #Getter/Setters def getFileSnapShots(self): return self.fileSnapShots @@ -84,6 +148,9 @@ class FileCommit: def setSrcElems(self, src_elem_list): self._src_elem_list.extend(src_elem_list) + def set_feature_infos(self, feature_line_infos): + self.feature_info = feature_line_infos + #Methods def addFileSnapShot(self, key, dict): self.fileSnapShots[key] = dict @@ -116,3 +183,6 @@ class FileCommit: def addFuncImplLine(self, lineNum, srcLine): id = self.findFuncId(lineNum) self.functionImpl[id].append(srcLine) + + def findFeatureList(self, lineNum): + return self.feature_info.get_line_info(int(lineNum)) diff --git a/codeface/project.py b/codeface/project.py index a311fb7..1d8c0b6 100644 --- a/codeface/project.py +++ b/codeface/project.py @@ -23,7 +23,7 @@ from .configuration import Configuration from .cluster.cluster import doProjectAnalysis from .ts import dispatch_ts_analysis from .util import (execute_command, generate_reports, layout_graph, - check4ctags, BatchJobPool, generate_analysis_windows) + check4ctags, check4cppstats, BatchJobPool, generate_analysis_windows) def loginfo(msg): ''' Pickleable function for multiprocessing ''' @@ -54,7 +54,20 @@ def project_analyse(resdir, gitdir, codeface_conf, project_conf, no_report, loglevel, logfile, recreate, profile_r, n_jobs): pool = BatchJobPool(int(n_jobs)) conf = Configuration.load(codeface_conf, project_conf) - project, tagging = conf["project"], conf["tagging"] + tagging = conf["tagging"] + if 
collab_type is not "default": + # as collab_type is ignored on some tagging values we should either + # => throw an exception to tell the user he specified something weird + # => set tagging to something valid + if tagging is not "proximity": + log.warn("tagging value is overwritten to proximity because of --collaboration") + tagging = "proximity" + conf["tagging"] = tagging + else: + # default is function + collab_type = "function" + + project = conf["project"] repo = pathjoin(gitdir, conf["repo"], ".git") project_resdir = pathjoin(resdir, project, tagging) range_by_date = False @@ -67,8 +80,11 @@ def project_analyse(resdir, gitdir, codeface_conf, project_conf, range_by_date = True # TODO: Sanity checks (ensure that git repo dir exists) - if 'proximity' == conf["tagging"]: - check4ctags() + if 'proximity' == tagging: + if collab_type is 'function': + check4ctags() + else: + check4cppstats() project_id, dbm, all_range_ids = project_setup(conf, recreate) diff --git a/codeface/util.py b/codeface/util.py index 7b8e4ab..9de4b5b 100644 --- a/codeface/util.py +++ b/codeface/util.py @@ -381,6 +381,28 @@ def check4ctags(): log.error("Ctags version '{0}' not found".format(prog_version)) raise Exception("Incompatible ctags-exuberant version") + +def check4cppstats(): + """ + check if the appropriate cppstats is installed on the system. + """ + # We can not check the version directly as there is no version switch on cppstats + # We just check if the first two lines of --help are OK. 
+ line_1 = "usage: cppstats.py [-h] [--kind <K> | -a] [--list [LIST] | --file IN OUT]" + line_2 = " [--nobak] [--stf] [-l] [-v] [--check CHECK] [--dall]" + cmd = "/usr/bin/env cppstats --help".split() + res = execute_command(cmd).splitlines() + if not (res[0].startswith(line_1)): + error_message = "expected '{0}' in the first line but got '{1}'".format(line_1, res[0]) + log.error("program cppstats does not exist, or it is not working as expected ({0}".format(error_message)) + raise Exception("cppstats not found ({0})".format(error_message)) + + if not (res[1].startswith(line_2)): + error_message = "expected '{0}' in the second line but got '{1}'".format(line_2, res[1]) + log.error("program cppstats does not exist, or it is not working as expected ({0}".format(error_message)) + raise Exception("cppstats not found ({0})".format(error_message)) + + def generate_analysis_windows(repo, window_size_months): ''' Generates a list of revisions (commit hash) in increments of the window_size -- 1.8.5.5