[codeface] [PATCH] Fix case where email "From" field has atypical form

  • From: Mitchell Joblin <mitchell.joblin.ext@xxxxxxxxxxx>
  • To: codeface@xxxxxxxxxxxxx
  • Date: Thu, 12 Nov 2015 16:20:28 +0100

- Move the old author fix routine to the central place where
all other fix ups are performed

- Compose an email address from the pattern-matched name because in this
case the email field is typically generic for the entire mailing list

Signed-off-by: Mitchell Joblin <mitchell.joblin.ext@xxxxxxxxxxx>
---
codeface/R/ml/analysis.r | 13 +++++++++++++
codeface/R/ml/ml_utils.r | 12 ------------
2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/codeface/R/ml/analysis.r b/codeface/R/ml/analysis.r
index 36dfda6..68577f5 100644
--- a/codeface/R/ml/analysis.r
+++ b/codeface/R/ml/analysis.r
@@ -228,6 +228,19 @@ check.corpus.precon <- function(corp.base) {
## Trim trailing and leading whitespace
author <- str_trim(author)

+ ## Handle case where author is like
+ ## Adrian Prantl via llvm-dev <llvm-dev@xxxxxxxxxxxxxx>
+ pattern.1 <- " via [[:print:]]*"
+ if (grepl(pattern.1,author, TRUE)) {
+ ## Extract name and replace email part
+ name <- gsub(pattern.1, author, replacement="")
+
+ ## Generate fictitious email from name part
+ email <- paste("<", gsub(" ", "." ,name), "@unknown.tld>", sep="")
+
+ author <- paste(name, email)
+ }
+
## Check if email exists
email.exists <- grepl("<.+>", author, TRUE)

diff --git a/codeface/R/ml/ml_utils.r b/codeface/R/ml/ml_utils.r
index 963cd2d..4d18be4 100644
--- a/codeface/R/ml/ml_utils.r
+++ b/codeface/R/ml/ml_utils.r
@@ -171,21 +171,9 @@ fixup.network <- function(.net) {
return(.net[idx, idx])
}

-## Fix some common problems that appear in mailing list author
-## specifications.
-fixup.authors <- function(authors) {
- authors <- gsub(pattern=" via [[:print:]]*>?|\\]?", x=authors,
- replacement="")
-
- authors[authors==""] <- NA
- return(authors)
-}
-
## Author name normalisation. Replace the name/email pairs found in
## the messages with a decomposed name and a unique in-database ID
do.normalise <- function(conf, authors) {
- authors <- fixup.authors(authors)
-
authorIDs <- sapply(authors, function(namestr) {
if (is.na(namestr)) {
return(NA)
--
2.1.4


Other related posts: