From: Thomas Bock <bockthom@xxxxxxxxxxxxxxxxx>
In the mailing-list analysis, persons with the name 'NULL' could end up
in the database due to obscure patterns of names and mail addresses.
This patch fixes the parsing of two patterns:
1) only a mail address in angle brackets is provided: as the parsing result,
the local part of the mail address (with dots replaced by spaces) is used
as the author name.
(e.g., "From: <hans.huber@xxxxxxxxxxxxx>")
2) a name looks like a mail address: as the parsing result, only the local
part of that address is used as the name, to avoid parsing problems in
further steps of the analysis and the ID service.
(e.g., "From: 'hans.huber@xxxxxxxxxxxxx' <hans.huber@xxxxxxxxxxxxx>")
[CH: Rephrase commit message and small fixes.]
Signed-off-by: Thomas Bock <bockthom@xxxxxxxxxxxxxxxxx>
Reviewed-by: Claus Hunsen <hunsen@xxxxxxxxxxxxxxxxx>
Reviewed-by: Wolfgang Mauerer <wolfgang.mauerer@xxxxxxxxxxxxxxxxx>
---
codeface/R/ml/analysis.r | 43 +++++++++++++++++++++++++++++++++++++------
1 file changed, 37 insertions(+), 6 deletions(-)
diff --git a/codeface/R/ml/analysis.r b/codeface/R/ml/analysis.r
index 3218864..f24d207 100644
--- a/codeface/R/ml/analysis.r
+++ b/codeface/R/ml/analysis.r
@@ -287,7 +287,7 @@ check.corpus.precon <- function(corp.base) {
## In some cases only an email is provided
if (name=="") {
- name <- gsub("\\.", " ",gsub("@.*", "", email))
+ name <- gsub("\\.", " ", gsub("@.*", "", email))
}
author <- paste(name, ' <', email, '>', sep="")
@@ -298,15 +298,46 @@ check.corpus.precon <- function(corp.base) {
## Get email and name parts
r <- regexpr("<.+>", author, TRUE)
- if(r[[1]] == 1) {
- email <- substr(author, r, r + attr(r,"match.length")-1)
- name <- sub(email, "", author, fixed=TRUE)
- name <- fix.name(name)
+ ## email is at start
+ if(r == 1) {
+ ## Check if only an email is provided
+ if(attr(r, "match.length") == nchar(author)) {
+ ## Only an email like "<hans.huber@xxxxxxxxxxxxx>" is provided
+ email <- substr(author, r + 1, r + nchar(author) - 2)
+ name <- gsub("\\.", " ", gsub("@.*", "", email))
+ } else {
+ ## email and name both are provided
+ email <- substr(author, r, r + attr(r, "match.length") - 1)
+ name <- sub(email, "", author, fixed=TRUE)
+ name <- fix.name(name)
+ }
+
email <- str_trim(email)
- author <- paste(name,email)
+ author <- paste(name, ' <', email, '>', sep="")
+ }
+ }
+
+ ## Check if name looks like an email address (i.e., there is more than
+ ## one @ symbol in the author string): Since that causes parsing problems
+ ## in further steps of the analysis and the ID service, we use only the
+ ## local part of an email address as name.
+ ## E.g., "'hans.huber@xxxxxxxxxxxxx' <hans.huber@xxxxxxxxxxxxx>"
+ if (length(gregexpr(pattern = "@", author, fixed = TRUE)[[1]]) > 1) {
+ ## Get email and name parts first
+ r <- regexpr("<.+>", author, TRUE)
+ email <- substr(author, r, r + attr(r, "match.length") - 1)
+ name <- sub(email, "", author, fixed=TRUE)
+ name <- fix.name(name)
+
+ if(regexpr("\\S+@\\S+", name, TRUE) == 1) {
+ ## Name looks like an email address. Use only local part as name.
+ name <- gsub("\\.", " ", gsub("@.*", "", name))
}
+
+ author <- paste(name, email)
}
+ ## return new author string
return(author)
}
--
2.10.2