This patch extends the fixing routine that re-parses the date header to
fix the parsed date. Instead of trying a single adapted pattern, the
routine now loops over all defined patterns and stops at the first one
that parses the date successfully.
The newly introduced pattern is "%a, %d %b %Y %H:%M" (omitted seconds),
e.g., "Wed, 21 Aug 2013 15:02 +0200".
Also fix trailing whitespace along the way.
Signed-off-by: Claus Hunsen <hunsen@xxxxxxxxxxxxxxxxx>
---
codeface/R/ml/analysis.r | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/codeface/R/ml/analysis.r b/codeface/R/ml/analysis.r
index 07bb897..fc1b629 100644
--- a/codeface/R/ml/analysis.r
+++ b/codeface/R/ml/analysis.r
@@ -356,10 +356,19 @@ check.corpus.precon <- function(corp.base) {
headers = meta(doc, tag = "header")
date.header = grep("^Date:", headers, value = TRUE, useBytes = TRUE)
- ## re-parse the header using adapted pattern
- ## TODO: are there other potential pattern?
- adapted.format = "%d %b %Y %H:%M:%S" # missing weekday; e.g., "Date: 20
Feb 2009 20:24:54 +0100"
- date.new = strptime(gsub("Date: ", "", date.header), format =
adapted.format, tz = "GMT")
+ ## try to re-parse the header using adapted patterns
+ date.formats = c(
+ "%d %b %Y %H:%M:%S", # missing weekday; e.g., "Date: 20 Feb 2009
20:24:54 +0100"
+ "%a, %d %b %Y %H:%M" #missing seconds; e.g. "Date: Wed, 21 Aug 2013
15:02 +0200"
+ )
+
+ for (date.format in date.formats) {
+ date.new = strptime(gsub("Date: ", "", date.header), format =
date.format, tz = "GMT")
+ # if the date has been parsed correctly, break the loop
+ if (!is.na(date.new)) {
+ break()
+ }
+ }
return(date.new)
}
@@ -826,7 +835,7 @@ store.mail <- function(conf, forest, corp, ml.id ) {
dat <- merge(dat, dates.df, by="ID")
dat$ID <- NULL
colnames(dat)[which(colnames(dat)=="threadID")] <- "threadId"
-
+
## Re-order columns to match the order as defined in the database to
## improve the stability
dat = dat[c("projectId", "threadId", "mlId", "author", "subject",
"creationDate")]
--
2.10.0