Due to a pattern occurring in real-world mbox files (e.g., QEMU from
GMane), the 'Date' header could not be parsed when it was preceded by
"DATE:". The case-sensitive grep commands did not find those headers.
Consequently, the 'Date' header identification is now case-
insensitive.
Furthermore, if the 'Date' header is missing in a message, parsing for
this message is aborted early.
Signed-off-by: Claus Hunsen <hunsen@xxxxxxxxxxxxxxxxx>
---
codeface/R/ml/analysis.r | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/codeface/R/ml/analysis.r b/codeface/R/ml/analysis.r
index 6fa2b69..205f344 100644
--- a/codeface/R/ml/analysis.r
+++ b/codeface/R/ml/analysis.r
@@ -348,7 +348,14 @@ check.corpus.precon <- function(corp.base) {
## get the date header as inside the mbox file
headers = meta(doc, tag = "header")
- date.header = grep("^Date:", headers, value = TRUE, useBytes = TRUE)
+ date.header = grep("^Date: ", headers, value = TRUE, useBytes = TRUE,
ignore.case = TRUE)
+ date.header.plain = gsub("^Date: ", "", date.header, ignore.case = TRUE)
+
+ ## break early if 'Date' header is missing
+ if (length(date.header.plain) == 0) {
+ logwarn(paste("Mail is missing header 'Date':", meta(doc, tag = "id")))
+ return(NA)
+ }
## patterns without time-zone pattern
date.formats.without.tz = c(
@@ -365,7 +372,8 @@ check.corpus.precon <- function(corp.base) {
## try to re-parse the header using adapted patterns:
## parse date until any match with a pattern is found (date.new is not NA)
for (date.format in date.formats) {
- date.new = strptime(gsub("Date: ", "", date.header), format =
date.format, tz = "GMT")
+ date.new = strptime(date.header.plain, format = date.format, tz = "GMT")
+
# if the date has been parsed correctly, break the loop
if (!is.na(date.new)) {
break()
--
2.10.0