dbWriteTable cannot handle tab characters properly, so they need to
be removed from mail subjects before the mails are written to the database.
Otherwise, columns in the database get displaced.
This cleanup step has been performed already for sub-sequence analysis,
but not for the global ML analysis yet. For better memory efficiency,
the handling of subjects in the ML analysis is now performed in the
'check.corpus.precon' method, i.e., as early as possible. The new
implementation also leaves room for further steps to perform.
Signed-off-by: Claus Hunsen <hunsen@xxxxxxxxxxxxxxxxx>
---
codeface/R/ml/analysis.r | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/codeface/R/ml/analysis.r b/codeface/R/ml/analysis.r
index 2bc12fd..88bff32 100644
--- a/codeface/R/ml/analysis.r
+++ b/codeface/R/ml/analysis.r
@@ -402,6 +402,17 @@ check.corpus.precon <- function(corp.base) {
return(list(date.new, date.offset))
}
+ ## Fix subject: return the document's "heading" meta value with problematic characters replaced
+ fix.subject <- function(doc) {
+ ## Read the subject from the document's "heading" meta tag
+ subject = meta(doc, tag = "heading")
+
+ ## Replace each TAB with a single space -- dbWriteTable cannot handle tabs properly
+ subject <- gsub("\t", " ", subject, fixed=TRUE, useBytes=TRUE)
+
+ return(subject)
+ }
+
## Apply checks of conditions to all documents
fix.corpus.doc <- function(doc) {
meta(doc, tag="header") <- rmv.multi.refs(doc)
@@ -411,6 +422,8 @@ check.corpus.precon <- function(corp.base) {
meta(doc, tag="datetimestamp") <- fixed.date[[1]]
meta(doc, tag="datetimestampOffset") <- fixed.date[[2]]
+ meta(doc, tag="heading") <- fix.subject(doc)
+
return(doc)
}
@@ -772,10 +785,6 @@ dispatch.steps <- function(conf, repo.path, data.path,
forest.corp, cycle,
numberOfAuthors=thread.info$authors,
numberOfMessages=thread.info$messages)
- ## Remove tabs in subjects -- dbWriteTable cannot handle this properly
- dat$subject <- as.character(dat$subject)
- dat$subject <- gsub("\t", " ", dat$subject, fixed=TRUE, useBytes=TRUE)
-
res <- dbWriteTable(conf$con, "mail_thread", dat, append=TRUE,
row.names=FALSE)
if (!res) {
stop("Could not add to table mail_thread!")
--
2.10.2