-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA512 Op 15-01-11 21:39, Robert McLeod schreef: > Oh yea, where can I find it? Attached to this email. Second one I currently use to generate interesting stuff from 'unpublished' documents that are in fact floating somewhere. Stefan -----BEGIN PGP SIGNATURE----- Version: GnuPG v2.0.16 (GNU/Linux) Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/ iEYEAREKAAYFAk0yB7oACgkQYH1+F2Rqwn3fKACdENodHgbPhx669cOdK11EIVjo e2sAoJVTk82BFiyoXb248hNM9AptNNSu =09u4 -----END PGP SIGNATURE-----
import codecs
import os
import re
import sys
import htmlentitydefs
from os.path import join

from BeautifulSoup import BeautifulSoup
from BeautifulSoup import BeautifulStoneSoup

# Header pattern applied to the first child of the first <pre> element:
# group(1) is the FM line, group(2) the TO section, group(3) the INFO section.
# NOTE(review): in the copy of this script under review, the separators inside
# this pattern and in the two split() calls below appear as raw line breaks
# inside the quotes; they are written as "\n" escapes here -- confirm against
# the original attachment.
c = re.compile('.*\nFM (.*)\nTO (.*)\nINFO (.*)')
# "<title> PRIORITY" / "<title> IMMEDIATE"; anything after the keyword is not
# captured by this pattern.
c1 = re.compile('(.*) (PRIORITY|IMMEDIATE)')
# "<title> <digits>..."
c2 = re.compile('(.*) ([0-9]+).*')


def splitupall(writer, contents, name=None):
    """Write one tab-separated row for a single TO/INFO entry.

    ``contents`` is split on '/'.  When that yields exactly two parts they are
    taken as tag and title; otherwise only the first part is used, as the
    title, and the tag stays empty.  A trailing PRIORITY/IMMEDIATE keyword and
    a number are then peeled off the title where the patterns ``c1`` and
    ``c2`` match.

    Parameters:
        writer:   file-like object; receives the row and is flushed afterwards.
        contents: the raw entry text.
        name:     document name for the first column.  When omitted, the
                  module-level ``docname`` set by the driver loop is used,
                  which is what the two-argument form has always done.

    The row written is: name, tag, title, priority, id.
    """
    if name is None:
        name = docname

    parts = contents.split('/')
    tag = ''
    priority = ''
    thisid = ''
    if len(parts) == 2:
        tag = parts[0].strip(' ')
        title = parts[1].strip(' ')
    else:
        title = parts[0].strip(' ')

    m1 = c1.match(title)
    if m1:
        untrimmed = title
        priority = m1.group(2).strip(' ')
        title = m1.group(1).strip(' ')
        # c1 drops everything after the keyword, so a number that follows
        # PRIORITY/IMMEDIATE can never be found in the trimmed title.  Try the
        # trimmed title first (previous behaviour) and only then fall back to
        # the untrimmed text.
        m2 = c2.match(title) or c2.match(untrimmed)
        if m2:
            thisid = m2.group(2).strip(' ')
    else:
        m2 = c2.match(title)
        if m2:
            thisid = m2.group(2).strip(' ')
            title = m2.group(1).strip(' ')

    writer.write("%s\t%s\t%s\t%s\t%s\n" % (name, tag, title, priority, thisid))
    writer.flush()


# Walk the 'cable' tree and emit one row per TO entry into /tmp/to.sql and one
# row per INFO entry into /tmp/info.sql.  The with-blocks make sure both
# outputs are closed even if a document fails to parse.
with codecs.open("/tmp/to.sql", "w", "utf-8") as to:
    with codecs.open("/tmp/info.sql", "w", "utf-8") as info:
        for root, dirs, files in os.walk('cable'):
            for name in files:
                if not name.endswith('.html'):
                    continue

                with open(os.path.join(root, name), 'r') as f:
                    soup = BeautifulSoup(
                        f.read(),
                        convertEntities=BeautifulStoneSoup.ALL_ENTITIES)

                docname = name.replace('.html', '')
                pre = soup.findAll("pre")
                # Pages without a <pre>, or with an empty one, carry no header
                # to parse; skip them instead of raising IndexError.
                if not pre or not pre[0].contents:
                    continue

                m = c.match(pre[0].contents[0])
                if m:
                    for entry in m.group(2).split('\n'):
                        splitupall(to, entry, docname)
                    for entry in m.group(3).split('\n'):
                        splitupall(info, entry, docname)
import sys
import re
import codecs
import os
import datetime

# Each pattern is applied to a single line of a document.
documentid = re.compile("^[0-9]{2}[A-Z]{4,}[0-9]{2,}$")
date = re.compile("^([0-9]+)/([0-9]+)/([0-9]+) ([0-9]+):([0-9]+)$")
classification = re.compile("^(UNCLASSIFIED|UNCLASSIFIED//FOR OFFICIAL USE ONLY|CONFIDENTIAL|CONFIDENTIAL//NOFORN|SECRET|SECRET//NOFORN)$")
origin = re.compile("^(Embassy.*)")


def extract_fields(alldoc):
    """Pull the metadata fields out of one document's text.

    The text is scanned line by line; for every field the first matching line
    wins and later matches are ignored.  A field with no matching line is
    returned as the empty string, so values can never leak over from a
    previously processed document.

    Returns a tuple ``(year, month, doc_id, created, classification, origin)``
    where ``created`` is formatted as "<n3>-<n1>-<n2> <hh>:<mm>:00" from the
    five numbers of the date line (``n1/n2/n3 hh:mm``), ``year`` is ``n3`` and
    ``month`` is ``n1``.
    """
    year = ''
    month = ''
    doc_id = ''
    created = ''
    doc_class = ''
    doc_origin = ''

    for line in alldoc.split("\n"):
        if doc_id == '':
            match = documentid.match(line)
            if match is not None:
                doc_id = match.group(0)
        # ``created`` is always non-empty once a date line has been seen, so
        # it doubles as the "date already found" flag.
        if created == '':
            match = date.match(line)
            if match is not None:
                month = match.group(1)
                year = match.group(3)
                created = "%s-%s-%s %s:%s:00" % (
                    match.group(3), match.group(1), match.group(2),
                    match.group(4), match.group(5))
        if doc_class == '':
            match = classification.match(line)
            if match is not None:
                doc_class = match.group(1)
        if doc_origin == '':
            match = origin.match(line)
            if match is not None:
                doc_origin = match.group(1)

    return year, month, doc_id, created, doc_class, doc_origin


# Walk the 'rtl' tree and write one tab-separated row per file to
# /tmp/rtlcables.sql: year, month, id, created, time of import,
# classification, origin, and the full text with tabs and newlines replaced
# by the two-character sequences \t and \n so the row stays on one line.
with codecs.open('/tmp/rtlcables.sql', 'w', 'utf-8') as output:
    for root, dirs, files in os.walk('rtl'):
        for name in files:
            with codecs.open(os.path.join(root, name), 'r', 'utf-8') as doc:
                alldoc = doc.read()

            jaar, maand, hasid, created, hasclass, hasorigin = extract_fields(alldoc)
            output.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                jaar, maand, hasid, created, datetime.datetime.now(),
                hasclass, hasorigin,
                alldoc.replace("\t", '\\t').replace("\n", '\\n')))