[cablewiki] Re: Taxonomy

  • From: Stefan de Konink <stefan@xxxxxxxxx>
  • To: cablewiki@xxxxxxxxxxxxx, hamstar@xxxxxxxxxxxxxx
  • Date: Sat, 15 Jan 2011 21:46:50 +0100

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512

Op 15-01-11 21:39, Robert McLeod schreef:
> Oh yea, where can I find it?

Attached to this email. The second one is what I currently use to generate
interesting stuff from 'unpublished' documents that are in fact floating
around somewhere.


Stefan
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2.0.16 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/

iEYEAREKAAYFAk0yB7oACgkQYH1+F2Rqwn3fKACdENodHgbPhx669cOdK11EIVjo
e2sAoJVTk82BFiyoXb248hNM9AptNNSu
=09u4
-----END PGP SIGNATURE-----
import os
import sys
import re
import codecs
from BeautifulSoup import BeautifulSoup
from BeautifulSoup import BeautifulStoneSoup

import re, htmlentitydefs
from os.path import join

# Cable header with its line breaks still written as the literal entity
# "&#x000A;": captures the text after FM, after TO and after INFO.
c = re.compile(".*&#x000A;FM (.*)&#x000A;TO (.*)&#x000A;INFO (.*)")
# Some text, a space, then the keyword PRIORITY or IMMEDIATE.
c1 = re.compile("(.*) (PRIORITY|IMMEDIATE)")
# Some text, a space, then a run of digits (anything after it is ignored).
c2 = re.compile("(.*) ([0-9]+).*")

def splitupall(writer, contents):
        """Write one addressee entry as a tab-separated row to *writer*.

        *contents* is split on '/'. With exactly two parts they become the
        tag and the title; otherwise only the first part is used, as the
        title, and the tag stays empty. A PRIORITY/IMMEDIATE keyword (c1)
        and a number (c2) are then peeled off the title where present.

        The row is: docname, tag, title, priority, id. ``docname`` is read
        from module scope, so it must be set before this is called. The
        writer is flushed after every row.
        """
        parts = contents.split('/')
        if len(parts) == 2:
                tag, title = [p.strip(' ') for p in parts]
        else:
                tag, title = '', parts[0].strip(' ')

        priority = ''
        thisid = ''

        with_priority = c1.match(title)
        if with_priority:
                # Keep only the text in front of the keyword as the title.
                title = with_priority.group(1).strip(' ')
                priority = with_priority.group(2).strip(' ')

        # The number is looked up in the title as it stands now, i.e. after
        # the keyword (and everything following it) has been cut off.
        with_id = c2.match(title)
        if with_id:
                thisid = with_id.group(2).strip(' ')
                # NOTE(review): the number is only removed from the title when
                # no priority keyword was found; with a keyword the title keeps
                # it, and text after the keyword is discarded -- confirm this
                # asymmetry is intended.
                if not with_priority:
                        title = with_id.group(1).strip(' ')

        row = (docname, tag, title, priority, thisid)
        writer.write("%s\t%s\t%s\t%s\t%s\n" % row)
        writer.flush()


# Walk the 'cable' directory and, for every .html file whose first <pre>
# element matches the header pattern `c`, write one tab-separated row per
# TO entry to /tmp/to.sql and one per INFO entry to /tmp/info.sql.
to = codecs.open("/tmp/to.sql", "w", "utf-8")
info = codecs.open("/tmp/info.sql", "w", "utf-8")

try:
        for root, dirs, files in os.walk('cable'):
                for name in files:
                        if not name.endswith('.html'):
                                continue

                        path = os.path.join(root, name)

                        f = open(path, 'r')
                        try:
                                soup = BeautifulSoup(f.read(),
                                                     convertEntities=BeautifulStoneSoup.ALL_ENTITIES)
                        finally:
                                # Close the input even when parsing fails.
                                f.close()

                        # splitupall() reads this module-level name for the
                        # first column of every row. Only the trailing
                        # extension is removed.
                        docname = name[:-len('.html')]

                        pre = soup.findAll("pre")
                        if not pre or not pre[0].contents:
                                # A page without a header block must not abort
                                # the whole export; report it and move on.
                                sys.stderr.write("%s: no <pre> content, skipped\n" % path)
                                continue

                        m = c.match(pre[0].contents[0])
                        if not m:
                                continue

                        for i in m.group(2).split('&#x000A;'):
                                splitupall(to, i)

                        for i in m.group(3).split('&#x000A;'):
                                splitupall(info, i)
finally:
        # Always release both output files, also when a file fails midway.
        to.close()
        info.close()
import sys
import re
import codecs
import os
import datetime

# Line-level patterns for the header fields of a cable text file. Each one
# is matched against a single line of the document.
documentid = re.compile("^[0-9]{2}[A-Z]{4,}[0-9]{2,}$")
# The code below uses group 1 as month ("maand"), group 3 as year ("jaar")
# and group 2 as the remaining date part -- presumably month/day/year input.
date = re.compile("^([0-9]+)/([0-9]+)/([0-9]+) ([0-9]+):([0-9]+)$")
classification = re.compile(
        "^(UNCLASSIFIED|UNCLASSIFIED//FOR OFFICIAL USE ONLY"
        "|CONFIDENTIAL|CONFIDENTIAL//NOFORN|SECRET|SECRET//NOFORN)$")
origin = re.compile("^(Embassy.*)")


def parse_cable(alldoc):
        """Extract the header fields from the full text of one cable.

        The text is scanned line by line; for every field the first line
        that matches its pattern wins and later matches are ignored.

        Returns a tuple ``(jaar, maand, hasid, created, hasclass,
        hasorigin)``. Any field that was not found is the empty string, so
        values never carry over from a previously parsed document.
        """
        hasid = ''
        created = ''
        hasclass = ''
        hasorigin = ''
        maand = ''
        jaar = ''

        for i in alldoc.split("\n"):
                if hasid == '':
                        match = documentid.match(i)
                        if match is not None:
                                hasid = match.group(0)
                if created == '':
                        match = date.match(i)
                        if match is not None:
                                maand = match.group(1)
                                jaar = match.group(3)
                                # Reordered to year-month-day hour:minute:00.
                                created = "%s-%s-%s %s:%s:00" % (
                                        match.group(3), match.group(1),
                                        match.group(2), match.group(4),
                                        match.group(5))
                if hasclass == '':
                        match = classification.match(i)
                        if match is not None:
                                hasclass = match.group(1)
                if hasorigin == '':
                        match = origin.match(i)
                        if match is not None:
                                hasorigin = match.group(1)

        return (jaar, maand, hasid, created, hasclass, hasorigin)


# Walk the 'rtl' directory and write one tab-separated row per file:
# year, month, document id, creation time, time of this run,
# classification, origin, and the whole text with tabs/newlines escaped.
output = codecs.open('/tmp/rtlcables.sql', 'w', 'utf-8')
try:
        for root, dirs, files in os.walk('rtl'):
                for name in files:
                        doc = codecs.open(os.path.join(root, name), 'r', 'utf-8')
                        try:
                                alldoc = doc.read()
                        finally:
                                doc.close()

                        jaar, maand, hasid, created, hasclass, hasorigin = parse_cable(alldoc)

                        output.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                                jaar, maand, hasid, created,
                                datetime.datetime.now(), hasclass, hasorigin,
                                alldoc.replace("\t", '\\t').replace("\n", '\\n')))
finally:
        # Make sure everything written so far reaches the disk.
        output.close()

Other related posts: