#!/usr/bin/python

import korpus2db

import sys
import string

# Import a file of alternating-language segments into the corpus database.
#
# The file named by the first command-line argument is split on the literal
# marker "<segmentgrenze>".  Chunks at even positions are entered as language
# 1 ("de"), chunks at odd positions as language 2 ("en").  Before a language-2
# chunk is entered, k.segment_idx is reset to the value it had before the
# preceding language-1 chunk, so both chunks of a pair are entered starting
# from the same segment index.  After each language-2 chunk one row is written
# to phrase_count and one to phrase_examples, both keyed on that start index.
#
# NOTE(review): a file with an odd number of chunks (e.g. one that ends with
# the marker) gets its last chunk entered as language 1 with no partner and no
# phrase_count/phrase_examples rows -- confirm this is intended.

# Validate the command line before any database resources are acquired.
if len(sys.argv) < 2:
    sys.exit("usage: %s SEGMENTFILE" % sys.argv[0])

k = korpus2db.Korpus()

# Read the whole input up front; the context manager closes the handle even if
# reading fails, and the file is not kept open during the database work.
with open(sys.argv[1], "rt") as f:
    segliste = f.read().split("<segmentgrenze>")

tagset_de = k.dbconn.executelist("select name, tagsetid from tagsets where sprachenid = %s and name = '%s'" %(1, "tagset_ims_de"))
tagset_en = k.dbconn.executelist("select name, tagsetid from tagsets where sprachenid = %s and name = '%s'" %(2, "tagset_ims_en"))
# First result of each query, indexed by (language number - 1).
tagsets = [tagset_de[0], tagset_en[0]]
sprachen = ["de", "en"]
pc_id = 1
pe_id = 1
for i, chunk in enumerate(segliste):
    segment = [chunk]
    # Languages alternate: 1 for even positions, 2 for odd positions.
    sprache = 1 + (i % 2)
    if sprache == 1:
        # Remember where this pair starts so the language-2 chunk can reuse it.
        segstart = k.segment_idx
        sys.stderr.write("Processing %s -> %s\n" %(i,segstart))
    else:
        k.segment_idx = segstart
    # Refresh the tag index from the per-language tag table before entering.
    k.tag_idx = k.dbconn.getMaxIndex("tagid", "tagset_ims_%s" % sprachen[sprache - 1])
    k.entersegment(segment, k.sent_idx, -1, sprache, tagsets[sprache - 1])
    if sprache == 2:
        # The pair is complete: record it once in both phrase tables.
        k.dbconn.update("insert into phrase_count(pc_id, segmentnr, zaehler) values (%s, %s, 1)" %(pc_id, segstart))
        k.dbconn.update("insert into phrase_examples(pe_id, pc_id, segmentnr) values(%s, %s, %s)" %(pe_id, pc_id, segstart))
        pc_id = pc_id + 1
        pe_id = pe_id + 1

# Reached only on success, as in the original flow.
# NOTE(review): whether Korpus.close() commits is not visible here, so it is
# deliberately not moved into a `finally` -- closing on error might persist a
# partial import.
k.close()
