Source code for solthiruthi.data_parser

#!/usr/bin/python
# (C) 2015-2016 - Muthiah Annamalai
# parse data files for Tamil proper nouns
from __future__ import print_function
import codecs
import sys
import json
import tamil
import re
import pprint

class WordList:
    """Container for the words that belong to a single category.

    Attributes:
        category: the category name given at construction time.
        words: the words added so far, in insertion order.
    """

    # NOTE: instances must stay truthy even when empty (callers test
    # ``if cat:``), so no __len__/__bool__ is defined here on purpose.
    def __init__(self, cat):
        """Create an empty word list for the category named ``cat``."""
        self.category = cat
        self.words = []

    def add(self, word):
        """Append ``word`` with leading/trailing whitespace removed."""
        self.words.append(word.strip())

class DataParser:
    """Parse categorised word-list files and collect the words per category.

    File format, as read by parse_data():
      * a line starting with '>>' opens a new category; its name is the line
        with every '>>' removed and surrounding whitespace stripped,
      * a line starting with '#' is a comment and is ignored,
      * any other line after a category header is a data line (see
        _extract_word for how the word is taken from it),
      * lines that appear before the first category header are ignored.
    """

    # Compiled once instead of per line; the raw string avoids the invalid
    # '\s' escape sequence that a plain string literal would contain.
    _COLUMN_SEPARATOR = re.compile(r'\s+')

    def __init__(self, files):
        """Remember the list of file names; nothing is read until process()."""
        self.categories = []
        self.files = files

    def process(self):
        """Parse every file, then reduce each category to its unique words.

        After this call ``cat.words`` is a set for every collected category.
        """
        for filename in self.files:
            self.parse_data(filename)
        for cat in self.categories:
            cat.words = set(cat.words)  # unique elements only

    @staticmethod
    def run(args):
        """Build a DataParser for the file names in ``args``, process them
        and return the populated parser."""
        obj = DataParser(args)
        obj.process()
        return obj

    @staticmethod
    def _extract_word(line):
        """Return the word carried by one data line ('' for a blank line).

        The first whitespace-separated column is dropped and the remaining
        columns are concatenated without separators. When nothing remains
        (e.g. the line has a single column) the stripped line itself is
        returned, so odd-looking lines are kept rather than lost.
        """
        word = u"".join(DataParser._COLUMN_SEPARATOR.split(line)[1:])
        if word:
            return word
        return line.strip()

    def parse_data(self, filename):
        """Read the UTF-8 file ``filename`` and append its categories to
        ``self.categories``.

        Blank data lines are skipped so that no empty string is recorded
        as a word.
        """
        cat = None
        with codecs.open(filename, 'r', 'utf-8') as fp:
            for line in fp:
                if line.startswith('>>'):
                    # A new header closes the category collected so far.
                    if cat:
                        self.categories.append(cat)
                    cat = WordList(cat=line.replace('>>', '').strip())
                elif line.startswith('#'):
                    continue
                elif cat:
                    word = self._extract_word(line)
                    if word:
                        cat.add(word)
        # The last category of the file has no following header to close it.
        if cat:
            self.categories.append(cat)

    def analysis(self):
        """Return summary statistics as a JSON-serialisable dict.

        Keys:
            'catlen': number of collected categories,
            'total':  sum of the word counts of all categories,
            'dict':   category name -> list of its words.

        Categories sharing a name overwrite each other in 'dict', while
        'total' still counts each of them.
        """
        words_by_category = {}
        total = 0
        for cat in self.categories:
            words_by_category[cat.category] = list(cat.words)
            total += len(cat.words)
        return {
            'catlen': len(self.categories),
            'total': total,
            'dict': words_by_category,
        }

    def __unicode__(self):
        """Return a text report: category count, per-category word counts
        and the overall total."""
        rep = u"# categories = %d" % len(self.categories)
        total = 0
        for cat in self.categories:
            count = len(cat.words)
            rep += u" %s -> %d\n" % (cat.category, count)
            total += count
        rep += 'Total words -> %d \n' % total
        return rep

    if sys.version_info[0] >= 3:
        # Python 3 never calls __unicode__; expose the same report via str().
        # Left undefined on Python 2, where __str__ must return bytes.
        __str__ = __unicode__
        
if __name__ == u"__main__":
    # Command-line entry point: every argument is a word-list file to parse.
    filenames = sys.argv[1:]
    if not filenames:
        print(u"usage: python data_parser.py <filename1> ... <filenamen>")
        print(u"       this command shows categories of words and their frequencies in document(s);")
        sys.exit(-1)

    stats = DataParser.run(filenames).analysis()

    # Save the full analysis (including the word lists) as JSON in the
    # current working directory.
    with codecs.open("ref.json", "w", "utf-8") as fp:
        serialised = json.dumps(stats)
        fp.write(serialised)

    # One-line summary on stdout.
    summary = (stats['catlen'], stats['total'])
    print(u"cat %d / total words %d" % summary)