Source code for solthiruthi.data_parser
#!/usr/bin/python
# (C) 2015-2016 - Muthiah Annamalai
# parse data files for Tamil proper nouns
from __future__ import print_function
import codecs
import sys
import json
import tamil
import re
import pprint
class WordList:
    """Collection of words that all belong to one category.

    Attributes:
        category: the category label passed to the constructor.
        words: the collected words. Starts out as a list; note that
            DataParser.process() later replaces it with a set, so add()
            supports both container types.
    """

    def __init__(self, cat):
        """Create an empty word list for the category named `cat`."""
        self.category = cat
        self.words = []

    def add(self, word):
        """Store `word`, with surrounding whitespace stripped.

        Works whether `self.words` is still the initial list or has been
        converted to a set of unique words; the original list-only
        `append` raised AttributeError in the latter case.
        """
        cleaned = word.strip()
        if isinstance(self.words, set):
            self.words.add(cleaned)
        else:
            self.words.append(cleaned)
class DataParser:
    """Parse categorized word-list files into WordList objects.

    File layout, as handled by parse_data():
      * a line starting with '>>' opens a new category; the rest of the
        line (with every '>>' removed and whitespace stripped) is its name;
      * a line starting with '#' is a comment and is ignored;
      * any other line, once a category is open, contributes one word;
      * lines appearing before the first '>>' header are ignored.
    """

    # Compiled once and written as a raw string: the original '\s+' literal
    # is an invalid escape sequence and triggers a warning on modern Python.
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, files):
        """Remember the data files to parse.

        files: iterable of paths; each is opened as UTF-8 by parse_data().
        """
        self.categories = []
        self.files = files

    def process(self):
        """Parse every file, then reduce each category to its unique words.

        After this call each category's `words` attribute is a set.
        """
        for filename in self.files:
            self.parse_data(filename)
        for cat in self.categories:
            cat.words = set(cat.words)  # unique elements only

    @staticmethod
    def run(args):
        """Build a DataParser for the filenames in `args`, process it and
        return the populated parser."""
        obj = DataParser(args)
        obj.process()
        return obj

    @staticmethod
    def _extract_word(line):
        """Return the word carried by one data line (u"" for a blank line).

        The first whitespace-delimited field is dropped and the remaining
        fields are concatenated without separators. If nothing remains,
        the line has no leading field, so the stripped line itself is used.
        """
        fields = DataParser._WHITESPACE.split(line)
        word = u"".join(fields[1:])
        if word:
            return word
        # odd looking line - we'll keep it anyway
        return line.strip()

    def parse_data(self, filename):
        """Read one UTF-8 data file and append its categories to
        `self.categories`.

        Blank lines are skipped; previously they were stored as an
        empty-string word in the current category.
        """
        cat = None
        with codecs.open(filename, 'r', 'utf-8') as fp:
            for line in fp.readlines():
                if line.startswith('>>'):
                    # A new header closes the category collected so far.
                    if cat:
                        self.categories.append(cat)
                    cat = WordList(cat=line.replace('>>', '').strip())
                elif line.startswith('#'):
                    continue
                elif cat:
                    word = self._extract_word(line)
                    if word:
                        cat.add(word)
        # Flush the last category of the file, if any.
        if cat:
            self.categories.append(cat)

    def analysis(self):
        """Summarize the parsed data as a plain dictionary.

        Returns a dict with keys:
            'catlen': number of parsed categories,
            'total':  sum of the word counts of all categories,
            'dict':   mapping of category name -> list of its words
                      (a later category with the same name replaces an
                      earlier one in this mapping).
        """
        words_by_category = {}
        total = 0
        for cat in self.categories:
            words_by_category[cat.category] = list(cat.words)
            total += len(cat.words)
        return {
            'catlen': len(self.categories),
            'total': total,
            'dict': words_by_category,
        }

    def __unicode__(self):
        """Return the statistics of the word lists as text: the number of
        categories, one 'name -> count' entry per category, and the total."""
        rep = u"# categories = %d" % len(self.categories)
        total = 0
        for cat in self.categories:
            cat_wlen = len(cat.words)
            rep += u" %s -> %d\n" % (cat.category, cat_wlen)
            total += cat_wlen
        rep += 'Total words -> %d \n' % total
        return rep

    if sys.version_info[0] >= 3:
        # Python 3 never calls __unicode__; expose the same report through
        # str(). Python 2 is left untouched so str() keeps its old behavior.
        __str__ = __unicode__
if __name__ == u"__main__":
    # Command-line entry point: parse the files named on the command line,
    # dump the analysis to ref.json and print a one-line summary.
    input_files = sys.argv[1:]
    if not input_files:
        print(u"usage: python data_parser.py <filename1> ... <filenamen>")
        print(u" this command shows categories of words and their frequencies in document(s);")
        sys.exit(-1)

    report = DataParser.run(input_files).analysis()

    # Save the results as JSON in the current working directory.
    with codecs.open("ref.json", "w", "utf-8") as out:
        out.write(json.dumps(report))

    summary = u"cat %d / total words %d" % (report['catlen'], report['total'])
    print(summary)