Source code for solthiruthi.scoring
## -*- coding: utf-8 -*-
## (C) 2017 Muthiah Annamalai,
##
from __future__ import print_function
import codecs
import math
import operator
import os
import sys
from tamil import utf8
from pprint import pprint
from . import resources
# True when running on Python 3 or later. The numeric version tuple is
# compared rather than the first character of the ``sys.version`` string.
PYTHON3 = sys.version_info[0] >= 3
if PYTHON3:
    # reduce() is not a builtin on Python 3; it lives in functools.
    from functools import reduce
class NGStats:
    """Unigram and bigram score tables read from two text data files.

    Each non-blank line of a data file has the form ``<key>-<number>``
    (whitespace around either part is ignored).  After loading, every
    number is divided by ``1 + sum(all numbers)``, so the stored values of
    a table always add up to strictly less than one (presumably leaving
    some mass for unseen n-grams -- confirm with the data's author).
    """

    def __init__(self):
        self.unigram_file = resources.mk_path("tvu_unigram.txt")
        self.bigram_file = resources.mk_path("tvu_bigram.txt")
        # Fallback scores for keys missing from the tables:
        # (unigram default, bigram default).
        self.default = (1e-1, 1e-4)
        self.bigram = dict()
        self.unigram = dict()
        self.load()

    def _load_table(self, filename, table):
        """Read ``filename`` into ``table`` in place and normalize it.

        Blank lines are skipped.  Raises ValueError when a non-blank line
        does not split into exactly two parts on ``-`` or when the second
        part is not a number.
        """
        with codecs.open(filename, "r", "utf-8") as fp:
            # Stream the file line by line instead of materializing it.
            for line in fp:
                if not line.strip():
                    # Tolerate empty/trailing blank lines in the data file.
                    continue
                key, value = line.split("-")
                table[key.strip()] = float(value.strip())
        normalize = 1 + sum(table.values())
        # Iterate over a snapshot of the keys; only values are rewritten.
        for key in list(table):
            table[key] = table[key] / normalize

    def load(self):
        """Load and normalize the bigram table, then the unigram table."""
        self._load_table(self.bigram_file, self.bigram)
        self._load_table(self.unigram_file, self.unigram)

    def _ngram_scorer(self, letters, dictionary, default):
        """Return ``dictionary``'s value for each item, or ``default``."""
        return [dictionary.get(letter, default) for letter in letters]

    def unigram_score(self, letters):
        """Return the list of unigram scores, one per item of ``letters``.

        Items missing from the unigram table score ``self.default[0]``.
        """
        return self._ngram_scorer(letters, self.unigram, self.default[0])

    def bigram_score(self, letters):
        """Return the list of bigram scores, one per item of ``letters``.

        Items missing from the bigram table score ``self.default[1]``.
        """
        return self._ngram_scorer(letters, self.bigram, self.default[1])
# Shared module-level instance used by the scoring functions below.
# Constructing it calls NGStats.load(), so both data files are read when
# this module is imported.
ngstats = NGStats()
def unigram_score(letters):
    """Return the log10 of the product of the unigram scores of ``letters``.

    The result is computed as the sum of the per-letter log10 values
    instead of the log10 of the running product: for long inputs the
    product of many small scores underflows to 0.0, and ``math.log10(0.0)``
    raises ValueError.  Empty input yields 0.0.

    Raises ValueError if any individual score is zero or negative.
    """
    scores = ngstats.unigram_score(letters)
    return sum((math.log10(score) for score in scores), 0.0)
def bigram_scores(letters):
    """Return two log10 bigram scores, ``[s1, s2]``, for ``letters``.

    ``letters`` must be a sequence (it is sliced and indexed).  Two
    groupings of the sequence are looked up in the bigram table:

    * s1: the first letter alone, followed by the overlapping pairs that
      start at the second letter -- (1, 2 3, 3 4, ...).
    * s2: every overlapping pair -- (1 2, 2 3, 3 4, ...); for a
      single-letter input the letter itself is used.

    Each score is the sum of the log10 of the group scores, which equals
    the log10 of their product without underflowing to 0.0 on long inputs.
    Empty input returns ``[0.0, 0.0]`` (the log10 of an empty product).

    Raises ValueError if any individual score is zero or negative.
    """
    if len(letters) == 0:
        # Nothing to score; avoid indexing letters[0] below.
        return [0.0, 0.0]

    # Overlapping adjacent pairs: (1 2, 2 3, 3 4, ...).
    pairs = [u''.join(pair) for pair in zip(letters, letters[1:])]

    # First letter on its own, then the pairs starting from the 2nd letter.
    g1_letters = [letters[0]] + pairs[1:]
    # All pairs, or the lone letter when there are no pairs at all.
    g2_letters = pairs if pairs else [letters[0]]

    return [
        sum((math.log10(score) for score in ngstats.bigram_score(group)), 0.0)
        for group in (g1_letters, g2_letters)
    ]
if __name__ == u"__main__":
    # Demo run: print each sample word followed by its bigram scores and
    # its unigram score.
    sample_text = u"டைட்டானிக் படத்தில் ஜேக் மற்றும் ரோஸ் தன் காதலை வெளிப்படுத்தும் இரு தவளைகள்"
    for token in sample_text.split():
        pprint(token)
        token_letters = utf8.get_letters(token)
        pprint(bigram_scores(token_letters))
        pprint(unigram_score(token_letters))