Source code for ngram.LetterModels

# -*- coding: utf-8 -*-
# This file is part of open-tamil ngrams package
# (C) முத்தையா அண்ணாமலை 2013-2015,2017
# 
# N-gram language model for Tamil letters

import tamil
import copy
from .Corpus import Corpus
import codecs
import operator

[docs]class Letters: def __init__(self,filename): self.letter = dict() self.letter.update(zip( tamil.utf8.tamil_letters, map(lambda x : 0, tamil.utf8.tamil_letters) ) ) self.corpus = Corpus( filename ) def __del__(self): try: del self.corpus except Exception: pass def __unicode__( self ): op = u"" for lett,freq in self.letter.items(): op = op + u"%s => %d\n"%(lett,freq) print(max(self.letter.values())) return op
[docs] def save(self,filename): raise Exception('Not implemented')
[docs]class Unigram(Letters): def __init__(self,filename): Letters.__init__(self,filename)
[docs] def frequency_model( self ): """ build a letter frequency model for Tamil letters from a corpus """ # use a generator in corpus for next_letter in self.corpus.next_tamil_letter(): # update frequency from corpus self.letter[next_letter] = self.letter[next_letter] + 1
[docs] def save(self,filename): with codecs.open(filename,"w","utf-8") as fp: for k,v in sorted(self.letter.items(),key=operator.itemgetter(1),reverse=True): if v == 0: continue fp.write(u"%s - %d\n"%(k,v)) return True
[docs]class Bigram(Unigram): def __init__(self,filename): Unigram.__init__(self,filename) self.letter2 = dict() for k in tamil.utf8.tamil_letters: self.letter2[k] = copy.copy( self.letter )
[docs] def language_model(self,verbose=True): """ builds a Tamil bigram letter model """ # use a generator in corpus prev = None for next_letter in self.corpus.next_tamil_letter(): # update frequency from corpus if prev: self.letter2[prev][next_letter] += 1 if ( verbose ) : print(prev) print(next_letter) print( self.letter2[prev][next_letter] ) prev = next_letter #update always return
[docs] def save(self,filename): with codecs.open(filename,"w","utf-8") as fp: d = {} for k,v in self.letter2.items(): for k2,v2 in v.items(): if v2 == 0: continue d[k+k2] = v2 for k,v in sorted(d.items(),key=operator.itemgetter(1),reverse=True): fp.write(u"%s - %d\n"%(k,v)) return True
[docs]class Trigram(Unigram): def __init__(self,filename): Unigram.__init__(self,filename) self.letter3 = dict()
[docs] def language_model(self,verbose=True): """ builds a Tamil bigram letter model """ # use a generator in corpus p2 = None p1 = None for next_letter in self.corpus.next_tamil_letter(): # update frequency from corpus if p2: trig = p2+p1+next_letter self.letter3[trig] = 1 + self.letter3.get(trig,0) p2 = p1 p1 = next_letter #update always return
[docs] def save(self,filename): with codecs.open(filename,"w","utf-8") as fp: for k,v in sorted(self.letter3.items(),key=operator.itemgetter(1),reverse=True): if v == 0: continue fp.write(u"%s - %d\n"%(k,v)) return True