## -*- coding: utf-8 -*-
## (C) 2015 Muthiah Annamalai,
## This module is part of solthiruthi project under open-tamil umbrella.
## This code may be used/distributed under MIT LICENSE.
from __future__ import print_function
import abc
import codecs
import copy
from tamil import utf8
# Suffix removal algorithm
class RemoveSuffix(object):
    """Abstract base class for stripping a known suffix from a word.

    Subclasses fill in the suffix tables in :meth:`setSuffixes` and expose the
    operation through :meth:`apply`.  The matching itself lives in
    :meth:`removeSuffix`, which works on the reversed word so that "ends with
    suffix" becomes a plain ``startswith`` test.
    """

    # NOTE: the ``__metaclass__`` attribute is only honoured by Python 2.  On
    # Python 3 it is an ordinary class attribute, so the abstract methods
    # below are not enforced when instantiating.
    __metaclass__ = abc.ABCMeta

    def __init__(self):
        # Suffixes that may be stripped; stays None until setSuffixes() runs.
        self.possible_suffixes = None
        # Maps a stripped suffix to the text appended in its place.
        self.replace_suffixes = {}
        # utf8.reverse_word() of every entry in possible_suffixes.
        self.reversed_suffixes = []

    @abc.abstractmethod
    def setSuffixes(self):
        """Populate ``possible_suffixes`` and ``replace_suffixes``."""
        pass

    @abc.abstractmethod
    def apply(self, word):
        """Run this filter on ``word``."""
        pass

    def prepareSuffixes(self):
        """Build ``reversed_suffixes`` from ``possible_suffixes``.

        The list is rebuilt from scratch so that calling this method more
        than once does not accumulate duplicate entries.

        Raises:
            AssertionError: if ``possible_suffixes`` is empty or None.
        """
        assert self.possible_suffixes
        # Reverse the letters of each suffix.
        self.reversed_suffixes = [
            utf8.reverse_word(suffix) for suffix in self.possible_suffixes
        ]
        return

    def removeSuffix(self, word):
        """Strip the longest matching suffix from ``word``.

        Args:
            word: the text to process.

        Returns:
            A ``(word, removed)`` tuple.  ``removed`` is True when a suffix
            was stripped; ``word`` is then the remaining stem plus the
            replacement text from ``replace_suffixes`` (if that entry is
            non-empty).  When nothing matches, the input is returned as is.
        """
        if not self.possible_suffixes:
            # Lazy one-time initialisation of the suffix tables.
            self.setSuffixes()
            self.prepareSuffixes()

        word_lett = utf8.get_letters(word)
        # Reverse letter by letter so a suffix match is a prefix match.
        rword = u"".join(reversed(word_lett))

        longest_match = u""
        for suffix in self.reversed_suffixes:
            # Lengths are compared on the joined strings; "<=" lets a later
            # entry of equal length replace an earlier one.
            if rword.startswith(suffix) and len(longest_match) <= len(suffix):
                longest_match = suffix

        if not longest_match:
            return word, False

        # Number of trailing letters to drop.  Clamp the split point so a
        # letter count larger than the word cannot raise IndexError.
        n_letters = len(utf8.get_letters(longest_match))
        split_at = max(len(word_lett) - n_letters, 0)
        word = u"".join(word_lett[:split_at])
        sfx = u"".join(word_lett[split_at:])

        # Rule to replace the stripped suffix with alternative text.
        alt_suffix = self.replace_suffixes.get(sfx, None)
        if alt_suffix:
            word = word + alt_suffix
        return word, True
# remove prefix using the suffix removal algorithm via reversal of word
class RemovePrefix(RemoveSuffix):
    """Strip a known prefix by reusing the suffix-removal machinery.

    The word is reversed before being handed to ``removeSuffix`` and the
    result is reversed back, so a prefix is treated as a suffix of the
    reversed word.
    """

    def __init__(self):
        super(RemovePrefix, self).__init__()

    def setSuffixes(self):
        """Install the prefix table; every prefix is replaced by nothing."""
        self.replace_suffixes = {
            u"மா": u"",
            u"பேர்": u"",
            u"அதி": u"",
            u"பெரிய": u"",
            u"பெரு": u"",
            u"சின்ன": u"",
            u"ஆதி": u"",
            u"சிறு": u"",
            u"அக்": u"",
            u"இக்": u"",
            u"எக்": u"",
        }
        # Stored reversed because removeSuffix() is fed the reversed word.
        self.possible_suffixes = [
            utf8.reverse_word(prefix) for prefix in self.replace_suffixes
        ]

    def apply(self, word):
        """Filter entry point; same as :meth:`removePrefix`."""
        return self.removePrefix(word)

    def removePrefix(self, word):
        """Strip a known prefix from ``word``.

        Returns:
            A two-element list ``[word, removed]`` where ``removed`` is the
            flag reported by ``removeSuffix``.
        """
        letters = utf8.get_letters(word)
        letters.reverse()
        stem, removed = self.removeSuffix(u"".join(letters))
        # Undo the reversal applied before suffix removal.
        return [utf8.reverse_word(stem), removed]
class RemoveCaseSuffix(RemoveSuffix):
    """Strip grammatical-case endings (and a few related ones) from a word."""

    def __init__(self):
        super(RemoveCaseSuffix, self).__init__()

    def apply(self, word):
        """Filter entry point; same as ``removeSuffix``."""
        return self.removeSuffix(word)

    def setSuffixes(self):
        """Install the case-suffix table; every suffix is replaced by nothing."""
        accusative = u"ை"
        instrumental = u"ஆல்"
        associative = u"ஓடு"
        dative = u"க்கு"
        genitive = u"இன்"
        possessive = u"உடைய"
        locative = u"இடம்"
        ablative = u"இடமிருந்து"
        # The dative marker appears twice (literal and variable); the
        # duplicate is harmless and kept so the list content is unchanged.
        self.possible_suffixes = [
            u"உக்கு", u"க்கு", u"ளை", u"கள்",
            accusative, instrumental, associative,
            dative, genitive, possessive, locative, ablative,
        ]
        self.replace_suffixes = dict.fromkeys(self.possible_suffixes, u"")
        return
class RemoveHyphenatesNumberDate(RemoveCaseSuffix):
    """Placeholder for hyphenated number/date forms.

    Adds nothing to ``RemoveCaseSuffix`` yet.  Examples noted by the original
    author: "65536-மேல்" (மேல்) and "2-வது" (இரண்டாவது).
    """
    pass
class RemoveVerbSuffixTense(RemoveCaseSuffix):
    """Strip the verb endings listed in :meth:`setSuffixes` from a word."""

    def __init__(self):
        # Name this class (not the parent) so the full MRO chain is run.
        super(RemoveVerbSuffixTense, self).__init__()
        # Tense markers keyed by tense name; not consulted by any method
        # defined in this class.
        self.tenses = {"present": u"கிற்",
                       "past": u"த",
                       "future": u"வ"}

    def setSuffixes(self):
        """Install the verb-ending table; every ending is replaced by nothing."""
        tense_endings = [u"ஏன்", u"ஆய்", u"ஆர்", u"ஆன்", u"ஆள்", u"அது", u"ஓம்", u"அன"]
        self.possible_suffixes = tense_endings
        # Must be a mapping: removeSuffix() looks replacements up with .get().
        self.replace_suffixes = dict.fromkeys(tense_endings, u"")
class RemovePluralSuffix(RemoveSuffix):
    """Strip the plural endings listed in :meth:`setSuffixes` from a word."""

    def __init__(self):
        super(RemovePluralSuffix, self).__init__()

    def apply(self, word):
        """Filter entry point; same as ``removeSuffix``."""
        return self.removeSuffix(word)

    def setSuffixes(self):
        """Install the suffix table; some endings map to replacement text."""
        self.replace_suffixes = {
            u"ற்கள்": u"ல்",
            u"கள்": u"",
            u"ல்": u"",
            u"ட்கள்": u"ள்",
            u"ங்கள்": u"ம்",
        }
        self.possible_suffixes = list(self.replace_suffixes)
class RemoveNegationSuffix(RemoveCaseSuffix):
    """Strip the endings listed in :meth:`setSuffixes` from a word.

    ``apply`` is inherited from ``RemoveCaseSuffix``.
    """

    def __init__(self):
        super(RemoveNegationSuffix, self).__init__()

    def setSuffixes(self):
        """Install the suffix table; every suffix is replaced by nothing."""
        self.replace_suffixes = {
            u"கே": u"",
            u"ல்லை": u"",
            u"ாதே": u"",
            u"ாமல்": u"",
        }
        self.possible_suffixes = list(self.replace_suffixes)
class CaseFilter(object):
    """Chain filter objects, feeding each one's output word to the next."""

    def __init__(self, *filter_obj_list):
        """Store the filters in the order they should be applied.

        Args:
            *filter_obj_list: objects exposing ``apply(word)`` that returns an
                indexable whose first element is the processed word.
        """
        super(CaseFilter, self).__init__()
        self.filters = filter_obj_list

    def apply(self, word_in):
        """Run ``word_in`` through every filter and return the final word.

        With no filters configured the input is returned unchanged.
        """
        word = word_in
        for filter_obj in self.filters:
            # Only the word (element 0) is carried forward; any remaining
            # elements of the filter's result are ignored.
            word = filter_obj.apply(word)[0]
        return word
def xkcd():
    """Self-test: check RemovePluralSuffix directly and through CaseFilter.

    Raises:
        AssertionError: if a word is not reduced to its expected stem, or the
            direct and chained results disagree.
    """
    obj = RemovePluralSuffix()
    objf = CaseFilter(obj)
    expected = [u"பதிவி", u"கட்டளை", u"அவர்", u"பள்ளி"]
    words_list = [u"பதிவில்", u"கட்டளைகள்", u"அவர்கள்", u"பள்ளிகள்"]
    for w, x in zip(words_list, expected):
        rval = obj.removeSuffix(w)
        trunc_word = objf.apply(w)
        # The chained filter must agree with the direct call.
        assert trunc_word == rval[0]
        assert rval[1]
        # NOTE(review): this prints the "removed" flag, not the stem --
        # confirm whether rval[0] was intended.
        print(utf8.get_letters(w), '->', rval[1])
        assert rval[0] == x
    return
# Run the self-test when the module is executed as a script.
if __name__ == "__main__":
    xkcd()