Source code for solthiruthi.heuristics

## -*- coding: utf-8 -*-
## (C) 2015-2017 Muthiah Annamalai,
##
from __future__ import print_function
import abc
import sys
from tamil import utf8
from pprint import pprint

PYTHON3 = (sys.version[0] == '3')

[docs]def get_letters(word): if isinstance(word,list): chars = word else: chars = utf8.get_letters(word) return chars
[docs]class Rule: __metaclass__ = abc.ABCMeta
[docs] @abc.abstractmethod def apply( self, word, ctx ): """ @word is just that. @ctx is a dict of NwordsPrevious, NwordsNext, and a list of surrounding words for as items. e.g. ctx = {'NPrev' : 4, 'Prev' : [w1,w2,w3,w4],'NNext':2,'Next':[w1,w2]} return value should be boolean (False if error found) and an optional reason as second argument """ return False,None
[docs]class Sequential:
[docs] @staticmethod def in_sequence( word, ref_set, ref_reason, freq_threshold = 2 ): """ ignore ctx information right now. If repetition/match length >= @freq_threshold then we flag-it """ chars = get_letters(word) flag = True #no error assumed reason = None #no reason freq_count = 0 for char in chars: if char in ref_set: freq_count += 1 if freq_count >= freq_threshold: flag = False break continue freq_count = 0 # continue loop if not flag: reason = ref_reason return flag,reason
[docs]class AdjacentVowels(Rule): """ donot allow adjacent vowels in the word. ஆஅக்காள் (originally -> அக்காள்) will be flagged """ reason = u"ஒன்றைத்தொடர்ந்துஒன்று உயிரெழுத்துக்கள் வரக்கூடாது. இது பெரும்பாலும் பிழையாக இருக்கும்." uyir_letters = set(utf8.uyir_letters)
[docs] def apply(self, word, ctx=None): """ ignore ctx information right now """ return Sequential.in_sequence(word,AdjacentVowels.uyir_letters,AdjacentVowels.reason)
[docs]class AdjacentConsonants(Rule): """ donot allow adjacent consonants in the word. this may not be as useful as AdjacentVowels rules """ reason = u"ஒன்றைத்தொடர்ந்துஒன்று மெய் எழுத்துக்கள் வரக்கூடாது. இது பெரும்பாலும் பிழையாக இருக்கும்." mei_letters = set(utf8.mei_letters) agaram_letters = set(utf8.agaram_letters) def __init__(self,freq=2): self.freq_threshold = freq
[docs] def apply(self, word, ctx=None): """ ignore ctx information right now """ flag,reason = Sequential.in_sequence(word,AdjacentConsonants.mei_letters,AdjacentConsonants.reason,self.freq_threshold) if flag: flag,reason = Sequential.in_sequence(word,AdjacentConsonants.agaram_letters,AdjacentConsonants.reason,self.freq_threshold) return flag,reason
[docs]class RepeatedLetters(Rule): """ donot allow more than one repetition of a letter in word """ reason = u"ஒரே எழுத்து பல முரை (>= 2) தொடர்ச்சியாக வந்தால் அது பிழையான சொல் ஆகும்"
[docs] def apply(self,word,ctx=None): """ ignore ctx information right now """ chars = get_letters(word) flag = True #no error assumed reason = None #no reason prev_letter = None for char in chars: if prev_letter == char: flag = False break prev_letter = char # continue loop if not flag: reason = RepeatedLetters.reason return flag,reason
[docs]class BadIME(Rule): """ donot allow vowels with kombu, thunaikaal etc in the word. ஆாள் (originally intended as -> ஆள்) will be flagged """ reason = u"சொல்லில் பிழை காரணம், இல்லாத தமிழ் எழுத்து.." uyir_letters = set(utf8.uyir_letters)
[docs] def apply(self, word, ctx=None): """ ignore ctx information right now """ chars = get_letters(word) flag = True #no error assumed reason = None #no reason prev_char = None for char in chars: rule1,rule2,rule3 = False,False,False # rule 1 : uyir followed by kombugal rule1 = (char[-1] in utf8.accent_symbols) and (char[0] in utf8.uyir_letters) if not rule1: # rule 2 : two pullis adjacent to each other rule2 = len(char) >= 2 and (char[-1] == utf8.pulli_symbols[0]) and (char[-2] == char[-1] ) if not rule2: # rule 3 : none of the accent symbols repeat # exclusions to rule 3 : non-standard Unicode encoding of periya kombu / siriya kombu with thunai kaal rule3 = len(char) >= 2 and (char[-1] in utf8.accent_symbols) and (char[-2] in utf8.accent_symbols) \ and not( char[-1] == u"ா" and char[-2] in [u"ெ",u"ே"]) if rule1 or rule2 or rule3: flag = False reason = BadIME.reason break prev_char = char # continue loop #print([flag,reason]) return flag,reason