Source code for solthiruthi.suggestions

## -*- coding: utf-8 -*-
# (C) 2015-2016 Muthiah Annamalai
#  <[email protected]>
# 
# This module provides suggestion generators that return lists of alternative
# spellings for downstream use by the Tamil spelling modules/programs

# Ref: http://norvig.com/spell-correct.html
from tamil.utf8 import tamil_letters, get_letters, mei_letters, agaram_letters, sanskrit_letters, \
sanskrit_mei_letters, uyir_letters

from pprint import pprint
def norvig_suggestor(word, alphabets=None, nedits=1, limit=float("inf")):
    """Return spelling alternatives within ``nedits`` letter-level edits of ``word``.

    This follows the candidate generation of Norvig's spelling corrector
    (deletes, transposes, replaces and inserts), but every edit operates on
    whole letters rather than on Unicode code points, so a multi-code-point
    letter is never split into dangling pieces.

    Args:
        word: the word to edit; either a string (tokenized with
            ``get_letters``) or an already tokenized list of letters.
        alphabets: iterable of letters used for replace/insert edits;
            defaults to ``tamil_letters`` when empty or ``None``.
        nedits: number of successive edits to apply; values below 2 mean a
            single edit. For ``nedits > 1`` only the candidates of the last
            round are returned, each earlier round serving as seed words.
        limit: maximum number of candidates to return (also applied to the
            seed words of each intermediate round); unlimited by default.

    Returns:
        A set of candidate strings containing at most ``limit`` entries.
    """
    if not alphabets:
        alphabets = tamil_letters
    letters = word if isinstance(word, list) else get_letters(word)
    return set(_bounded_edits(letters, alphabets, nedits, limit))


def _single_edits(letters, alphabets):
    """Yield each string exactly one letter-level edit away from ``letters`` (may repeat)."""
    join = u"".join
    for idx in range(len(letters) + 1):
        # Proper split: prefix is letters[:idx]; nothing is dropped in between.
        head, tail = letters[:idx], letters[idx:]
        prefix = join(head)
        if tail:
            rest = join(tail[1:])
            yield prefix + rest  # delete the letter at idx
            if len(tail) > 1:
                # swap the two letters starting at idx
                yield prefix + tail[1] + tail[0] + join(tail[2:])
            for letter in alphabets:
                yield prefix + letter + rest  # replace the letter at idx
        suffix = join(tail)
        for letter in alphabets:
            yield prefix + letter + suffix  # insert a letter before idx


def _bounded_edits(letters, alphabets, nedits, limit):
    """Return an ordered, duplicate-free list of at most ``limit`` candidates."""
    if limit <= 0:
        return []
    if nedits > 1:
        # Edit distance > 1: expand every candidate of the previous round once
        # more. Seeds are re-tokenized because they are plain strings.
        seeds = _bounded_edits(letters, alphabets, nedits - 1, limit)
        stream = (
            candidate
            for seed in seeds
            for candidate in _single_edits(get_letters(seed), alphabets)
        )
    else:
        stream = _single_edits(letters, alphabets)

    found = []
    seen = set()
    for candidate in stream:
        if candidate in seen:
            continue
        seen.add(candidate)
        found.append(candidate)
        if len(found) >= limit:
            break
    return found
def mayangoli_suggestor():
    """Placeholder for a suggestor based on confusable ("mayakkam") letters.

    Not implemented yet: takes no arguments and returns None.

    Rules noted for the planned implementation:
      * ண, ன are confused with each other.
      * ல, ழ, ள are confused with each other.
      * ர, ற are confused with each other.
    The same confusions can be seen in the uyirmei (consonant-vowel)
    series of these letters as well.
    """
    return None
def kombu_suggestor():
    """Placeholder suggestor; not implemented yet.

    Takes no arguments and returns None.

    NOTE(review): going by the name, this is presumably meant to propose
    fixes for "kombu" vowel-sign errors -- confirm the intent before
    implementing.
    """
    return None