Source code for transliterate.algorithm

## -*- coding: utf-8 -*-
# (C) 2013 Muthiah Annamalai
#
# Implementation of transliteration algorithm flavors
# and later used in TamilKaruvi (2007) by your's truly.
#
import tamil

[docs]def reverse_transliteration_table(table_in): table_out = {} keys = table_in.keys() keys = sorted(keys) keys.reverse() for k in keys: v = table_in[k] table_out[v] = k return table_out
[docs]class Tamil2English:
[docs] @staticmethod def transliterate(table,tamil_str): letters = tamil.utf8.get_letters(tamil_str) ta2en_map = reverse_transliteration_table(table) eng_transliterated = u"".join([ta2en_map.get(tl,tl) for tl in letters]) return eng_transliterated
# BlindIterative Algorithm from TamilKaruvi - less than optimal -
[docs]class BlindIterative:
[docs] @staticmethod def transliterate(table,english_str): """ @table - has to be one of Jaffna or Azhagi etc. @english_str - """ # # -details- # # While text remains: # Lookup the English part in anywhere in the string position # If present: # Lookup the Corresponding Tamil Part & Append it to the string. # Else: # Continue # out_str = english_str # for consistent results we need to work on sorted keys eng_parts = list(table.keys()) eng_parts.sort() eng_parts.reverse() for eng_part in eng_parts: tamil_equiv = table[eng_part] parts = out_str.split( eng_part ) out_str = tamil_equiv.join( parts ) return out_str
# Basically english_str can be transliterated to many possible Tamil words # First condition is all english letters need to be used. # Secondly all generated Tamil words from the englist_str will have to be # scored by their bigram score. # Return the highest scoring string
[docs]class Greedy: def __init__(self,table,lexicon=None): self.table = table self.options = [] self.scores = [0.0] self.lexicon = lexicon self.full_search = False
[docs] def score(self): max_idx = 0 for idx,op in enumerate(self.options): prev = '' n_ending_uyir = 0 n_consequetive_mei = 0 for letter in tamil.utf8.get_letters_iterable(op): if (letter in tamil.utf8.mei_letters) and (prev in tamil.utf8.mei_letters ): n_consequetive_mei += 1 prev = letter if prev in tamil.utf8.uyir_letters: n_ending_uyir = 1 w_score = len(op) - 2*n_consequetive_mei - 2*n_ending_uyir if w_score > self.scores[max_idx]: max_idx = idx self.scores.append( w_score ) return max_idx
#check if level=0 and letter is mei, then return False #all other cases return True
[docs] def skip_mei(self,level,letter): if level > 0: return True return not( letter in tamil.utf8.mei_letters)
[docs] def generate(self,english_str,partial='',level=0): if len(english_str) == 0: self.options.append(partial) return if level >= 1 and len(partial) == 0: return for itr,s in enumerate(english_str): curr = s if itr < len(english_str)-1: nxt = english_str[itr+1] else: nxt = '' w1 = self.table.get(curr,None) if w1: self.skip_mei(level,w1) and self.generate(english_str[itr+1:],partial+w1,level+1) w2 = self.table.get(curr.upper()+nxt,None) if w2: self.skip_mei(level,w2) and self.generate(english_str[itr+2:],partial+w2,level+1) #w2 = self.table.get(prev+curr,None) #if w2: self.generate(english_str[itr+1:],partial+w2) w3 = self.table.get(curr+nxt,None) if w3: self.skip_mei(level,w3) and self.generate(english_str[itr+2:],partial+w3,level+1) #w4 = self.table.get(curr.upper()+nxt.upper(),None) #if w4: self.skip_mei(level,w4) and self.generate(english_str[itr+2:],partial+w4,level+1) #w4 = self.table.get(prev+curr+nxt,None) #if w4: self.generate(english_str[itr+2:],partial+w4) prev = curr if ( not self.full_search ): break return
[docs] def pick_dictionary_words(self): if not self.lexicon: return self.options = list(filter(self.lexicon.isWord,self.options))
[docs] def run(self,english_str): self.generate(english_str) self.pick_dictionary_words() idx = self.score() if len(self.options) < 1: return english_str best = self.options[idx] self.options = set(self.options) #for w in self.options: # print(w) #print(u'Total choices => ',len(self.options)) #print(u'Best => %s'%best) return best
[docs] @staticmethod def transliterate(table,english_str,lexicon=None): g = Greedy(table,lexicon) return g.run(english_str),g
# Azhagi has a many-to-one mapping - using a Tamil language model and Baye's conditional probabilities # to break the tie when two or more Tamil letters have the same English syllable. Currently # this predictive transliterator is not available in this package. Also such an algorithm could be # used with any table. # # challenge use a probabilistic model on Tamil language to score the next letter, # instead of using the longest/earliest match # http://www.mazhalaigal.com/tamil/learn/keys.php
[docs]class Predictive:
[docs] @staticmethod def transliterate(table,english_str): raise Exception("Not Implemented!") pass
# Sequential Iterative Algorithm modified from TamilKaruvi
[docs]class Iterative:
[docs] @staticmethod def transliterate(table,english_str): """ @table - has to be one of Jaffna or Azhagi etc. @english_str - """ # # -details- # # While text remains: # Lookup the First-Matching-Longest-English part # If present: # Lookup the Corresponding Tamil Part & Append it. # Else: # Continue # out_str = english_str # for consistent results we need to work on sorted keys eng_parts = list(table.keys()) eng_parts.sort() eng_parts.reverse() MAX_ITERS_BEFORE_YOU_DROP_LETTER = max(list(map(len,eng_parts))) remaining = len(english_str) out_str = '' pos = 0 while pos < remaining: # try to find the longest prefix in input from L->R in greedy fashion iters = MAX_ITERS_BEFORE_YOU_DROP_LETTER success = False while iters >= 0: curr_prefix = english_str[pos:min(pos+iters-1,remaining)] curr_prefix_lower = None if ( len(curr_prefix) >= 2 ): curr_prefix_lower = curr_prefix[0].lower() + curr_prefix[1:] ## print curr_prefix iters = iters - 1 if ( curr_prefix in eng_parts ): out_str = out_str + table[curr_prefix] pos = pos + len( curr_prefix) success = True break elif ( curr_prefix_lower in eng_parts ): out_str = out_str + table[curr_prefix_lower] pos = pos + len( curr_prefix_lower ) success = True break # replacement was a success if ( success ): continue # too-bad we didn't find a replacement - just copy char to output ## print "concatennate the unmatched =>",english_str[pos],"<=" if ord(english_str[pos]) < 128: rep_char = english_str[pos] else: rep_char = "?" out_str = out_str + rep_char pos = pos + 1 return out_str