Source code for tamil.wordutils

## This Python file uses the following encoding: utf-8
##
## (C) 2015 Muthiah Annamalai <[email protected]>
from __future__ import print_function, division
import copy
import collections
from . import utf8

def combinations(symbols_in):
    """Yield every combination (subset) of the distinct symbols of the input.

    Args:
        symbols_in: a list of symbols, or any other value, which is first
            split into letters with ``utf8.get_letters``.

    Yields:
        One string per subset of the distinct symbols, i.e. ``2**N`` strings
        for ``N`` distinct symbols, starting with the empty string.  Inside
        each string the symbols keep their first-occurrence order.
    """
    if isinstance(symbols_in, list):
        symbols = symbols_in
    else:
        symbols = utf8.get_letters(symbols_in)

    # De-duplicate while keeping first-occurrence order so that the output is
    # reproducible from run to run (a bare set() has no stable ordering).
    seen = set()
    uniq_symbols = []
    for symbol in symbols:
        if symbol not in seen:
            seen.add(symbol)
            uniq_symbols.append(symbol)

    N = len(uniq_symbols)
    for count in range(2 ** N):
        # The zero-padded binary form of ``count`` acts as a selection mask:
        # character i == '1' means uniq_symbols[i] is part of this subset.
        bin_rep = bin(count)[2:].zfill(N)
        yield u''.join(
            symbol for flag, symbol in zip(bin_rep, uniq_symbols) if flag == '1'
        )
def default_true(*args):
    """Accept-everything predicate: ignore all arguments and return True."""
    return True
def permutations(symbols,predicate=default_true,prefix=u""):
    """Yield the permutations of ``symbols`` as concatenated strings.

    Args:
        symbols: list of symbols; any non-list value is split into letters
            with ``utf8.get_letters`` instead of being rejected.
        predicate: called with ``prefix`` + the symbol about to be placed;
            a falsy result skips every permutation starting with that string.
        prefix: the string built so far by the enclosing recursive calls; it
            is only passed to ``predicate`` and is not part of what is yielded.

    Yields:
        Each permutation of the symbols, joined into one string.  A
        one-symbol input yields that symbol without consulting ``predicate``;
        an empty input yields nothing.
    """
    if not isinstance(symbols, list):
        symbols = utf8.get_letters(symbols)

    # Base case: a single symbol is its own only permutation.
    if len(symbols) == 1:
        yield symbols[0]

    for position, head in enumerate(symbols):
        candidate_prefix = prefix + head
        if not predicate(candidate_prefix):
            continue
        # Permute the remaining symbols, i.e. everything but ``head``.
        remaining = copy.copy(symbols)
        remaining.pop(position)
        for tail in permutations(remaining, predicate, candidate_prefix):
            yield head + tail
def tamil_permutations(inword):
    """Yield the permutations of a word, letter by letter.

    Args:
        inword: a list of letters, or any other value, which is split into
            letters with ``utf8.get_letters``.

    Yields:
        Every string produced by ``permutations`` for those letters.
    """
    letters = inword if isinstance(inword, list) else utf8.get_letters(inword)
    for arrangement in permutations(letters):
        yield arrangement
def is_palindrome(*args):
    """Alias for ``palindrome``: forward all arguments and return its result."""
    verdict = palindrome(*args)
    return verdict
def palindrome(symbols_in):
    """Return True when the symbols read the same forwards and backwards.

    Args:
        symbols_in: a list of symbols, or any other value, which is split
            into letters with ``utf8.get_letters``.

    Returns:
        True if every symbol in the first half equals its mirror in the
        second half (so empty and one-symbol inputs are palindromes),
        otherwise False.
    """
    if isinstance(symbols_in, list):
        symbols = symbols_in
    else:
        symbols = utf8.get_letters(symbols_in)

    # Pair the first half with the sequence read from the end; zip() stops
    # once the first half is exhausted, so the middle symbol is never checked.
    half = len(symbols) // 2
    mirrored_pairs = zip(symbols[:half], reversed(symbols))
    return not any(front != back for front, back in mirrored_pairs)
def all_plaindromes(dictionary):
    """Yield every word of ``dictionary`` that is a palindrome.

    Args:
        dictionary: object providing callable ``isWord`` and ``getAllWords``
            attributes.

    Yields:
        Each word returned by ``dictionary.getAllWords()`` for which
        ``is_palindrome`` is true.

    Raises:
        Exception: if either required method is missing.  Because this is a
            generator, the check runs when the first item is requested.
    """
    # (method name, error message raised when that method is not callable)
    required_methods = (
        ('isWord', "@dictionary என்ற உள்ளீட்டில் isWord என்ற செயல்பாட்டு கூறு கிடையாது. இது விதிவிலக்கான நிலை"),
        ('getAllWords', "@dictionary என்ற உள்ளீட்டில் getAllWords என்ற செயல்பாட்டு கூறு கிடையாது. இது விதிவிலக்கான நிலை"),
    )
    for method_name, complaint in required_methods:
        if not callable(getattr(dictionary, method_name, [])):
            raise Exception(complaint)

    for candidate in dictionary.getAllWords():
        if not is_palindrome(candidate):
            continue
        yield candidate
def anagrams(word,dictionary,permutations=tamil_permutations):
    """Yield the permutations of ``word`` that ``dictionary`` accepts.

    Args:
        word: the input handed unchanged to ``permutations``.
        dictionary: object providing a callable ``isWord`` attribute.
        permutations: callable producing the candidate arrangements of
            ``word``; defaults to ``tamil_permutations``.

    Yields:
        Each candidate for which ``dictionary.isWord`` is true, in the order
        the candidates are produced.

    Raises:
        Exception: if ``dictionary`` has no callable ``isWord``.  Because
            this is a generator, the check runs when the first item is
            requested.
    """
    word_check = getattr(dictionary, 'isWord', [])
    if not callable(word_check):
        raise Exception("@dictionary என்ற உள்ளீட்டில் isWord என்ற செயல்பாட்டு கூறு கிடையாது. இது விதிவிலக்கான நிலை")

    for candidate in permutations(word):
        if not dictionary.isWord(candidate):
            continue
        yield candidate
def is_anagram(wordA,wordB):
    """Return True when the two words are made of the same letters.

    Text strings are split into letters with ``utf8.get_letters`` before
    comparing, as the other functions of this module do.  Comparing the raw
    code points instead would treat two words as anagrams whenever they share
    the same code points, even if those code points group into different
    letters.

    Args:
        wordA, wordB: text strings, or sequences of letters that are already
            split (lists, tuples, ...), which are compared as given.

    Returns:
        True if both inputs contain the same letters with the same counts.
    """
    text_type = type(u"")  # ``unicode`` on Python 2, ``str`` on Python 3

    def _letters(word):
        # NOTE(review): assumes utf8.get_letters returns one entry per letter
        # for any text, Tamil or not - confirm against tamil.utf8.
        if isinstance(word, text_type):
            return utf8.get_letters(word)
        return word

    return sorted(_letters(wordA)) == sorted(_letters(wordB))
def anagrams_in_dictionary(dictionary):
    """Find all groups of mutually anagrammatic words in ``dictionary``.

    Args:
        dictionary: object providing callable ``isWord`` and
            ``getAllWordsIterable`` attributes.

    Returns:
        A tuple ``(counts, groups)``:

        * ``groups`` maps a key (the word's letters, sorted and joined into
          one string) to the list of dictionary words sharing that key; only
          keys shared by two or more words are kept.
        * ``counts`` is a list of ``(key, number_of_words)`` tuples for the
          same keys, sorted by ascending number of words.

    Raises:
        Exception: if ``dictionary`` lacks one of the required methods.
    """
    has_required_methods = all([
        callable(getattr(dictionary, 'isWord', [])),
        callable(getattr(dictionary, 'getAllWordsIterable', [])),
    ])
    if not has_required_methods:
        raise Exception("dictionary object has insufficient methods")

    from operator import itemgetter

    # Bucket every word under the key built from its sorted letters.
    buckets = dict()
    for in_word in dictionary.getAllWordsIterable():
        sword = u''.join(sorted(utf8.get_letters(in_word)))
        buckets.setdefault(sword, []).append(in_word)

    # A key with a single word has no anagram partner.  Build the filtered
    # result in a second pass rather than deleting from a dict while walking
    # a lazy view of it.
    anagrams = dict(
        (sword, equivs) for sword, equivs in buckets.items() if len(equivs) > 1
    )
    rval_anagram_count = sorted(
        ((sword, len(equivs)) for sword, equivs in anagrams.items()),
        key=itemgetter(1),
    )
    return rval_anagram_count, anagrams
# combinations filtered by dictionary - yields all possible sub-words of a word. # e.g. 'bat' -> 'tab', 'bat', 'at', etc.
def combinagrams(word,dictionary,limit=float("inf")):
    """Yield dictionary words that can be formed from subsets of ``word``.

    Every combination of the word's symbols (from ``combinations``) is run
    through ``anagrams`` with ``tamil_permutations``, and the accepted words
    are yielded in that order.

    Args:
        word: the input handed to ``combinations``.
        dictionary: dictionary object handed to ``anagrams``.
        limit: maximum number of words to yield; unlimited by default.

    Yields:
        At most ``limit`` valid words.
    """
    valid_words = (
        hit
        for word_part in combinations(word)
        for hit in anagrams(word_part, dictionary, tamil_permutations)
    )
    for produced, hit in enumerate(valid_words, 1):
        if produced > limit:
            return
        yield hit
# permutations of a word, split into dictionary words - yields the possible word splits of each permutation.
# e.g. 'bullpen' -> 'pen' 'bull', 'ben' 'pull', 'pub' 'nell', 'nell' 'pub', etc.
def permutagrams(word,dictionary):
    """Yield the dictionary word-splits of every distinct permutation of ``word``.

    Args:
        word: the input handed to ``permutations``.
        dictionary: dictionary object handed to ``word_split``.

    Yields:
        For each distinct permutation with at least one split, the non-empty
        list of splits returned by ``word_split`` for it.
    """
    already_tried = set()
    for arrangement in permutations(word):
        # Repeated symbols produce the same arrangement more than once.
        if arrangement in already_tried:
            continue
        already_tried.add(arrangement)
        splits = word_split(arrangement, dictionary)
        if splits:
            yield splits
def rhymes_with(inword,reverse_dictionary):
    """Collect words that share an ending with ``inword``.

    Starting with the whole word and dropping one leading letter at a time,
    the remaining suffix is looked up with
    ``reverse_dictionary.getWordsEndingWith``.  The search stops once at
    least ``2 * len(letters)`` matches are collected or no letters remain.

    Args:
        inword: a list of letters, or any other value, which is split into
            letters with ``utf8.get_letters``.  A list argument is left
            unmodified.
        reverse_dictionary: object providing callable ``isWord`` and
            ``getWordsEndingWith`` attributes.

    Returns:
        A set built from the first ``2 * len(letters)`` collected matches.

    Raises:
        Exception: if ``reverse_dictionary`` lacks a required method.
    """
    required_methods = ('isWord', 'getWordsEndingWith')
    if not all([callable(getattr(reverse_dictionary, name, [])) for name in required_methods]):
        raise Exception("reverse dictionary object has insufficient methods")

    if isinstance(inword, list):
        letters = inword
    else:
        letters = utf8.get_letters(inword)

    max_rhymes = len(letters) * 2
    rhyming = list()
    start = 0
    # Advance a start index instead of deleting letters, so that a list
    # passed in by the caller is not consumed.
    while len(rhyming) < max_rhymes and start < len(letters):
        partial_word = u"".join(letters[start:])
        rhyming.extend(reverse_dictionary.getWordsEndingWith(partial_word))
        start += 1

    # The slice end is exclusive, so cap at max_rhymes itself; using
    # len(rhyming) - 1 would silently discard the last collected match.
    return set(rhyming[:max_rhymes])
def greedy_split(inword,dictionary):
    """Split a word into consecutive dictionary words, greedily.

    Starting at the first letter, the candidate is extended one letter at a
    time for as long as the dictionary reports words starting with it; the
    last candidate that was itself a dictionary word is taken as the next
    piece, and the scan restarts right after it.

    Args:
        inword: a list of letters, or any other value, which is split into
            letters with ``utf8.get_letters``.
        dictionary: object providing callable ``isWord`` and
            ``hasWordsStartingWith`` attributes.

    Returns:
        The list of pieces when the whole input could be consumed, otherwise
        an empty list (this includes empty input).

    Raises:
        Exception: if ``dictionary`` lacks one of the required methods.
    """
    if not all ([callable( getattr(dictionary,'isWord',[])),callable( getattr(dictionary,'hasWordsStartingWith',[]))]):
        raise Exception("dictionary object has insufficient methods")
    if isinstance(inword,list):
        letters = inword
    else:
        letters = utf8.get_letters(inword)
    solution = list()
    longest_idx = 0  # index just past the longest dictionary word found so far
    prev_idx = 0     # index where the piece currently being searched starts
    idx = 0
    possible = True  # becomes False as soon as no complete split can exist
    while possible:
        idx = prev_idx
        prev_word = u""  # longest dictionary word found from prev_idx, if any
        while idx < len(letters):
            word = u"".join(letters[prev_idx:idx+1])
            if dictionary.hasWordsStartingWith(word):
                if dictionary.isWord(word):
                    # Remember this match but keep extending: a longer
                    # dictionary word may still follow.
                    prev_word = word
                    longest_idx = idx+1
                elif word == inword:
                    # The entire input is only a prefix of longer words.
                    # NOTE(review): ``word`` is a string, so this can only be
                    # true when ``inword`` was passed as a string, not a list.
                    possible = False
                    break
            else:
                # No dictionary word starts with this candidate; without an
                # earlier match at this position the split cannot succeed.
                if len(prev_word) == 0:
                    possible = False
                break
            idx = idx + 1
        # Continue right after the piece that was accepted.
        prev_idx = longest_idx
        solution.append( prev_word )
        do_brk = len(prev_word) == 0  # nothing matched from this position
        if (prev_idx) >= len(letters) or do_brk:
            # Finished: success only when the last scan produced a word.
            possible = not do_brk
            break
    if possible:
        return solution
    return list()
def word_split(inword,dictionary):
    """Return the distinct ways found to split a word into dictionary words.

    For every cut position the word is divided into a head and a tail.  The
    greedy splits of both parts (from ``greedy_split``) are joined when both
    exist, and every pairing of the recursive splits of head and tail is
    added as well.  Duplicate solutions are kept out.

    Args:
        inword: a list of letters, or any other value, which is split into
            letters with ``utf8.get_letters``.
        dictionary: object providing a callable ``isWord`` attribute (it is
            also handed to ``greedy_split``).

    Returns:
        A list of solutions, each a list of words; empty when the input has
        fewer than two letters or no split was found.

    Raises:
        Exception: if ``dictionary`` has no callable ``isWord``.
    """
    if not callable(getattr(dictionary, 'isWord', [])):
        raise Exception("dictionary object has insufficient methods")

    letters = inword if isinstance(inword, list) else utf8.get_letters(inword)

    solutions = []
    for cut in range(1, len(letters)):
        head = u"".join(letters[:cut])
        tail = u"".join(letters[cut:])

        # Greedy split of each side, joined when both sides succeed.
        head_greedy = greedy_split(head, dictionary)
        if head_greedy:
            tail_greedy = greedy_split(tail, dictionary)
            if tail_greedy:
                joined = head_greedy + tail_greedy
                if joined not in solutions:
                    solutions.append(joined)

        # Cross product of the recursively computed splits of both sides.
        head_splits = word_split(head, dictionary)
        tail_splits = word_split(tail, dictionary)
        for left in head_splits:
            for right in tail_splits:
                merged = left + right
                if merged not in solutions:
                    solutions.append(merged)
    return solutions
# Dummy dictionary interface for use with anagrams(): a record whose single
# field, isWord, holds the word-checking predicate.
DictionaryWithPredicate = collections.namedtuple(
    'DictionaryWithPredicate', ('isWord',)
)

# Utility class
class DictionaryFixedWordList(object):
    """Dictionary backed by a fixed collection of words.

    Provides the ``isWord`` and ``hasWordsStartingWith`` methods that the
    word utilities in this module check for on dictionary objects.
    """

    def __init__(self,wlist):
        """Store ``wlist``, the collection of words, as ``self.wlist``."""
        self.wlist = wlist
        object.__init__(self)

    def isWord(self,word):
        """Return True if ``word`` is contained in the word list."""
        return word in self.wlist

    def hasWordsStartingWith(self,pfx):
        """Return True if at least one word in the list starts with ``pfx``."""
        # A generator expression lets any() stop at the first match instead
        # of testing every word in the list first.
        return any(w.startswith(pfx) for w in self.wlist)