Source code for tamil.wordutils

## This Python file uses the following encoding: utf-8
##
## (C) 2015 Muthiah Annamalai <[email protected]>
from __future__ import print_function, division
import copy
import collections
from . import utf8

def combinations(symbols_in):
    """Yield every combination (subset) of the distinct symbols in the input.

    Each combination is yielded as a single string made by joining the chosen
    symbols. For N distinct symbols, 2**N strings are produced, starting with
    the empty string and ending with all distinct symbols joined together.

    :param symbols_in: a list of symbols, or any other object, which is
        converted to a list of letters with ``utf8.get_letters``.
    :return: generator of unicode strings.
    """
    if isinstance(symbols_in, list):
        symbols = symbols_in
    else:
        symbols = utf8.get_letters(symbols_in)

    # De-duplicate while keeping first-seen order.  A plain ``set`` would make
    # the order of the output depend on string hashing, which varies between
    # interpreter runs.
    seen = set()
    uniq_symbols = []
    for symbol in symbols:
        if symbol not in seen:
            seen.add(symbol)
            uniq_symbols.append(symbol)

    N = len(uniq_symbols)
    for count in range(2 ** N):
        # Read ``count`` as an N-bit mask whose most significant bit selects
        # the first distinct symbol.
        yield u''.join(
            symbol
            for position, symbol in enumerate(uniq_symbols)
            if (count >> (N - 1 - position)) & 1
        )

def default_true(*args):
    """Accept any positional arguments and always return True.

    Used as the default ``predicate`` of ``permutations`` so that no prefix
    is rejected.
    """
    return True

def permutations(symbols,predicate=default_true,prefix=u""):
    """Yield the permutations of ``symbols`` as joined strings.

    :param symbols: a list of symbols; any other object is first converted
        with ``utf8.get_letters``.
    :param predicate: callable invoked with the string built so far
        (``prefix`` plus the symbols chosen up to this point). Whenever it
        returns a false value, that branch is abandoned. It is consulted for
        every prefix, including the complete permutation.
    :param prefix: string already placed in front of ``symbols`` by an outer
        recursive call; it is passed to ``predicate`` but is not part of the
        yielded values.
    :return: generator of strings. Repeated symbols give repeated results,
        and an empty ``symbols`` yields nothing.
    """
    if not isinstance(symbols, list):
        # A string (or other non-list) is split into letters first.
        symbols = utf8.get_letters(symbols)

    if len(symbols) == 1:
        # Base case: the last symbol completes the word.  The predicate must
        # be applied here too, otherwise it could never reject a full word.
        if predicate(prefix + symbols[0]):
            yield symbols[0]
        return

    for idx, symbol in enumerate(symbols):
        new_pfx = prefix + symbol
        if not predicate(new_pfx):
            continue
        # Everything except the symbol at position idx.
        remaining = symbols[:idx] + symbols[idx + 1:]
        for tail in permutations(remaining, predicate, new_pfx):
            yield symbol + tail

def tamil_permutations(inword):
    """Yield every permutation of the letters of ``inword``.

    :param inword: a list of letters, or any other object, which is converted
        to a list of letters with ``utf8.get_letters``.
    :return: generator of the strings produced by ``permutations``.
    """
    letters = inword if isinstance(inword, list) else utf8.get_letters(inword)
    for permuted in permutations(letters):
        yield permuted

def is_palindrome(*args):
    """Alias of ``palindrome``: forward all arguments and return its result."""
    result = palindrome(*args)
    return result
    
def palindrome(symbols_in):
    """Return True when the symbol sequence reads the same in both directions.

    :param symbols_in: a list of symbols, or any other object, which is
        converted to a list of letters with ``utf8.get_letters``.
    :return: True for a palindrome (including empty and single-symbol input),
        False otherwise.
    """
    symbols = symbols_in if isinstance(symbols_in, list) else utf8.get_letters(symbols_in)
    last = len(symbols) - 1
    # Compare each symbol in the first half with its mirror in the second.
    mismatch = any(
        symbols[pos] != symbols[last - pos]
        for pos in range(len(symbols) // 2)
    )
    return not mismatch

def all_plaindromes(dictionary):
    """Yield every word of ``dictionary`` that is a palindrome.

    :param dictionary: object providing callable ``isWord`` and
        ``getAllWords`` attributes.
    :raises Exception: when either attribute is missing or not callable.
        Because this is a generator, the error surfaces on first iteration.
    """
    required = (
        ('isWord', "@dictionary என்ற உள்ளீட்டில் isWord என்ற செயல்பாட்டு கூறு கிடையாது. இது விதிவிலக்கான நிலை"),
        ('getAllWords', "@dictionary என்ற உள்ளீட்டில் getAllWords என்ற செயல்பாட்டு கூறு கிடையாது. இது விதிவிலக்கான நிலை"),
    )
    for method_name, message in required:
        if not callable(getattr(dictionary, method_name, [])):
            raise Exception(message)

    for candidate in dictionary.getAllWords():
        if is_palindrome(candidate):
            yield candidate

def anagrams(word,dictionary,permutations=tamil_permutations):
    """Yield the permutations of ``word`` that ``dictionary`` accepts as words.

    :param word: input handed unchanged to ``permutations``.
    :param dictionary: object providing a callable ``isWord`` attribute.
    :param permutations: callable returning an iterable of candidate strings
        for ``word``; defaults to ``tamil_permutations``.
    :raises Exception: when ``dictionary.isWord`` is missing or not callable.
        Because this is a generator, the error surfaces on first iteration.
    """
    is_word = getattr(dictionary, 'isWord', [])
    if not callable(is_word):
        raise Exception("@dictionary என்ற உள்ளீட்டில் isWord என்ற செயல்பாட்டு கூறு கிடையாது. இது விதிவிலக்கான நிலை")

    for candidate in permutations(word):
        if not dictionary.isWord(candidate):
            continue
        yield candidate

def is_anagram(wordA,wordB):
    """Return True when ``wordA`` and ``wordB`` consist of the same letters.

    String arguments are first split with ``utf8.get_letters``, as the other
    functions of this module do, so the comparison is made letter by letter
    rather than code point by code point. Comparing raw code points could
    report two words as anagrams even though their letters differ, because a
    letter may span several code points.
    NOTE(review): assumes ``utf8.get_letters`` returns a list of letters for
    any string - confirm against the ``utf8`` module.

    :param wordA: string, or an iterable of letters (used as given).
    :param wordB: string, or an iterable of letters (used as given).
    :return: True if both arguments contain the same letters with the same
        multiplicities, False otherwise.
    """
    string_types = (str, type(u""))

    def as_letters(word):
        # Only strings need splitting; lists and other iterables are assumed
        # to already hold one letter per element.
        if isinstance(word, string_types):
            return utf8.get_letters(word)
        return word

    return sorted(as_letters(wordA)) == sorted(as_letters(wordB))

def anagrams_in_dictionary(dictionary):
    """Group the words of ``dictionary`` into sets of mutual anagrams.

    Every word is split with ``utf8.get_letters``; its letters are sorted and
    joined to form a key, and words sharing a key are anagrams of each other.
    Keys that collect only one word are discarded.

    :param dictionary: object providing callable ``isWord`` and
        ``getAllWordsIterable`` attributes.
    :return: tuple ``(counts, anagrams)``. ``counts`` is a list of
        ``(key, number_of_words)`` pairs sorted by ascending number of words,
        and ``anagrams`` is a dict mapping each key to the list of words
        sharing it.
    :raises Exception: when a required attribute is missing or not callable.
    """
    from operator import itemgetter

    if not all([callable(getattr(dictionary, 'isWord', [])),
                callable(getattr(dictionary, 'getAllWordsIterable', []))]):
        raise Exception("dictionary object has insufficient methods")

    anagrams = dict()
    for in_word in dictionary.getAllWordsIterable():
        letters = utf8.get_letters(in_word)
        sword = u''.join(sorted(letters))
        anagrams.setdefault(sword, []).append(in_word)

    # Collect the keys to drop in a real list first: on Python 3 ``filter``
    # and ``dict.items()`` are lazy, and deleting from a dict while it is
    # still being iterated raises RuntimeError.
    singletons = [key for key, words in anagrams.items() if len(words) == 1]
    for key in singletons:
        del anagrams[key]

    counts = [(key, len(words)) for key, words in anagrams.items()]
    rval_anagram_count = sorted(counts, key=itemgetter(1))
    return rval_anagram_count, anagrams

# combinations filtered by dictionary - yields all possible sub-words of a word.
# e.g. 'bat' -> 'tab', 'bat', 'at', etc.
def combinagrams(word,dictionary,limit=float("inf")):
    """Yield dictionary words that can be formed from subsets of ``word``.

    Every combination produced by ``combinations(word)`` is passed to
    ``anagrams`` (using ``tamil_permutations``), and each dictionary hit is
    yielded.

    :param word: input handed to ``combinations``.
    :param dictionary: dictionary object handed to ``anagrams``.
    :param limit: maximum number of results to yield; unlimited by default.
    """
    emitted = 0
    for subset in combinations(word):
        for match in anagrams(subset, dictionary, tamil_permutations):
            emitted += 1
            if emitted > limit:
                # Quota exhausted: stop the whole generator.
                return
            yield match

# permutations of a word filtered by dictionary - yields all possible sub-words of a word.
# e.g. 'bullpen' -> 'pen' 'bull', 'ben' 'pull', 'pub' 'nell', 'nell' 'pub' .etc.    
def permutagrams(word,dictionary):
    """Yield the dictionary word-splits of each distinct permutation of ``word``.

    Each permutation from ``permutations(word)`` is processed once, even if
    it is produced several times. It is passed to ``word_split``, and a
    non-empty result (a list of splits) is yielded as a whole.

    :param word: input handed to ``permutations``.
    :param dictionary: dictionary object handed to ``word_split``.
    """
    already_tried = set()
    for candidate in permutations(word):
        if candidate in already_tried:
            continue
        already_tried.add(candidate)
        splits = word_split(candidate, dictionary)
        if len(splits) == 0:
            continue
        yield splits

def rhymes_with(inword,reverse_dictionary):
    """Return a set of dictionary words sharing an ending with ``inword``.

    Starting with the whole word and dropping one leading letter at a time,
    each suffix is looked up with ``reverse_dictionary.getWordsEndingWith``.
    Lookups stop once ``2 * len(letters)`` matches have been collected or the
    suffixes are exhausted, and at most that many collected matches are used.

    :param inword: a list of letters (left unmodified), or any other object,
        which is converted with ``utf8.get_letters``.
    :param reverse_dictionary: object providing callable ``isWord`` and
        ``getWordsEndingWith`` attributes.
    :return: set of matching words; empty when nothing matches.
    :raises Exception: when a required attribute is missing or not callable.
    """
    if not all([callable(getattr(reverse_dictionary, 'isWord', [])),
                callable(getattr(reverse_dictionary, 'getWordsEndingWith', []))]):
        raise Exception("reverse dictionary object has insufficient methods")

    if isinstance(inword, list):
        letters = inword
    else:
        letters = utf8.get_letters(inword)

    max_matches = len(letters) * 2
    rhyming = list()
    # Walk the suffixes by index rather than deleting from ``letters``, so a
    # list passed in by the caller is not emptied as a side effect.
    for start in range(len(letters)):
        if len(rhyming) >= max_matches:
            break
        partial_word = u"".join(letters[start:])
        rhyming.extend(reverse_dictionary.getWordsEndingWith(partial_word))

    # A plain slice already copes with short lists; subtracting one from the
    # length would silently drop the last (possibly the only) match.
    return set(rhyming[:max_matches])

def greedy_split(inword,dictionary):
    """Split ``inword`` into dictionary words, always taking the longest word.

    Starting at the first letter, the longest run of letters that
    ``dictionary.isWord`` accepts is taken as the next word, and the search
    restarts right after it. The split succeeds only if the whole input is
    consumed this way.

    :param inword: a list of letters, or any other object, which is converted
        with ``utf8.get_letters``.
    :param dictionary: object providing callable ``isWord`` and
        ``hasWordsStartingWith`` attributes.
    :return: list of words covering the input in order, or an empty list when
        no such greedy split exists (including for empty input).
    :raises Exception: when a required attribute is missing or not callable.
    """
    if not all([callable(getattr(dictionary, 'isWord', [])),
                callable(getattr(dictionary, 'hasWordsStartingWith', []))]):
        raise Exception("dictionary object has insufficient methods")

    if isinstance(inword, list):
        letters = inword
    else:
        letters = utf8.get_letters(inword)

    total = len(letters)
    solution = list()
    start = 0
    while True:
        longest_word = u""
        longest_end = start
        for end in range(start + 1, total + 1):
            candidate = u"".join(letters[start:end])
            if dictionary.hasWordsStartingWith(candidate):
                if dictionary.isWord(candidate):
                    longest_word = candidate
                    longest_end = end
                # A prefix that is not itself a word is no reason to give up,
                # even when it spans the whole input: a shorter word found
                # earlier may still lead to a complete split.
            elif not longest_word:
                # Nothing in the dictionary starts with these letters and no
                # word has been found from ``start``: no split is possible.
                return list()

        if not longest_word:
            return list()

        solution.append(longest_word)
        start = longest_end
        if start >= total:
            return solution

def word_split(inword,dictionary):
    """Return the distinct ways found to split ``inword`` into dictionary words.

    For every cut position the input is divided into a head and a tail. When
    ``greedy_split`` succeeds on both halves, their concatenation is recorded.
    In addition, every pairing of the recursive ``word_split`` results of the
    head and the tail is recorded.

    :param inword: a list of letters, or any other object, which is converted
        with ``utf8.get_letters``.
    :param dictionary: object providing a callable ``isWord`` attribute (and
        whatever ``greedy_split`` requires).
    :return: list of solutions without duplicates, each a list of words.
    :raises Exception: when ``dictionary.isWord`` is missing or not callable.
    """
    if not callable(getattr(dictionary, 'isWord', [])):
        raise Exception("dictionary object has insufficient methods")

    letters = inword if isinstance(inword, list) else utf8.get_letters(inword)

    solutions = []
    for cut in range(1, len(letters)):
        head = u"".join(letters[:cut])
        tail = u"".join(letters[cut:])

        head_greedy = greedy_split(head, dictionary)
        if len(head_greedy) == 0:
            continue
        tail_greedy = greedy_split(tail, dictionary)
        if len(tail_greedy) == 0:
            continue

        direct = list(head_greedy) + list(tail_greedy)
        if direct not in solutions:
            solutions.append(direct)

        # Cross product of the recursively computed splits of both halves.
        head_splits = word_split(head, dictionary)
        tail_splits = word_split(tail, dictionary)
        for left in head_splits:
            for right in tail_splits:
                merged = list(left) + list(right)
                if merged not in solutions:
                    solutions.append(merged)

    return solutions
    
# Minimal dictionary stand-in for use with anagrams(): a named tuple whose
# single field, isWord, holds the word-checking callable.
DictionaryWithPredicate = collections.namedtuple('DictionaryWithPredicate', 'isWord')

# Utility class
class DictionaryFixedWordList(object):
    """Dictionary interface backed by a caller-supplied collection of words."""

    def __init__(self,wlist):
        # The collection is stored as given (not copied); it must support
        # ``in`` and iteration.
        self.wlist = wlist
        object.__init__(self)

    def isWord(self,word):
        """Return True when ``word`` is contained in the word list."""
        return word in self.wlist

    def hasWordsStartingWith(self,pfx):
        """Return True when at least one stored word starts with ``pfx``."""
        # A generator lets any() stop at the first match instead of testing
        # every word up front.
        return any(w.startswith(pfx) for w in self.wlist)