# Source code for spell.spell

# -*- coding: utf-8 -*-
# (C) 2016 Muthiah Annamalai
#
# This file is part of 'open-tamil' package
# It implements a data-driven spell checker for Tamil language
#
from __future__ import print_function

import argparse
import copy
import codecs
import functools
import itertools
import json
import operator
import pprint
import re
import string
import sys
import threading
import time

import tamil
from transliterate import azhagi, jaffna, combinational, algorithm
from solthiruthi.suggestions import norvig_suggestor
from solthiruthi.morphology import RemoveCaseSuffix, RemovePluralSuffix, RemovePrefix, RemoveVerbSuffixTense, CaseFilter
from solthiruthi.dictionary import DictionaryBuilder, TamilVU, EnglishLinux
from solthiruthi.heuristics import BadIME,  AdjacentConsonants, AdjacentVowels
from ngram.Distance import Dice_coeff, edit_distance

# Make Bi-Lingual dictionary

# True when the running interpreter is Python 3.x; the rest of the module
# uses it to choose between Python 2 and Python 3 spellings of an API.
PYTHON3 = (sys.version_info.major == 3)
if PYTHON3:
    # Python 3 has no builtin called `unicode`; alias it to `str` so the
    # same name can be used under both interpreter lines.
    unicode = str

# Module-wide switch for verbose diagnostic printing.
_DEBUG = False

# Loading the dictionaries in a background thread saves about 6s on an old machine.
class LoadDictionary(threading.Thread):
    """Thread that loads the Tamil and English dictionaries.

    ``run`` calls ``Speller.get_dictionary()`` and
    ``Speller.get_english_dictionary()``, which fill the class-level
    dictionary caches on ``Speller``.
    """
    # When True, run() prints how long the load took.
    DEBUG = False
    # Lock shared with the Speller dictionary getters.
    lock = threading.Lock()

    def __init__(self):
        super(LoadDictionary, self).__init__(name="LoadDictionaryInBackground")

    def run(self):
        """Load both dictionaries and optionally report the elapsed time."""
        began = time.time()
        Speller.get_dictionary()
        Speller.get_english_dictionary()
        if not LoadDictionary.DEBUG:
            return
        print("LOADED DICTIONARY in  %g (s)"%(time.time() - began))

class DeletionFilter:
    """Propose dictionary words obtained by dropping letters from a word."""

    @staticmethod
    def get_suggestions(letters,lexicon):
        """Return the letter-deletion variants of *letters* known to *lexicon*.

        letters -- sequence of letter strings that make up the word.
        lexicon -- object exposing ``isWord(word)``.

        For every position ``i`` the candidate is the letters before ``i``
        joined with the letters from ``min(len(letters)-1, i+2)`` onwards.
        Candidates are returned in position order, duplicates included.
        """
        # NOTE(review): because the tail starts at i+2 (clamped to the last
        # index), most positions drop two letters and the last position
        # reproduces the word unchanged -- confirm whether i+1 was intended.
        last_index = len(letters) - 1
        candidates = (
            u"".join(letters[:cut]) + u"".join(letters[min(last_index, cut + 2):])
            for cut in range(len(letters))
        )
        return [candidate for candidate in candidates if lexicon.isWord(candidate)]

class OttruSplit:
    """Split a word in two at each uyirmei letter.

    The consonant part of the letter closes the first piece and its vowel
    part opens the second piece, e.g.
    யாரிகழ்ந்து = [ய்  + ஆரிகழ்ந்து], [யார், இகழ்ந்து] ,[யாரிக், அழ்ந்து], [யாரிகழ்ந்த்,உ]
    """
    def __init__(self,word,letters):
        self.word = word
        # Only trust the caller's letters when they spell out the word;
        # otherwise derive them from the word itself.
        if u"".join(letters) == word:
            self.letters = letters
        else:
            self.letters = tamil.utf8.get_letters(word)
        self.results = []

    def run(self,lexicon):
        """Generate every split, then keep those made only of lexicon words."""
        self.generate_splits()
        return self.filter(lexicon)

    def generate_splits(self):
        """Append a [head, tail] pair to ``self.results`` per uyirmei letter.

        Example:
            யாரிகழ்ந்து =
                [['ய்', 'ஆரிகழ்ந்து'],
                 ['யார்', 'இகழ்ந்து'],
                 ['யாரிக்', 'அழ்ந்து'],
                 ['யாரிகழ்ந்த்', 'உ']]

        Returns True when ``self.results`` is non-empty afterwards.
        """
        for pos, letter in enumerate(self.letters):
            if letter in tamil.utf8.grantha_uyirmei_letters:
                before = u"".join(self.letters[:pos])
                after = u"".join(self.letters[pos+1:])
                mei, uyir = tamil.utf8.splitMeiUyir(letter)
                self.results.append([before + mei, uyir + after])
        return len(self.results) > 0

    def filter(self,lexicon):
        """Keep only the splits whose pieces are all accepted by *lexicon*."""
        self.results = [pieces for pieces in self.results
                        if all(map(lexicon.isWord, pieces))]
        return self.results

class Mayangoli:
    """Build alternate spellings by swapping consonants within a group.

    Each row of ``varisai`` is a group of mei (consonant) letters; a letter
    whose consonant part belongs to a row is replaced by every member of
    that row, keeping its vowel part.
    """
    # rows (series) of mutually substitutable consonants
    varisai = [[ u"ல்", u"ழ்",u"ள்"],[u"ர்", u"ற்"],[u"ந்",u"ன்",u"ண்"],[u"ங்",u"ஞ்"]]

    def __init__(self,word,letters):
        self.word = word
        # Re-derive the letters when the supplied ones do not spell the word.
        if u"".join(letters) != word:
            letters = tamil.utf8.get_letters(word)
        self.letters = letters
        self.matches_and_positions = []   # (letter index, varisai row, column)
        self.alternates = []              # generated alternate words
        self.pos_classes = []             # replacement letters per match

    @staticmethod
    def run(word,letters):
        """Return all alternate spellings of *word* ([] if nothing matches)."""
        finder = Mayangoli(word,letters)
        if not finder.find_letter_positions():
            return []
        finder.find_correspondents()
        finder.generate_word_alternates()
        return finder.alternates

    def find_letter_positions(self):
        """Record every letter whose consonant part occurs in ``varisai``.

        Returns True when at least one match has been recorded.
        """
        for idx, letter in enumerate(self.letters):
            parts = tamil.utf8.splitMeiUyir(letter)
            # a length-1 result cannot be unpacked into (mei, uyir)
            if len(parts) == 1:
                continue
            mei, uyir = parts
            self.matches_and_positions.extend(
                (idx, row, col)
                for row, group in enumerate(Mayangoli.varisai)
                for col, consonant in enumerate(group)
                if consonant == mei
            )
        return len(self.matches_and_positions) > 0

    def find_correspondents(self):
        """For each match, list the letters built from its row and its vowel."""
        for pos, row, _col in self.matches_and_positions:
            _, vowel = tamil.utf8.splitMeiUyir(self.letters[pos])
            self.pos_classes.append(
                [tamil.utf8.joinMeiUyir(consonant, vowel)
                 for consonant in Mayangoli.varisai[row]]
            )
        return True

    def _generate_combinations(self):
        # one replacement choice per matched position
        return itertools.product(*self.pos_classes)

    def generate_word_alternates(self):
        """Fill ``self.alternates`` with one word per replacement combination.

        Every combination produced by ``_generate_combinations`` is written
        into a copy of the letters at the matched positions. The unchanged
        word is among the results; filtering is left to the caller.
        """
        for combo in self._generate_combinations():
            variant = copy.copy(self.letters)
            if _DEBUG: pprint.pprint(combo)
            for slot, (pos, _row, _col) in enumerate(self.matches_and_positions):
                variant[pos] = combo[slot]
            self.alternates.append(u''.join(variant))
        return True

class Typographical:
    """Heuristic checks for malformed (mistyped) words."""

    @staticmethod
    def checkFormErrors(word,errmsg=None):
        """Return True when any form heuristic rejects *word*.

        word   -- the word to examine.
        errmsg -- optional list; ``u"BadIME"`` is appended to it when the
                  BadIME rule rejects the word. An empty list is a valid
                  collector.

        Three rules are applied: BadIME, AdjacentConsonants (with
        ``freq_threshold`` set to 4) and AdjacentVowels. A rule rejects the
        word when the first element of its ``apply(word)`` result is falsy.
        """
        rules = [BadIME(), AdjacentConsonants(), AdjacentVowels()]
        rules[1].freq_threshold = 4
        # Evaluate every rule exactly once; True marks a rejected word.
        rejected = [not rule.apply(word)[0] for rule in rules]
        # Report BadIME only when that rule actually failed, and accept an
        # empty list as a collector (hence the explicit None test).
        if errmsg is not None and rejected[0]:
            errmsg.append(u"BadIME")
            print("Bad IME")
        return any(rejected)

class Speller(object):
    """Dictionary-driven spell checker for Tamil (default) or English.

    The dictionaries are cached on the class (``TVU_dict`` for Tamil,
    ``ENL_dict`` for English) and built on first use. Unless ``mode`` is
    ``"web"``, constructing an instance immediately starts either the
    interactive prompt or the file checker.
    """
    TVU_dict = None
    ENL_dict = None
    punctuation = string.punctuation+'()[]{}'

    def __init__(self,filename=None,lang="ta",mode="non-web"):
        """Create a speller and, unless mode is "web", start checking.

        filename -- file to spell-check; when falsy the interactive prompt
                    is started instead.
        lang     -- language code; anything other than "en" means Tamil.
        mode     -- "web" only builds the object and returns.
        """
        object.__init__(self)
        self.lang = lang.lower()
        self.filename = filename
        self.user_dict = set()
        self.case_filter = CaseFilter( RemovePluralSuffix(), RemoveVerbSuffixTense(), RemoveCaseSuffix(), RemovePrefix() )
        if not self.in_tamil_mode():
            self.alphabets = [a for a in string.ascii_lowercase]
        else:
            self.alphabets = None

        if mode == "web":
            return

        if not self.filename:
            self.interactive()
        else:
            self.spellcheck(self.filename)

    def in_tamil_mode(self):
        """Return True unless the language is "en"."""
        return self.lang != u"en"

    @staticmethod
    def get_dictionary():
        """Return the shared Tamil dictionary, building it on first use."""
        # `with` guarantees the lock is released even if the build raises.
        with LoadDictionary.lock:
            if not Speller.TVU_dict:
                Speller.TVU_dict,_ = DictionaryBuilder.create(TamilVU)
        return Speller.TVU_dict

    @staticmethod
    def get_english_dictionary():
        """Return the shared English dictionary, building it on first use."""
        # `with` guarantees the lock is released even if the build raises.
        with LoadDictionary.lock:
            if not Speller.ENL_dict:
                Speller.ENL_dict,_ = DictionaryBuilder.create(EnglishLinux)
        return Speller.ENL_dict

    def language(self):
        """Return "tamil" or "english" according to the current mode."""
        if self.in_tamil_mode():
            return "tamil"
        return "english"

    def checklang(self,word):
        """Return True when *word* is written in the speller's language.

        Tamil mode defers to ``tamil.utf8.all_tamil``; English mode requires
        every character, lower-cased, to be an ASCII letter.
        """
        if self.in_tamil_mode():
            return tamil.utf8.all_tamil(word)
        return all(ch in string.ascii_lowercase for ch in word.lower())

    # full-text interface driver for unittest @ Dec 10, 2017
    def noninteractive_spellcheck(self,text):
        """Spell-check whitespace-separated *text* and return a summary.

        Returns a dict with the keys 'total', 'correct_words',
        'wrong_words' and 'word_suggestions' (rejected word -> suggestions).
        """
        nwords = 0
        npass = 0
        fail_n_suggs = dict()
        for word in re.split(r'\s+',text):
            if len(word) < 1:
                continue
            nwords += 1
            result,suggs = self.REST_interface(word)
            if result:
                npass += 1
            else:
                fail_n_suggs[word] = suggs
        return {'total':nwords,
                'correct_words':npass,
                'wrong_words':nwords - npass,
                'word_suggestions':fail_n_suggs}

    # Ref: https://www.tinymce.com/docs/plugins/spellchecker/
    def REST_interface(self,word):
        """Check one word for the web front-end.

        Returns ``(ok, {})`` for an accepted word and ``(ok, suggestions)``
        otherwise.
        """
        ok,suggs = self.check_word_and_suggest( word )
        if _DEBUG:
            print("REST => %d"%ok)
            pprint.pprint(suggs)
        if ok:
            return ok, {}
        return ok, suggs

    @staticmethod
    def dice_comparison(ref_word,word):
        """cmp-style comparator based on the Dice coefficient of two words.

        Returns 0 when the coefficient is exactly 1, 1 when it is above
        0.5 and -1 otherwise. Intended for use when sorting.
        """
        val = Dice_coeff(ref_word,word)
        if val == 1:
            return 0
        return 1 if val > 0.5 else -1

    def suggestion_policy(self,word,suggs):
        """Trim and order the suggestions shown for *word*.

        Suggestions whose Tamil letter count lies between
        ``max(n-2, 1)`` and ``n+1`` (n = letter count of *word*) are kept,
        de-duplicated and sorted with ``dice_comparison``. When none
        qualifies, the first (at most) ten of all suggestions in
        lexicographic order are returned as a guess.
        """
        # The suggestions are walked more than once below, so a one-shot
        # iterator must be materialised first.
        suggs = list(suggs)
        tamil_length = lambda w: len(tamil.utf8.get_letters(w))
        ref_wl = tamil_length(word)
        min_len, max_len = max(ref_wl-2,1), ref_wl+1
        # Both bounds are measured in Tamil letters, not code points.
        filter_suggs = set(w for w in suggs if min_len <= tamil_length(w) <= max_len)
        if len(filter_suggs) == 0:
            # guess!
            guesses = sorted(suggs,key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic))
            # keep at most ten guesses without discarding the last one
            del guesses[10:]
            return guesses
        # cmp_to_key works on Python 2.7 and 3; sorted(cmp=...) is Python 2 only
        return sorted(filter_suggs,key=functools.cmp_to_key(Speller.dice_comparison))

    def str_suggestions(self,word):
        """Return the heading printed above the suggestions for *word*."""
        if self.in_tamil_mode():
            return u"சொல் \"%s\" மாற்றங்கள்"%word
        return u"SUGGESTIONS for \"%s\""%word

    def mayangoli_suggestions(self,word,letters):
        """Return the Mayangoli alternates of *word*, excluding the word itself."""
        # Build a real list: a lazy filter object would be exhausted by the
        # debug loop and cannot be indexed by callers.
        alternates = [w for w in Mayangoli.run(word,letters) if w != word]
        if _DEBUG:
            for idx,w in enumerate(alternates):
                pprint.pprint(["Myangoli",idx,w])
        return alternates

    def interactive(self):
        """Prompt for words until EOF or Ctrl-C, printing a verdict for each."""
        try:
            while True:
                if PYTHON3:
                    word = input(u">> ")
                else:
                    word = raw_input(u">> ")
                    word = word.decode("utf-8").strip()
                word = re.sub(r"\s+",u"",word)

                # skip empty words
                if len(word) < 1:
                    continue

                if not self.checklang(word):
                    print(u"EXCEPTION \"%s\" is not a %s Word"%(word,self.language()))
                    continue
                ok,suggs = self.check_word_and_suggest( word )
                if ok:
                    print(u"சரி" if self.in_tamil_mode() else u"OK")
                    continue
                # Rank suggestions only for rejected words; for accepted
                # words the second tuple element is not a suggestion list.
                suggs = self.suggestion_policy(word,suggs)
                words_per_row = 4
                option_str = u", ".join( [ u"(%d) %s"%(itr,wrd) + (u"\n" if (itr > 0 and itr%words_per_row == 0) else u"") for itr,wrd in enumerate(suggs)] )
                print(u"%s\n\t %s"%(self.str_suggestions(word),option_str))
        except (KeyboardInterrupt, EOFError):
            # normal ways of leaving the prompt
            pass
        finally:
            print(u"\nவணக்கம்!" if self.in_tamil_mode() else "\nBYE!")
        return

    def spellcheck(self,filename):
        """Interactively correct the UTF-8 text file *filename*.

        For every rejected word the user picks a replacement (or -1 to keep
        the word, which also adds it to the user dictionary). The corrected
        document is printed at the end; the file itself is not modified.
        """
        new_document = []
        # read everything up front so the handle is closed before prompting
        with codecs.open(filename,u"r",u"utf-8") as data:
            lines = data.readlines()
        for line in lines:
            words = tamil.utf8.get_words( tamil.utf8.get_letters(line) )
            for word in words:
                # FIXME : handle punctuation
                ok,suggs = self.check_word_and_suggest( word )
                if ok:
                    new_document.append( word )
                    continue
                suggs = list(suggs)
                if not suggs:
                    # nothing to choose from: keep the word as written
                    new_document.append( word )
                    continue
                option = suggs[0]
                # take user input.
                # FIXME: User options to include DONTREPLACE/KEEP, DELETE WORD, etc.
                option_str = u", ".join( [ u"(%d) %s"%(itr,wrd) for itr,wrd in enumerate(suggs)] )
                if self.in_tamil_mode():
                    print(u"வரி \"%s\""%line.strip())
                    print(u"'%s' சொல்லை கொண்டு\n\t சொல்லை '%s' மாற்றிடு\n"%(option_str,word))
                else:
                    print(u"Line, \"%s\""%line.strip())
                    print(u" Replace word %s with\n\t => %s\n"%(word, option_str))
                try:
                    if self.in_tamil_mode():
                        choice_str="விருப்பம் [-1 புறக்கணி, 0-%d மாற்றவும்]:"
                    else:
                        choice_str=u"option [-1 ignore, 0-%d replace]: "
                    prompt = choice_str%(len(suggs)-1)
                    # Python 2's input() evaluates what the user types;
                    # read plain text and convert it explicitly instead.
                    if PYTHON3:
                        choice = int(input(prompt))
                    else:
                        choice = int(raw_input(prompt))
                    if choice == -1:
                        if self.in_tamil_mode():
                            print(u"வார்த்தை மாறாத இருந்தது")
                        else:
                            print(u"Not replacing word")

                        option = word
                        self.user_dict.add(word)
                    else:
                        option = suggs[choice]
                except (ValueError, IndexError, EOFError) as ie:
                    # bad answer: report it and fall back to the first suggestion
                    print (str(ie))
                if self.in_tamil_mode():
                    replace_msg=u"வார்த்தை %s -> %s இதற்காக மாற்றவும்\n"
                else:
                    replace_msg = u" replacing word %s -> %s\n"
                print(replace_msg%(word,option))
                new_document.append( unicode(option) )
            new_document.append(u"\n")
        if self.in_tamil_mode():
            print(u"*********** ஆவணத்தில் உள்ள பிழைகளை திருத்திய பின் *********")
        else:
            print(u"*********** cleaned up document **********")
        print(u" ".join(new_document))

    def get_lang_dictionary(self):
        """Return the dictionary matching the current language mode."""
        if not self.in_tamil_mode():
            return Speller.get_english_dictionary()
        return Speller.get_dictionary()

    def isWord(self, word):
        """Return a truthy value if *word* is in the user or language dictionary."""
        return word in self.user_dict or self.get_lang_dictionary().isWord(word)

    def add_numeral_words(self,lexicon):
        """Add the Tamil number words to *lexicon* (Tamil mode only).

        Does nothing when the lexicon already knows the last numeral word,
        so repeated calls are cheap.
        """
        if not self.in_tamil_mode():
            return

        units = (u'பூஜ்ஜியம்', u'ஒன்று', u'இரண்டு', u'மூன்று', u'நான்கு', u'ஐந்து', u'ஆறு', u'ஏழு', u'எட்டு', u'ஒன்பது', u'பத்து') # 0-10
        # the second entry used to carry a leading space and so never matched
        teens = (u'பதினொன்று', u'பனிரண்டு', u'பதிமூன்று', u'பதினான்கு', u'பதினைந்து',u'பதினாறு', u'பதினேழு', u'பதினெட்டு', u'பத்தொன்பது') # 11-19
        tens = (u'பத்து', u'இருபது', u'முப்பது', u'நாற்பது', u'ஐம்பது',u'அறுபது', u'எழுபது', u'எண்பது', u'தொன்னூறு') # 10-90
        tens_suffix = (u'இருபத்து', u'முப்பத்து', u'நாற்பத்து', u'ஐம்பத்து', u'அறுபத்து', u'எழுபத்து', u'எண்பத்து', u'தொன்னூத்து') # 10+-90+
        hundreds = ( u'நூறு', u'இருநூறு', u'முந்நூறு', u'நாநூறு',u'ஐநூறு', u'அறுநூறு', u'எழுநூறு', u'எண்ணூறு', u'தொள்ளாயிரம்') #100 - 900
        hundreds_suffix = (u'நூற்றி', u'இருநூற்றி', u'முந்நூற்று', u'நாநூற்று', u'ஐநூற்று', u'அறுநூற்று', u'எழுநூற்று', u'எண்ணூற்று',u'தொள்ளாயிரத்து') #100+ - 900+
        one_thousand_prefix = (u'ஓர்',)
        thousands = (u'ஆயிரம்',u'ஆயிரத்தி')

        one_prefix = (u'ஒரு',)
        lakh = (u'இலட்சம்',u'இலட்சத்து')
        crore = (u'கோடி',u'கோடியே')

        mil = (u'மில்லியன்',)
        bil = (u'பில்லியன்',)
        tril = (u'டிரில்லியன்',)

        # the last word added doubles as the "already loaded" marker
        if lexicon.isWord(tril[0]):
            return

        for wordset in [units,tens,teens,tens_suffix,hundreds,hundreds_suffix,one_thousand_prefix,thousands,one_prefix,lakh,crore,mil,bil,tril]:
            for word in wordset:
                lexicon.add(word)

    @staticmethod
    def scrub_ws(word):
        """Remove whitespace and the bracket characters {}()[] from *word*."""
        return re.sub(r'[\s{}()\[\]]+',u'',word)

    def check_word_and_suggest( self,word, errmsg = None ):
        """Check one word and propose replacements when it is rejected.

        word   -- the word to check; surrounding whitespace and trailing
                  punctuation are ignored.
        errmsg -- optional list that receives diagnostic tags such as
                  "TypographicalError".

        Returns a tuple ``(ok, result)``. When ``ok`` is False, ``result``
        is a list of suggestions (possibly empty, or a single empty string
        when no replacement is offered). When ``ok`` is True, ``result`` is
        the accepted word itself, or a one-element list for date-like
        words.
        """
        # skip known punctuation at end of line, then leading whitespace
        word = word.strip().rstrip(Speller.punctuation).lstrip(string.whitespace)

        # is number then we propose a numeral
        if self.in_tamil_mode():
            numword = word.replace(u',',u'')
            if re.match(r'[+|-]*[\d]+',numword):
                try:
                    num = float(numword)
                    posnum = -1*num if num < 0 else num
                    numeral_form = tamil.numeral.num2tamilstr(posnum)
                    if num < 0:
                        numeral_form = u"கழித்தல் "+numeral_form
                    return (False,[numeral_form])
                except Exception:
                    # Best effort: anything that is not a plain number
                    # (e.g. "12-இல்") falls through to the checks below.
                    pass

            # dates are okay
            if any(map(word.endswith,[u"-இல்",u"-ஆம்",u"-இலிருந்து", u"-வரை"])):
                if re.search(r'^\d+',word):
                    return (True,[word]) #word is okay

            # check if words are transliterated
            if any(l for l in tamil.utf8.get_letters(word) if l in string.ascii_letters):
                # letter-sequence only
                en_word = Speller.scrub_ws(word)
                EN_Lexicon = Speller.get_english_dictionary()
                if EN_Lexicon.isWord(en_word):
                    return (False,['']) #English word - nosub- yet until we have parallel dictionaries or translation. TBD.

                #is english letter
                ta = algorithm.Iterative.transliterate(jaffna.Transliteration.table,en_word)
                # TBD: potential for having ANN to tell if english text is pure English word
                # or a romanized Tamil word. Output of classifier can be useful here.
                return (False,[ta])

            # check if it matches Tamil numeral and has close match.
            # propose suggestions from that list.
            # TBD

        # hyphens are not okay
        if word.find(u"-") >= 0:
            return (False,[word.replace(u"-",u" ")])
        orig_word = u"%s"%word

        # remove digits
        word = re.sub(r'\d+',u'',word)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()
        self.add_numeral_words(TVU_dict)

        # Check if this 'word' is any common kind of error
        if Typographical.checkFormErrors(word,errmsg):
            # an empty list is a valid collector, so test against None
            if errmsg is not None: errmsg.append("TypographicalError")

        if not self.checklang(word):
            print("Word is not in desired language!")
            return (False,[u""])

        if len(word) < 1:
            print("Word is too small")
            return (False,[u''])

        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True,word)

        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply( word )
        if self.isWord( word_nocase ):
            return (True,word_nocase)
        word = word_nocase

        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word,TVU_dict)
        greedy_results = list()
        if len(alt) >= 1:
            greedy_results = [u" ".join(alt),u"-".join(alt)]
            greedy_results.extend(alt)

        # if there are no other suggestions than deletion filter, we return
        # in presence of other suggestions we can just return suggestions
        suggs = DeletionFilter.get_suggestions(letters,TVU_dict)
        if len(suggs) > 0:
            if len(greedy_results) == 0:
                return (False,suggs)
            greedy_results.extend(suggs)

        # ottru splitting for Tamil language mode:
        # words like யாரிகழ்ந்து that split into two known words are accepted.
        if self.in_tamil_mode():
            ottru = OttruSplit(word,letters)
            ottru.run(TVU_dict)
            if len(ottru.results) > 0:
                return (True,word)

        # TODO: Noun Declension - ticket-

        # suggestions at edit distance 1
        norvig_suggests = filter( TVU_dict.isWord, norvig_suggestor( word, self.alphabets, 2,limit=25))
        combinagram_suggests = list(tamil.wordutils.combinagrams(word,TVU_dict,limit=25))
        pfx_options = TVU_dict.getWordsStartingWith( u"".join( letters[:-1] ) )

        # FIXME: score  the options
        options = greedy_results
        options.extend( norvig_suggests )
        options.extend( combinagram_suggests )
        options.extend( pfx_options )

        # filter the options against a dictionary!
        options = [opt for opt in options if TVU_dict.isWord(opt)]

        if self.in_tamil_mode():
            options.extend( self.mayangoli_suggestions(orig_word,letters) )

        # sort the options so that duplicates become adjacent
        if not self.in_tamil_mode():
            options.sort()
        else:
            options = sorted( options, key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic) )

        # remove replacements with single-letter words
        WL = len(tamil.utf8.get_letters(word))
        if WL > 3:
            options = [opt for opt in options if len(tamil.utf8.get_letters(opt)) > 2]

        # remove dupes in list
        options2 = []
        prev = None
        for val in options:
            stripped = val.strip()
            if stripped != prev:
                options2.append(stripped)
            prev = stripped
        if _DEBUG:
            print("@deduplication")
            pprint.pprint(options2)

        # score by Dice or Edit-Distance coefficients
        options_score = [
            (len(word)-edit_distance(word,sugg_word))/(1.0*len(orig_word))*Dice_coeff( word, sugg_word )/3.0 #dice coeff is weighted down
            for sugg_word in options2
        ]

        # best score first (the sort is stable, so ties keep their order)
        ranked = sorted(zip( options2, options_score),key=operator.itemgetter(1),reverse=True)
        options = [word_pair[0] for word_pair in ranked]
        if _DEBUG:
            pprint.pprint("@after scoring/sorting")
            pprint.pprint(options)

        # eliminate single letter options; return a real list so callers
        # can index and serialise it on Python 3 as well
        options = [opt for opt in options if opt not in tamil.utf8.tamil_letters]

        # Due to suggestion policy we may have words which are found in error but we dont have
        # replacements for them!

        # TBD: options should not have the 'word'!
        return (False, options )

def main():
    """Command-line entry point of the spell checker.

    Positional arguments are files to check. ``-i`` starts the interactive
    prompt instead, ``-l`` selects the language (TA or EN) and ``-debug``
    turns on diagnostic printing. With neither files nor ``-i`` the help
    text is printed and the process exits with status 0.
    """
    global _DEBUG
    parser = argparse.ArgumentParser()
    parser.add_argument(u"files",nargs='*',default=[])
    parser.add_argument(u"-debug",action=u"store_true",\
                        default=False,\
                        help=u"enable debugging information on screen")
    parser.add_argument(u"-l",u"--lang",default=u"TA",choices=(u"TA",u"EN"),\
                        help=u"option to specify English or Tamil (default) language")
    parser.add_argument(u"-i",u"--interactive",help=u"use the interactive mode",\
                        default=False,action=u"store_true")
    args = parser.parse_args()

    if not args.interactive and len(args.files) < 1:
        parser.print_help()
        sys.exit(0)

    # honour -debug: it was parsed but never used
    if args.debug:
        _DEBUG = True
        LoadDictionary.DEBUG = True

    # start loading the dictionaries while the speller is being set up
    LoadDictionary().start()

    # the chosen language applies to interactive and file mode alike
    lang = args.lang.lower()
    if args.interactive:
        Speller(filename=None,lang=lang)
        sys.exit(0)
    else:
        for file_name in args.files:
            Speller(file_name,lang=lang)

if __name__ == u'__main__':
    main()
#TBD: deities, divinity, language, people, places, personalities to be added.
#TBD: colors, cities, places, countries, currencies to be added.
#TBD: proper nouns common names etc.
#Find bugs in TinyMCE where spell module does not highlight all the mentioned words.
#TBD: Rank options by scoring bigram models
#TBD: Insertion errors are not searched.