Source code for ngram.Distance
# -*- coding: utf-8 -*-
#
# (C) முத்தையா அண்ணாமலை 2013-2015
#
# N-gram language model for Tamil letters
import tamil
def edit_distance(wordA, wordB):
    """Return the Levenshtein edit distance between two words.

    The distance is the minimum number of single-letter insertions,
    deletions and substitutions needed to turn ``wordA`` into ``wordB``.
    Swapping two adjacent letters is not counted as a single edit, so this
    is the plain Levenshtein distance, not the Damerau-Levenshtein variant.

    Ref: https://en.wikipedia.org/wiki/Edit_distance
    Ref: https://en.wikipedia.org/wiki/Levenshtein_distance

    Args:
        wordA: Source word. A list (or list subclass) is used as-is as the
            sequence of letters; any other value is split into letters
            with ``tamil.utf8.get_letters``.
        wordB: Target word, handled the same way as ``wordA``.

    Returns:
        The edit distance as a non-negative integer.
    """
    lettersA = wordA if isinstance(wordA, list) else tamil.utf8.get_letters(wordA)
    lettersB = wordB if isinstance(wordB, list) else tamil.utf8.get_letters(wordB)

    # Row for the empty prefix of A: reaching the first j letters of B
    # takes j insertions.
    prev_row = list(range(len(lettersB) + 1))

    # Only the previous row is ever read, so two rows are enough instead of
    # the full (n_A + 1) x (n_B + 1) table.
    for i, letterA in enumerate(lettersA, start=1):
        # Column 0: reaching the empty prefix of B takes i deletions.
        curr_row = [i]
        for j, letterB in enumerate(lettersB, start=1):
            if letterA == letterB:
                # Matching letters cost nothing extra.
                cost = prev_row[j - 1]
            else:
                # Cheapest of deletion, insertion or substitution.
                cost = 1 + min(prev_row[j], curr_row[j - 1], prev_row[j - 1])
            curr_row.append(cost)
        prev_row = curr_row

    return prev_row[-1]
def Jaccard_coeff(*args):
    """Return ``1.0 - Dice_coeff(*args)``.

    All positional arguments are forwarded unchanged to ``Dice_coeff``.

    NOTE(review): despite the name, this is the complement of the Dice
    coefficient, not the textbook Jaccard index (which relates to a Dice
    score D as D / (2 - D)). Confirm what callers expect before changing it.
    """
    dice_similarity = Dice_coeff(*args)
    return 1.0 - dice_similarity
def Dice_coeff(wordA, wordB):
    """Return the Sorensen-Dice similarity of two words, in [0.0, 1.0].

    The score is ``2 * |A & B| / (|A| + |B|)`` where A and B are the
    multisets of letters of each word, so a repeated letter only counts as
    shared as many times as it occurs in both words. This keeps the result
    symmetric and bounded by 1.0 even when letters repeat. It is a
    similarity (1.0 means the same letters), not an edit distance.

    Ref: https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient

    Args:
        wordA: First word. A list (or list subclass) is used as-is as the
            sequence of letters; any other value is split into letters
            with ``tamil.utf8.get_letters``.
        wordB: Second word, handled the same way as ``wordA``.

    Returns:
        A float between 0.0 (no letters in common) and 1.0 (same letters
        with the same counts). Two empty words return 1.0.
    """
    # Imported locally so the function does not depend on a file-level import.
    from collections import Counter

    lettersA = wordA if isinstance(wordA, list) else tamil.utf8.get_letters(wordA)
    lettersB = wordB if isinstance(wordB, list) else tamil.utf8.get_letters(wordB)

    total = len(lettersA) + len(lettersB)
    if total == 0:
        # Both words are empty: treat them as identical rather than
        # dividing by zero.
        return 1.0

    # Counter & Counter keeps the minimum count per letter, i.e. the
    # multiset intersection.
    shared = sum((Counter(lettersA) & Counter(lettersB)).values())
    return (2.0 * shared) / total