I wrote a little python function to calculate the Levenshtein distance for an assignment. I figured it was some time ago since I released code on my blog, so here goes.
#!/usr/bin/python/
from math import ceil, log10
def flatten(x):
result = []
for el in x:
if hasattr(el, "__iter__") and not isinstance(el, basestring):
result.extend(flatten(el))
else:
result.append(el)
return result
def LevenshteinDistance(str1, str2):
def _print():
PAD = int(max( map(lambda x: len(str(x)), flatten(table))))
headline = " ".ljust(PAD+1)
for l in " " + str2:
headline += l.ljust(PAD) + " "
print headline
for y in range(len(table)):
idx = y - 1
if idx < 0:
l = " ".ljust(PAD)
else:
l = str1[y-1].ljust(PAD)
for x in table[y]:
l += " " + str(x).ljust(PAD)
print l
table = [ [ 0 for y in range(len(str2)+1)] for x in range(len(str1)+1)]
for i in range( len(str1) + 1):
table[i][0] = i
for j in range( len(str2) + 1):
table[0][j] = j
for i in range(len(str1)):
for j in range(len(str2)):
cost = not (str1[i] == str2[j])
table[i+1][j+1] = min( table[i][j+1] +1, table[i+1][j] +1, table[i][j] + cost)
return table[-1][-1]
# Testing the function
if __name__ == "__main__":
print "Testing LevenshteinDistance"
strings = ("LevenshteinDistance", "String", "Pseudocode", "meilenstein","levenshtein")
for str1 in strings:
for str2 in strings:
if str1 != str2:
print "%s >< %s => %d" % (str1, str2, LevenshteinDistance(str1, str2))
If you run the sucker you’ll get the following output from the test-cases:
bromer-2% ./LevenshteinDistance.py Testing LevenshteinDistance LevenshteinDistance >< String => 16 LevenshteinDistance >< Pseudocode => 16 LevenshteinDistance >< meilenstein => 12 LevenshteinDistance >< levenshtein => 9 String >< LevenshteinDistance => 16 String >< Pseudocode => 10 String >< meilenstein => 9 String >< levenshtein => 9 Pseudocode >< LevenshteinDistance => 16 Pseudocode >< String => 10 Pseudocode >< meilenstein => 10 Pseudocode >< levenshtein => 10 meilenstein >< LevenshteinDistance => 12 meilenstein >< String => 9 meilenstein >< Pseudocode => 10 meilenstein >< levenshtein => 4 levenshtein >< LevenshteinDistance => 9 levenshtein >< String => 9 levenshtein >< Pseudocode => 10 levenshtein >< meilenstein => 4
It’s pretty straight forward. For debugging I added a _print() function that will print the matrix. For example, the finished matrix for a “levenshtein” vs. “LevenshteinDistance” comparison looks like this:
L e v e n s h t e i n D i s t a n c e
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
l 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
e 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
v 3 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
e 4 4 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
n 5 5 4 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
s 6 6 5 4 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14
h 7 7 6 5 4 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13
t 8 8 7 6 5 4 3 2 1 2 3 4 5 6 7 8 9 10 11 12
e 9 9 8 7 6 5 4 3 2 1 2 3 4 5 6 7 8 9 10 11
i 10 10 9 8 7 6 5 4 3 2 1 2 3 4 5 6 7 8 9 10
n 11 11 10 9 8 7 6 5 4 3 2 1 2 3 4 5 6 7 8 9
levenshtein >< LevenshteinDistance => 9