Levenshtein distance
March 28th, 2006
We usually want to talk about Levenshtein distance when we want to compare two different strings. Pratically, the Levenshtein distance (as its creator’s name, Vladimir Levenshtein) is a measure of the similarity between two strings. If you want to look at the algorithm in detail, then you should take a look at this page available on wikipedia.
The Levenshtein distance is nothing but else than the number of steps needed to transform a string into another one. For example, if we want to go from “cat” to “cats”, then we have a Levenshtein distance equal to 1 (since we added only a ‘s’, so one step).
Here is the code I done (tested with gcc, after a strip --strip-unneeded is only 4.5kb large):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define MAX_STR_LEN 100
typedef unsigned short int Matrix;
static int levenshtein(const char *, const char *);
static int min(const int, const int, const int);
static char *strlower(char *);
int
main(int argc, char **argv)
{
unsigned short int i, ss, c_sens = 1;
char str[1][MAX_STR_LEN];
if (argc < 3) {
printf("Usage: %s [-i] <string1> <string2>n", argv[0]);
return EXIT_FAILURE;
}
ss = 0;
for (i = 1; i < argc; i++)
{
if (!strcmp(argv[i], "-i")) {
c_sens = 0;
} else {
strcpy(str[ss], argv[i]);
ss++;
}
}
if (!c_sens) {
strlower(str[0]);
strlower(str[1]);
}
printf("[%s]/[%s]:%dn", str[0], str[1], levenshtein(str[0], str[1]));
return EXIT_SUCCESS;
}
static int
levenshtein(const char *str1, const char *str2)
{
unsigned short int n, m, i, j, cost;
Matrix matrix[MAX_STR_LEN][MAX_STR_LEN];
n = strlen(str1);
m = strlen(str2);
if (n == 0)
return m;
if (m == 0)
return n;
for (i = 0; i <= n; i++)
matrix[i][0] = i;
for (i = 0; i <= m; i++)
matrix[0][i] = i;
for (i = 1; i <= n; i++)
{
for (j = 1; j <= m; j++)
{
cost = ((str1[i - 1] == str2[j - 1]) ? 0 : 1);
matrix[i][j] = min( matrix[i - 1][j] + 1,
matrix[i][j - 1] + 1,
matrix[i - 1][j - 1] + cost);
}
}
return matrix[n][m];
}
static int
min(const int a, const int b, const int c)
{
unsigned short int min = a;
if (b < min)
min = b;
if (c < min)
min = c;
return min;
}
static char *
strlower(char *a)
{
unsigned short int i, len = strlen(a);
for (i = 0; i < len; i++)
*(a + i) = tolower(*(a + i));
return a;
}
If you want a case insensitive computation, use the -i switch on the command line.
This version could be improved by using pointers when threating the matrix, but maybe I would do it for a newer release.