Damerau–Levenshtein distance in SQL
Couple of years ago I needed to implement a kind of fuzzy matching algorithm in SQL Server. Today I have just found my code 😊. I implemented it in SQL 2005 and it works on newer versions as well. Code is based on the Damerau–Levenshtein distance algorithm. I was using a SQL CLR user defined scalar function: inputs are 2 strings and returns a number between 0 and 1. If it is more close to 1 it means the two input strings are closer to each other. Of course SSIS Fuzzy Lookup Transformation may work better 😉. Here is the code I was using:
1using System;
2using System.Data;
3using System.Data.SqlClient;
4using System.Data.SqlTypes;
5using Microsoft.SqlServer.Server;
6
7public partial class UserDefinedFunctions
8{
9 // This function is also exposed to the database. It accepts two names,
10 // performs a Damerau-Levenshtein edit distance calculation on them and
11 // calculates a similarity score. this is the fuzzy matching.
12 [Microsoft.SqlServer.Server.SqlFunction]
13 [return: Microsoft.SqlServer.Server.SqlFacet(Precision = 5, Scale = 4)]
14 public static SqlDecimal FuzzyMatching(SqlString string1, SqlString string2)
15 {
16 SqlDecimal result;
17 // Special case: Either string is NULL, result is NULL
18 if (string1 == SqlString.Null || string2 == SqlString.Null)
19 result = SqlDecimal.Null;
20 else
21 {
22 // Special case: Either string is empty string, result is 0.0
23 int strlen1 = string1.Value.Length;
24 int strlen2 = string2.Value.Length;
25 if (strlen1 == 0 || strlen2 == 0)
26 return new SqlDecimal(0.0);
27 SqlInt32 distance = CalculateDamLev(string1, string2);
28 result = new SqlDecimal(1.0 - (double)distance / Math.Max(strlen1, strlen2));
29 }
30 return result;
31 }
32
33 // Accepts 3 numbers, returns the minimum
34 private static int Min3(int a, int b, int c)
35 {
36 return Math.Min(a, Math.Min(b, c));
37 }
38
39 // This private function calculates the Damerau-Levenshtein edit distance
40 private static SqlInt32 CalculateDamLev(SqlString string1, SqlString string2)
41 {
42 SqlInt32 result;
43 // Special case: If either string is NULL, the result is NULL
44 if (string1 == SqlString.Null || string2 == SqlString.Null)
45 result = SqlInt32.Null;
46 {
47 // Special case: If either string is length 0, the result is the
48 // length of the other string
49 int strlen1 = string1.Value.Length;
50 int strlen2 = string2.Value.Length;
51 if (strlen1 == 0 || strlen2 == 0)
52 result = new SqlInt32(Math.Max(strlen1, strlen2));
53 else
54 {
55 // d is a table with lenStr1+1 rows and lenStr2+1 columns
56 int[,] calarray = new int[strlen1 + 1, strlen2 + 1];
57
58 // initialize the array
59 for (int i = 0; i < strlen1; i++)
60 calarray[i, 0] = i;
61 for (int i = 0; i < strlen2; i++)
62 calarray[0, i] = i;
63
64 // loop through the array
65 for (int i = 1; i <= strlen1; i++)
66 for (int j = 1; j <= strlen2; j++)
67 {
68 int cost = 0;
69 cost = (char.ToUpper(string1.Value[i - 1]) == char.ToUpper(string2.Value[j - 1])) ? 0 : 1;
70 calarray[i, j] = Min3(
71 calarray[i - 1, j] + 1, // deletion
72 calarray[i, j - 1] + 1, // insertion
73 calarray[i - 1, j - 1] + cost // substitution
74 );
75 if (i > 1
76 && j > 1
77 && char.ToUpper(string1.Value[i - 1]) == char.ToUpper(string2.Value[j - 2])
78 && char.ToUpper(string1.Value[i - 2]) == char.ToUpper(string2.Value[j - 1]))
79 calarray[i, j] = Math.Min(
80 calarray[i, j],
81 calarray[i - 2, j - 2] + cost // transposition
82 );
83 }
84 result = new SqlInt32(calarray[strlen1, strlen2]);
85 }
86 }
87 return result;
88 }
89};
I may used some sources from the internet that time, but I do not remember. However if you feel that part of your code is shown here and would like to have the credit, please drop me a line.