1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18   
 19   
 20   
 21  """A class to calculate a similarity based on the Levenshtein  
 22  distance. See http://en.wikipedia.org/wiki/Levenshtein_distance. 
 23   
 24  If available, the python-Levenshtein package will be used which will provide 
 25  better performance as it is implemented natively. See 
 26  http://trific.ath.cx/python/levenshtein/ 
 27  """ 
 28   
 29  import math 
 30  import sys 
 31   
 33      """Calculates the distance for use in similarity calculation. Python 
 34      version.""" 
 35      l1 = len(a) 
 36      l2 = len(b) 
 37      if stopvalue == -1: 
 38          stopvalue = l2 
 39      current = range(l1+1) 
 40      for i in range(1,l2+1): 
 41          previous, current = current, [i]+[0]*l1 
 42          least = l2 
 43          for j in range(1, l1 + 1): 
 44              change = previous[j-1] 
 45              if a[j-1] != b[i-1]: 
 46                  change = change + 1 
 47              insert = previous[j] + 1 
 48              delete = current[j-1] + 1 
 49              current[j] = min(insert, delete, change) 
 50              if least > current[j]: 
 51                  least = current[j] 
 52           
 53           
 54          if least > stopvalue: 
 55              return least 
 56       
 57      return current[l1] 
  58   
 60      """Same as python_distance in functionality. This uses the fast C  
 61      version if we detected it earlier. 
 62       
 63      Note that this does not support arbitrary sequence types, but only  
 64      string types.""" 
 65      return Levenshtein.distance(a,b) 
  66   
 67  try: 
 68      import Levenshtein as Levenshtein 
 69      distance = native_distance 
 70  except Exception: 
 71      print >> sys.stderr, "Python-Levenshtein not found. Continuing with built-in (slower) fuzzy matching." 
 72      distance = python_distance 
 73   
 76          self.MAX_LEN = max_len 
  77   
 97   
 99          """Returns the similarity between a and b based on Levenshtein distance. It 
100      can stop prematurely as soon as it sees that a and b will be no simmilar than 
101      the percentage specified in stoppercentage. 
102   
103      The Levenshtein distance is calculated, but the following should be noted: 
104          * Only the first MAX_LEN characters are considered. Long strings differing 
105            at the end will therefore seem to match better than they should. See the 
106            use of the variable penalty to lessen the effect of this. 
107          * Strings with widely different lengths give the opportunity for shortcut. 
108            This is by definition of the Levenshtein distance: the distance will be  
109            at least as much as the difference in string length. 
110          * Calculation is stopped as soon as a similarity of stoppercentage becomes 
111            unattainable. See the use of the variable stopvalue. 
112          * Implementation uses memory O(min(len(a), len(b)) 
113          * Excecution time is O(len(a)*len(b)) 
114      """ 
115          l1, l2 = len(a), len(b) 
116          if l1 == 0 or l2 == 0: 
117              return 0 
118           
119          if l1 > l2: 
120              l1,l2 = l2,l1 
121              a, b = b, a 
122               
123           
124           
125          assert 0 <= stoppercentage <= 100 
126          maxsimilarity = 100 - 100.0*abs(l1 - l2)/l2 
127          if maxsimilarity < stoppercentage: 
128              return maxsimilarity * 1.0 
129   
130           
131          penalty = 0 
132          if l2 > self.MAX_LEN: 
133              b = b[:self.MAX_LEN] 
134              l2 = self.MAX_LEN 
135              penalty += 7 
136              if l1 > self.MAX_LEN: 
137                  a = a[:self.MAX_LEN] 
138                  l1 = self.MAX_LEN 
139                  penalty += 7 
140           
141           
142          stopvalue = math.ceil((100.0 - stoppercentage)/100 * l2) 
143          dist = distance(a, b, stopvalue) 
144          if dist > stopvalue: 
145              return stoppercentage - 1.0 
146           
147           
148           
149          if dist != 0: 
150              penalty = 0 
151          return 100 - (dist*1.0/l2)*100 - penalty