"""Class to perform translation memory matching from a store of translation units"""

from translate.search import lshtein
from translate.search import terminology
from translate.storage import base
from translate.storage import po
from translate.misc.multistring import multistring
import heapq

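# Example usage (an illustrative sketch, not part of the module itself; the
# PO file name and the query text below are hypothetical):
#
#     from translate.storage import po
#     from translate.search.match import matcher
#     tmstore = po.pofile(open("af.po"))
#     tmmatcher = matcher(tmstore, max_candidates=5, min_similarity=80)
#     for unit in tmmatcher.matches("Open the file"):
#         print unit.target, unit.getnotes()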
def sourcelen(unit):
    """Returns the length of the source string"""
    return len(unit.source)

def sourcelencmp(x, y):
    """Compares two units using sourcelen()"""
    xlen = sourcelen(x)
    ylen = sourcelen(y)
    return cmp(xlen, ylen)

class matcher:
    """A class that will do matching and store configuration for the matching process"""
    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
        """max_candidates is the maximum number of candidates that should be assembled,
        min_similarity is the minimum similarity that must be attained to be included in
        the result, comparer is an optional Comparer with a similarity() function"""
        if comparer is None:
            comparer = lshtein.LevenshteinComparer(max_length)
        self.comparer = comparer
        self.setparameters(max_candidates, min_similarity, max_length)
        self.usefuzzy = usefuzzy
        self.inittm(store)
        self.addpercentage = True

    def usable(self, unit):
        """Returns whether this translation unit is usable for TM"""
        source = unit.source
        target = unit.target
        if source and target and (self.usefuzzy or not unit.isfuzzy()):
            if source in self.existingunits and self.existingunits[source] == target:
                # Skip exact duplicates of units already in the TM
                return False
            else:
                self.existingunits[source] = target
                return True
        return False

    def inittm(self, stores):
        """Initialises the memory for later use. We use simple base units for
        speedup."""
        self.existingunits = {}
        self.candidates = base.TranslationStore()

        if not isinstance(stores, list):
            stores = [stores]
        for store in stores:
            self.extendtm(store.units, store=store, sort=False)
        self.candidates.units.sort(sourcelencmp)

    def extendtm(self, units, store=None, sort=True):
        """Extends the memory with extra unit(s).

        @param units: The units to add to the TM.
        @param store: Optional store from where some metadata can be retrieved
        and associated with each unit.
        @param sort:  Optional parameter that can be set to False to suppress
        sorting of the candidates list. This should probably only be used in
        inittm().
        """
        if not isinstance(units, list):
            units = [units]
        candidates = filter(self.usable, units)
        for candidate in candidates:
            simpleunit = base.TranslationUnit("")
            # Avoid passing multistrings further, since some comparers (such
            # as the native Levenshtein module) cannot handle them.
            if isinstance(candidate.source, multistring):
                if len(candidate.source.strings) > 1:
                    simpleunit.orig_source = candidate.source
                    simpleunit.orig_target = candidate.target
                simpleunit.source = unicode(candidate.source)
                simpleunit.target = unicode(candidate.target)
            else:
                simpleunit.source = candidate.source
                simpleunit.target = candidate.target
            # Only keep translator notes so that programmer comments don't end
            # up in TM suggestions.
            simpleunit.addnote(candidate.getnotes(origin="translator"))
            simpleunit.fuzzy = candidate.isfuzzy()
            if store:
                simpleunit.filepath = store.filepath
                simpleunit.translator = store.translator
                simpleunit.date = store.date
            self.candidates.units.append(simpleunit)
        if sort:
            self.candidates.units.sort(sourcelencmp)

    def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
        """Sets the parameters without reinitialising the tm. If a parameter
        is not specified, it is set to the default, not ignored"""
        self.MAX_CANDIDATES = max_candidates
        self.MIN_SIMILARITY = min_similarity
        self.MAX_LENGTH = max_length

    def getstoplength(self, min_similarity, text):
        """Calculates a length beyond which we are not interested.
        The extra fat is because we don't use plain character distance only."""
        return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)

    def getstartlength(self, min_similarity, text):
        """Calculates the minimum length we are interested in.
        The extra fat is because we don't use plain character distance only."""
        return max(len(text) * (min_similarity/100.0), 1)

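    # Worked example (illustrative, using the defaults above): for a
    # 40-character query with min_similarity=75, getstartlength() gives
    # max(40 * 0.75, 1) = 30 and getstoplength() gives
    # min(40 / 0.75, 70) = 53.3, so matches() below only compares the query
    # against TM sources roughly 30 to 53 characters long.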
    def matches(self, text):
        """Returns a list of possible matches for given source text.

        @type text: String
        @param text: The text that will be searched for in the translation memory
        @rtype: list
        @return: a list of units with the source and target strings from the
        translation memory. If self.addpercentage is true (default) the match
        quality is given as a percentage in the notes.
        """
        bestcandidates = [(0.0, None)] * self.MAX_CANDIDATES
        heapq.heapify(bestcandidates)

        # We start with self.MIN_SIMILARITY, but once we have max_candidates
        # better than that, min_similarity is adjusted upwards for a speedup.
        min_similarity = self.MIN_SIMILARITY

        # Limit the search to candidates whose source length falls between
        # startlength and stoplength; anything outside that window cannot
        # reach the required similarity.
        startlength = self.getstartlength(min_similarity, text)
        startindex = 0
        for index, candidate in enumerate(self.candidates.units):
            if len(candidate.source) >= startlength:
                startindex = index
                break

        stoplength = self.getstoplength(min_similarity, text)

        for candidate in self.candidates.units[startindex:]:
            cmpstring = candidate.source
            if len(cmpstring) > stoplength:
                break
            similarity = self.comparer.similarity(text, cmpstring, min_similarity)
            if similarity < min_similarity:
                continue
            lowestscore = bestcandidates[0][0]
            if similarity > lowestscore:
                # Replace the weakest candidate on the heap and tighten the
                # thresholds if the heap now only holds better matches.
                heapq.heapreplace(bestcandidates, (similarity, candidate))
                if min_similarity < bestcandidates[0][0]:
                    min_similarity = bestcandidates[0][0]
                    stoplength = self.getstoplength(min_similarity, text)

        # Remove the empty placeholder entries:
        def notzero(item):
            score = item[0]
            return score != 0
        bestcandidates = filter(notzero, bestcandidates)
        # Sort and reverse so that the best candidate is at index 0
        bestcandidates.sort()
        bestcandidates.reverse()
        return self.buildunits(bestcandidates)

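    # For instance (illustrative), a query that matches a TM unit at 87%
    # similarity yields a po unit whose notes include "87%" when
    # self.addpercentage is True; buildunits() below attaches that note.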
    def buildunits(self, candidates):
        """Builds a list of units conforming to base API, with the score in the comment"""
        units = []
        for score, candidate in candidates:
            if hasattr(candidate, "orig_source"):
                candidate.source = candidate.orig_source
                candidate.target = candidate.orig_target
            newunit = po.pounit(candidate.source)
            newunit.target = candidate.target
            newunit.markfuzzy(candidate.fuzzy)
            newunit.filepath = candidate.filepath
            newunit.translator = candidate.translator
            newunit.date = candidate.date
            candidatenotes = candidate.getnotes().strip()
            if candidatenotes:
                newunit.addnote(candidatenotes)
            if self.addpercentage:
                newunit.addnote("%d%%" % score)
            units.append(newunit)
        return units

class terminologymatcher(matcher):
    """A matcher with settings specifically for terminology matching"""
    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
        if comparer is None:
            comparer = terminology.TerminologyComparer(max_length)
        matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
        self.addpercentage = False

    def inittm(self, store):
        """Normal initialisation, but convert all source strings to lower case"""
        matcher.inittm(self, store)
        for unit in self.candidates.units:
            unit.source = unit.source.lower()

    def matches(self, text):
        """Normal matching after converting text to lower case. Then replace
        with the original unit to retain comments, etc."""
        text = text.lower()
        matches = matcher.matches(self, text)
        return matches
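
# Example (an illustrative sketch; "glossary.po" is a hypothetical file of
# terminology units):
#
#     glossary = po.pofile(open("glossary.po"))
#     termmatcher = terminologymatcher(glossary)
#     for unit in termmatcher.matches("Open the file dialog"):
#         print unit.source, "->", unit.target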