| Trees | Indices | Help | 
 | 
|---|
|  | 
 1  # -*- coding: utf-8 -*- 
 2  #  
 3  # Copyright 2006 Zuza Software Foundation 
 4  #  
 5  # This file is part of translate. 
 6  # 
 7  # translate is free software; you can redistribute it and/or modify 
 8  # it under the terms of the GNU General Public License as published by 
 9  # the Free Software Foundation; either version 2 of the License, or 
10  # (at your option) any later version. 
11  #  
12  # translate is distributed in the hope that it will be useful, 
13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
15  # GNU General Public License for more details. 
16  # 
17  # You should have received a copy of the GNU General Public License 
18  # along with translate; if not, write to the Free Software 
19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
20   
21  """A class that does terminology matching""" 
22   
23  import re 
24   
25  # We don't want to miss certain forms of words that only change a little 
26  # at the end. Now we are tying this code to English, but it should serve 
27  # us well. For example "category" should be found in "categories",  
28  # "copy" should be found in "copied" 
29  # 
 30  # The tuples define a regular expression to search for, and what 
 31  # it should be replaced with. 
# (regex, replacement) pairs applied to a term with re.sub() to produce an
# alternative spelling of it. The regexes are raw strings so that escapes
# such as \s reach the re module unchanged instead of relying on Python
# passing unknown string escapes through.
ignorepatterns = [
    (r"y\s*$", "ie"),   # category/categories, identify/identifies, apply/applied
    (r"[\s-]*", ""),    # down time / downtime, pre-order / preorder
    ("-", " "),         # pre-order / pre order
    (" ", "-"),         # pre order / pre-order
]
37   
38  #TODO: compile regexes 
39   
43   
45          """returns the match quality of term b in the text a""" 
46          # We could segment the words, but mostly it will give less ideal  
47          # results, since we'll miss plurals, etc. Then we also can't search for 
48          # multiword terms, such as "Free Software". Ideally we should use a  
49          # stemmer, like the Porter stemmer. 
50           
51          # So we just see if the word occurs anywhere. This is not perfect since 
52          # we might get more than we bargained for. The term "form" will be found 
53          # in the word "format", for example. A word like "at" will trigger too 
54          # many false positives.  
55   
56          # First remove a possible disambiguating bracket at the end 
57          b = re.sub("\s+\(.*\)\s*$", "", b) 
58   
59          if len(b) <= 2: 
60              return 0 
61               
62          pos = a[:self.MAX_LEN].find(b) 
63          if pos >= 0: 
64              return 100 - pos * 10 / len(a[:self.MAX_LEN]) 
65   
66          for ignorepattern in ignorepatterns: 
67              newb = re.sub(ignorepattern[0], ignorepattern[1], b) 
68              if newb in a[:self.MAX_LEN]: 
69                  return 80 
70          return 0 
71   
| Trees | Indices | Help | 
 | 
|---|
| Generated by Epydoc 3.0.1 on Wed Mar 26 12:49:37 2008 | http://epydoc.sourceforge.net |