 20  """reads a set of .po or .pot files to produce a pootle-terminology.pot""" 

from translate.storage import factory
from translate.lang import factory as lang_factory
from translate.storage import po
from translate.misc import optrecurse
import sys
import os
import re

class TerminologyOptionParser(optrecurse.RecursiveOptionParser):
    """a specialized Option Parser for the terminology tool..."""

    # matches C- and Python-style format specifiers, e.g. "%s", "%(name)d", "%1$s", "%-05.2f"
    formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
    # matches XML/HTML markup, e.g. "<b>", "</p>", "<?xml ...?>", "<!-- ... -->"
    xmlpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")

    sortorders = ["frequency", "dictionary", "length"]

    files = 0
    units = 0
 44          """parses the command line options, handling implicit input/output args""" 
 45          (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values) 
 46           
 47          if args and not options.input: 
 48              if not options.output and len(args) > 1: 
 49                  options.input = args[:-1] 
 50                  args = args[-1:] 
 51              else: 
 52                  options.input = args 
 53                  args = [] 
 54          if args and not options.output: 
 55              options.output = args[-1] 
 56              args = args[:-1] 
 57          if not options.output: 
 58              options.output = "pootle-terminology.pot" 
 59          if args: 
 60              self.error("You have used an invalid combination of --input, --output and freestanding args") 
 61          if isinstance(options.input, list) and len(options.input) == 1: 
 62              options.input = options.input[0] 
 63          return (options, args) 
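
    # Illustrative example (not part of the original source): running
    # "poterminology foo.po bar.po out.pot" ends up with options.input ==
    # ["foo.po", "bar.po"] and options.output == "out.pot", while a single
    # freestanding argument becomes the input and the output defaults to
    # "pootle-terminology.pot".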

    def set_usage(self, usage=None):
        """sets the usage string - if usage not given, uses getusagestring for each option"""
        if usage is None:
            self.usage = "%prog " + " ".join([self.getusagestring(option) for option in self.option_list]) + \
                    "\n  input directory is searched for PO files, terminology PO file is output file"
        else:
            super(TerminologyOptionParser, self).set_usage(usage)

    def recursiveprocess(self, options):
        """recurse through directories and process files"""
        if self.isrecursive(options.input, 'input') and getattr(options, "allowrecursiveinput", True):
            if isinstance(options.input, list):
                inputfiles = self.recurseinputfilelist(options)
            else:
                inputfiles = self.recurseinputfiles(options)
        else:
            if options.input:
                inputfiles = [os.path.basename(options.input)]
                options.input = os.path.dirname(options.input)
            else:
                inputfiles = [options.input]
        if os.path.isdir(options.output):
            options.output = os.path.join(options.output, "pootle-terminology.pot")
        self.stopwords = {}
        self.stoprelist = []
        actions = {'+': frozenset(), ':': frozenset(['skip']),
                   '<': frozenset(['phrase']), '=': frozenset(['word']),
                   '>': frozenset(['word', 'skip']),
                   '@': frozenset(['word', 'phrase'])}
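        # Stopword file format as parsed below (an illustrative summary, not a
        # normative spec); each line starts with a marker character:
        #   # comment   ignored
        #   +word       no restrictions on "word"
        #   :word       skip: never starts or ends a phrase, and does not
        #               count towards the --term-words length limit
        #   <word       phrase: breaks phrase runs
        #   =word       word: not recorded as a standalone term
        #   >word       word + skip
        #   @word       word + phrase (effectively ignored entirely)
        #   /regex      words matching regex are treated as word + phrase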
        if options.stopwordfile is not None:
            stopfile = open(options.stopwordfile, "r")
            try:
                for stopline in stopfile:
                    stoptype = stopline[0]
                    if stoptype == '#' or stoptype == "\n":
                        continue
                    elif stoptype == '/':
                        self.stoprelist.append(re.compile(stopline[1:-1] + '$'))
                    else:
                        self.stopwords[stopline[1:-1]] = actions[stoptype]
            except KeyError:
                self.warning("Bad line in stopword list %s starts with '%s'" % (options.stopwordfile, stoptype), options, sys.exc_info())
            stopfile.close()
        self.glossary = {}
        self.initprogressbar(inputfiles, options)
        for inputpath in inputfiles:
            self.files += 1
            fullinputpath = self.getfullinputpath(options, inputpath)
            try:
                success = self.processfile(None, options, fullinputpath)
            except Exception, error:
                if isinstance(error, KeyboardInterrupt):
                    raise
                self.warning("Error processing: input %s" % (fullinputpath), options, sys.exc_info())
                success = False
            self.reportprogress(inputpath, success)
        del self.progressbar
        self.outputterminology(options)

    def clean(self, string, options):
        """returns the cleaned string that contains the text to be matched"""
        for accelerator in options.accelchars:
            string = string.replace(accelerator, "")
        string = self.formatpat.sub(" ", string)
        string = self.xmlpat.sub(" ", string)
        string = string.strip()
        return string
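
    # Illustrative example (not from the original source): with
    # options.accelchars == "&", clean("&Save %s to <b>disk</b>", options)
    # returns "Save   to  disk"; the specifier and the markup are replaced
    # by spaces (internal whitespace is not collapsed) and only leading and
    # trailing whitespace is stripped.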

    def addphrases(self, words, skips, translation, partials=True):
        """adds (sub)phrases with non-skipwords and more than one word"""
        if (len(words) > skips + 1 and
            'skip' not in self.stopwords.get(words[0], frozenset()) and
            'skip' not in self.stopwords.get(words[-1], frozenset())):
            self.glossary.setdefault(' '.join(words), []).append(translation)
        if partials:
            part = list(words)
            while len(part) > 2:
                if 'skip' in self.stopwords.get(part.pop(), frozenset()):
                    skips -= 1
                if (len(part) > skips + 1 and
                    'skip' not in self.stopwords.get(part[0], frozenset()) and
                    'skip' not in self.stopwords.get(part[-1], frozenset())):
                    self.glossary.setdefault(' '.join(part), []).append(translation)
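
    # Illustrative example (not part of the original source): with no
    # stopwords, addphrases(["quick", "brown", "fox"], 0, t) records
    # "quick brown fox" and the right-trimmed partial "quick brown";
    # left-trimmed subphrases are generated by the callers in processfile,
    # which pop words off the front of the sliding window.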

    def processfile(self, fileprocessor, options, fullinputpath):
 159          """process an individual file""" 
160          inputfile = self.openinputfile(options, fullinputpath) 
161          inputfile = factory.getobject(inputfile) 
162          sourcelang = lang_factory.getlanguage(options.sourcelanguage) 
163          rematchignore = frozenset(('word','phrase')) 
164          defaultignore = frozenset() 
165          for unit in inputfile.units: 
166              self.units += 1 
167              if unit.isheader() or not unit.istranslated(): 
168                  continue 
169              if unit.hasplural(): 
170                  continue 
171              if not options.invert: 
172                  source = self.clean(unit.source, options) 
173                  target = self.clean(unit.target, options) 
174              else: 
175                  target = self.clean(unit.source, options) 
176                  source = self.clean(unit.target, options) 
177              if len(source) <= 1: 
178                  continue 
179              for sentence in sourcelang.sentences(source): 
180                  words = [] 
181                  skips = 0 
182                  for word in sourcelang.words(sentence): 
183                      if options.ignorecase or (options.foldtitle and word.istitle()): 
184                          word = word.lower() 
185                      ignore = defaultignore 
186                      if word in self.stopwords: 
187                          ignore = self.stopwords[word] 
188                      else: 
189                          for stopre in self.stoprelist: 
190                              if stopre.match(word) != None: 
191                                  ignore = rematchignore 
192                                  break 
193                      translation = (source, target, unit, fullinputpath) 
194                      if 'word' not in ignore: 
195                           
196                          root = word 
197                          if len(word) > 3 and word[-1] == 's' and word[0:-1] in self.glossary: 
198                              root = word[0:-1] 
199                          elif len(root) > 2 and root + 's' in self.glossary: 
200                              self.glossary[root] = self.glossary.pop(root + 's') 
201                          self.glossary.setdefault(root, []).append(translation) 
202                      if 'phrase' in ignore: 
203                           
204                          while len(words) > 2: 
205                              if 'skip' in self.stopwords.get(words.pop(0),defaultignore): 
206                                  skips -= 1 
207                              self.addphrases(words, skips, translation) 
208                          words = [] 
209                          skips = 0 
210                      else: 
211                          words.append(word) 
212                          if 'skip' in ignore: 
213                              skips += 1 
214                          if len(words) > options.termlength + skips: 
215                              while len(words) > options.termlength + skips: 
216                                  if 'skip' in self.stopwords.get(words.pop(0),defaultignore): 
217                                      skips -= 1 
218                                  self.addphrases(words, skips, translation) 
219                          else: 
220                              self.addphrases(words, skips, translation, partials=False) 
221                   
222                  while len(words) > 2: 
223                      if 'skip' in self.stopwords.get(words.pop(0),defaultignore): 
224                          skips -= 1 
225                      self.addphrases(words, skips, translation) 
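
    # Illustrative walkthrough (not part of the original source): with
    # --term-words=3 and no stopwords, the sentence "one two three four"
    # yields the single-word terms "one" through "four" plus the phrases
    # "one two", "one two three", "two three", "two three four" and
    # "three four".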

    def outputterminology(self, options):
        """saves the generated terminology glossary"""
        termfile = po.pofile()
        terms = {}
        locre = re.compile(r":[0-9]+$")
        print "%d terms from %d units in %d files" % (len(self.glossary), self.units, self.files)
        for term, translations in self.glossary.iteritems():
            if len(translations) <= 1:
                continue
            filecounts = {}
            sources = {}
            termunit = po.pounit(term)
            locations = {}
            sourcenotes = {}
            transnotes = {}
            targets = {}
            fullmsg = False
            for source, target, unit, filename in translations:
                sources[source] = 1
                filecounts[filename] = filecounts.setdefault(filename, 0) + 1
                if term.lower() == self.clean(unit.source, options).lower():
                    fullmsg = True
                    target = self.clean(unit.target, options)
                    if options.ignorecase or (options.foldtitle and target.istitle()):
                        target = target.lower()
                    unit.settarget(target)
                    if target != "":
                        targets.setdefault(target, []).append(filename)
                    if term.lower() == unit.source.strip().lower():
                        sourcenotes[unit.getnotes("source code")] = None
                        transnotes[unit.getnotes("translator")] = None
                else:
                    unit.settarget("")
                unit.setsource(term)
                termunit.merge(unit, overwrite=False, comments=False)
                for loc in unit.getlocations():
                    locations.setdefault(locre.sub("", loc))
            numsources = len(sources)
            numfiles = len(filecounts)
            numlocs = len(locations)
            if numfiles < options.inputmin or numlocs < options.locmin:
                continue
            if fullmsg:
                if numsources < options.fullmsgmin:
                    continue
            elif numsources < options.substrmin:
                continue
            if len(targets.keys()) > 1:
                txt = '; '.join(["%s {%s}" % (target, ', '.join(files))
                                 for target, files in targets.iteritems()])
                if termunit.gettarget().find('};') < 0:
                    termunit.settarget(txt)
                    termunit.markfuzzy()
                else:
                    # target already contains a '};' annotation; record the
                    # alternatives as a translator note instead
                    termunit.addnote(txt, "translator")
            for location in locations.keys():
                termunit.addlocation(location)
            for sourcenote in sourcenotes.keys():
                termunit.addnote(sourcenote, "source code")
            for transnote in transnotes.keys():
                termunit.addnote(transnote, "translator")
            for filename, count in filecounts.iteritems():
                termunit.othercomments.append("# (poterminology) %s (%d)\n" % (filename, count))
            # score: each contributing file counts ten times as much as a
            # distinct source message
            terms[term] = ((10 * numfiles) + numsources, termunit)
        # drop subphrases that only ever occur inside a longer phrase
        # (an identical score implies identical occurrences)
        termlist = terms.keys()
        print "%d terms after thresholding" % len(termlist)
        termlist.sort(lambda x, y: cmp(len(x), len(y)))
        for term in termlist:
            words = term.split()
            if len(words) <= 2:
                continue
            while len(words) > 2:
                words.pop()
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
            words = term.split()
            while len(words) > 2:
                words.pop(0)
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
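        # e.g. (illustrative): if "status bar" ends up with exactly the same
        # score as "status bar text", the shorter term never occurred on its
        # own and is dropped in favour of the longer phrase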
        print "%d terms after subphrase reduction" % len(terms.keys())
        termitems = terms.values()
        if options.sortorders is None:
            options.sortorders = self.sortorders
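        # orders are popped from the end of the list, so with Python's stable
        # sort the first-specified --sort order becomes the primary key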
        while len(options.sortorders) > 0:
            order = options.sortorders.pop()
            if order == "frequency":
                termitems.sort(lambda x, y: cmp(y[0], x[0]))
            elif order == "dictionary":
                termitems.sort(lambda x, y: cmp(x[1].source.lower(), y[1].source.lower()))
            elif order == "length":
                termitems.sort(lambda x, y: cmp(len(x[1].source), len(y[1].source)))
            else:
                self.warning("unknown sort order %s" % order, options)
        for count, unit in termitems:
            termfile.units.append(unit)
        outputfile = open(options.output, "w")
        outputfile.write(str(termfile))
        outputfile.close()

def main():
    formats = {"po": ("po", None), None: ("po", None)}
    parser = TerminologyOptionParser(formats)
    parser.add_option("-I", "--ignore-case", dest="ignorecase",
                      action="store_true", default=False, help="make all terms lowercase")
    parser.add_option("-F", "--fold-titlecase", dest="foldtitle",
                      action="store_true", default=False, help="fold \"Title Case\" to lowercase")
    parser.add_option("", "--accelerator", dest="accelchars", default="",
                      metavar="ACCELERATORS", help="ignores the given accelerator characters when matching")
    parser.add_option("-t", "--term-words", type="int", dest="termlength", default="3",
                      help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
    parser.add_option("", "--inputs-needed", type="int", dest="inputmin", default="2",
                      help="omit terms appearing in fewer than MIN input files (default 2)", metavar="MIN")
    parser.add_option("", "--fullmsg-needed", type="int", dest="fullmsgmin", default="1",
                      help="omit full message terms appearing in fewer than MIN different messages (default 1)", metavar="MIN")
    parser.add_option("", "--substr-needed", type="int", dest="substrmin", default="2",
                      help="omit substring-only terms appearing in fewer than MIN different messages (default 2)", metavar="MIN")
    parser.add_option("", "--locs-needed", type="int", dest="locmin", default="2",
                      help="omit terms appearing in fewer than MIN different original source files (default 2)", metavar="MIN")
    parser.add_option("", "--sort", dest="sortorders", action="append",
                      type="choice", choices=parser.sortorders, metavar="ORDER",
                      help="output sort order(s): %s (default is all orders in the above priority)" % ', '.join(parser.sortorders))
    parser.add_option("-S", "--stopword-list", type="string", dest="stopwordfile",
                      help="name of file containing stopword list", metavar="FILENAME")
    parser.add_option("", "--source-language", dest="sourcelanguage", default="en",
                      help="the source language code (default 'en')", metavar="LANG")
    parser.add_option("-v", "--invert", dest="invert",
                      action="store_true", default=False, help="invert the source and target languages for terminology")
    parser.set_usage()
    parser.description = __doc__
    parser.run()


if __name__ == '__main__':
    main()