| Trees | Indices | Help | 
 | 
|---|
|  | 
 1  # -*- coding: utf-8 -*- 
 2  # 
 3  # Copyright 2006 Zuza Software Foundation 
 4  #  
 5  # This file is part of translate. 
 6  # 
 7  # translate is free software; you can redistribute it and/or modify 
 8  # it under the terms of the GNU General Public License as published by 
 9  # the Free Software Foundation; either version 2 of the License, or 
10  # (at your option) any later version. 
11  #  
12  # translate is distributed in the hope that it will be useful, 
13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
15  # GNU General Public License for more details. 
16  # 
17  # You should have received a copy of the GNU General Public License 
18  # along with translate; if not, write to the Free Software 
19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
20  # 
21   
22  """Module to deal with different types and uses of segmentation""" 
23   
 24  #XXX: This module is now deprecated: Use language-specific segmenters in the 
25  # lang package (character_iter, word_iter, sentence_iter, etc.). 
26   
27  punctuation = u".,;:!?-@#$%^*_()[]{}/\\'\"<>‘’‚‛“”„‟′″‴‵‶‷‹›«»±³¹²°¿©®×£¥" 
28   
30      """Returns an iterator over the characters in text.""" 
31      #We don't return more than one consecutive whitespace character 
32      prev = 'A' 
33      for c in text: 
34          if c.isspace() and prev.isspace(): 
35              continue 
36          prev = c 
37          if not (c in punctuation): 
38              yield c.lower() 
39   
43   
45      """Returns an iterator over the words in text.""" 
 46      #TODO: Consider replacing punctuation with space before split() 
47      for w in text.split(): 
48          yield w.strip(punctuation).lower() 
49   
53   
 55      """Returns an iterator over the sentences in text.""" 
56      #TODO: This is very naïve. We really should consider all punctuation, 
57      #and return the punctuation with the sentence. 
58      #TODO: Search for capital letter start with next sentence to avoid 
59      #confusion with abbreviations. And remember Afrikaans "'n" :-) 
60      for s in text.split(". "): 
61          yield s.strip() 
62           
66   
| Trees | Indices | Help | 
 | 
|---|
| Generated by Epydoc 3.0.1 on Wed Mar 26 12:49:41 2008 | http://epydoc.sourceforge.net |