1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18   
 19   
 20   
 21   
 22  """This module contains all the common features for languages. 
 23   
 24  Supported features: 
 25  language code (km, af) 
 26  language name (Khmer, Afrikaans) 
 27  Plurals 
 28    Number of plurals (nplurals) 
 29    Plural equation 
 30  pofilter tests to ignore 
 31   
 32  Segmentation 
 33    characters 
 34    words 
 35    sentences 
 36   
 37  TODO: 
 38  Ideas for possible features: 
 39   
 40  Language-Team information 
 41   
 42  Segmentation 
 43    phrases 
 44   
 45  Punctuation 
 46    End of sentence 
 47    Start of sentence 
 48    Middle of sentence 
 49    Quotes 
 50      single 
 51      double 
 52   
 53  Valid characters 
 54  Accelerator characters 
 55  Special characters 
 56  Direction (rtl or ltr) 
 57  """ 
 58   
 59  from translate.lang import data 
 60  import re 
 61   
 63      """This class is the common parent class for all language classes.""" 
 64       
 65      code = "" 
 66      """The ISO 639 language code, possibly with a country specifier or other  
 67      modifier. 
 68       
 69      Examples: 
 70          km 
 71          pt_BR 
 72          sr_YU@Latn 
 73      """ 
 74   
 75      fullname = "" 
 76      """The full (English) name of this language. 
 77   
 78      Dialect codes should have the form of  
 79        Khmer 
 80        Portuguese (Brazil)
 81        #TODO: sr_YU@Latn? 
 82      """ 
 83       
 84      nplurals = 0 
 85      """The number of plural forms of this language. 
 86       
 87      0 is not a valid value - it must be overridden. 
 88      Any positive integer is valid (it should probably be between 1 and 6) 
 89      Also see data.py 
 90      """ 
 91       
 92      pluralequation = "0" 
 93      """The plural equation for selection of plural forms.  
 94   
 95      This is used for PO files to fill into the header. 
 96      See U{http://www.gnu.org/software/gettext/manual/html_node/gettext_150.html}. 
 97      Also see data.py 
 98      """ 
 99       
100      listseperator = u", " 
101      """This string is used to separate lists of textual elements. Most
102      languages probably can stick with the default comma, but Arabic and some 
103      Asian languages might want to override this.""" 
104       
105      commonpunc = u".,;:!?-@#$%^*_()[]{}/\\'`\"<>" 
106      """These punctuation marks are common in English and most languages that  
107      use latin script.""" 
108   
109      quotes = u"‘’‛“”„‟′″‴‵‶‷‹›«»" 
110      """These are different quotation marks used by various languages.""" 
111   
112      invertedpunc = u"¿¡" 
113      """Inverted punctuation sometimes used at the beginning of sentences in
114      Spanish, Asturian, Galician, and Catalan.""" 
115   
116      rtlpunc = u"،؟؛÷" 
117      """These punctuation marks are used by Arabic and Persian, for example.""" 
118   
119      CJKpunc = u"。、,;!?「」『』【】" 
120      """These punctuation marks are used in certain circumstances with CJK  
121      languages.""" 
122   
123      indicpunc = u"।॥॰" 
124      """These punctuation marks are used by several Indic languages.""" 
125   
126      ethiopicpunc = u"።፤፣" 
127      """These punctuation marks are used by several Ethiopic languages.""" 
128   
129      miscpunc = u"…±°¹²³·©®×£¥€" 
130      """The middle dot (·) is used by Greek and Georgian.""" 
131   
132      punctuation = u"".join([commonpunc, quotes, invertedpunc, rtlpunc, CJKpunc,\ 
133              indicpunc, ethiopicpunc, miscpunc]) 
134      """We include many types of punctuation here, simply since this is only  
135      meant to determine if something is punctuation. Hopefully we catch some  
136      languages which might not be represented with modules. Most languages won't  
137      need to override this.""" 
138   
139      sentenceend = u".!?…։؟।。!?።" 
140      """These marks can indicate a sentence end. Once again we try to account  
141      for many languages. Most languages won't need to override this."""
142   
143       
144       
145       
146       
147      sentencere = re.compile(r"""(?s)    #make . also match newlines 
148                              .*?         #anything, but match non-greedy 
149                              [%s]        #the puntuation for sentence ending 
150                              \s+         #the spacing after the puntuation 
151                              (?=[^a-z\d])#lookahead that next part starts with caps 
152                              """ % sentenceend, re.VERBOSE) 
153       
154      puncdict = {} 
155      """A dictionary of punctuation transformation rules that can be used by punctranslate().""" 
156   
157      ignoretests = [] 
158      """List of pofilter tests for this language that must be ignored.""" 
159   
160      checker = None 
161      """A language specific checker (see filters.checks). 
162   
163      This doesn't need to be supplied, but will be used if it exists.""" 
164   
166          """This constructor is used if we need to instantiate an object (not
167          the usual setup). This will mostly happen when the factory is asked for a
168          language for which we don't have a dedicated class."""
169          self.code = code or ""  # a falsy code is stored as the empty string
170          while code:  # repeat the lookup with whatever data.simplercode() returns until a match is found or code becomes falsy
171              langdata = data.languages.get(code, None)
172              if langdata:
173                  self.fullname, self.nplurals, self.pluralequation = langdata  # langdata is unpacked as exactly three values
174                  break
175              code = data.simplercode(code)  # presumably a less specific code, falsy when none remains -- confirm in data.py
176          if not code:  # true only when the loop ended without a match (or code was falsy to begin with)
177
178              pass  # NOTE(review): this branch does nothing as written
 179   
181          """Give a simple string representation without address information to
182          be able to store it in text for comparison later."""
183          detail = ""  # no suffix when self.code is empty
184          if self.code:
185              detail = "(%s)" % self.code  # the language code in parentheses
186          return "<class 'translate.lang.common.Common%s'>" % detail  # the class path is a fixed literal, not derived from the instance's actual class
 187   
199      punctranslate = classmethod(punctranslate) 
200   
202          """Returns an iterator over the characters in text, leaving out punctuation and repeated whitespace."""
203
204          prev = 'A'  # non-whitespace starting value, so whitespace at the very start of text is not seen as a repeat
205          for c in text:
206              if c.isspace() and prev.isspace():  # whitespace directly after a kept whitespace character is dropped
207                  continue
208              prev = c  # updated for punctuation too, so whitespace on both sides of a punctuation mark is yielded
209              if not (c in cls.punctuation):  # characters listed in cls.punctuation are never yielded
210                  yield c
 211      character_iter = classmethod(character_iter)  # rebinds the function as a classmethod
212   
216      characters = classmethod(characters) 
217   
219          """Returns an iterator over the words in text: whitespace-separated pieces with surrounding punctuation removed."""
220
221          for w in text.split():  # split() with no argument splits on runs of whitespace
222              word = w.strip(cls.punctuation)  # strips cls.punctuation characters from both ends only; punctuation inside a word is kept
223              if word:  # pieces consisting only of punctuation are left out
224                  yield word
 225      word_iter = classmethod(word_iter)  # rebinds the function as a classmethod
226   
228          """Returns a list of words in text, i.e. everything cls.word_iter(text) yields."""
229          return [w for w in cls.word_iter(text)]  # NOTE(review): a plain copy of the iterator; equivalent to list(cls.word_iter(text))
 230      words = classmethod(words)  # rebinds the function as a classmethod
231   
233          """Returns an iterator over the sentences in text, as matched by cls.sentencere; text after the last match is yielded as the final sentence."""
234          lastmatch = 0  # end offset of the most recent match; stays 0 when nothing matches
235          iter = cls.sentencere.finditer(text)  # NOTE(review): the local name iter shadows the builtin within this function
236          for item in iter:
237              lastmatch = item.end()
238              sentence = item.group()  # the whole match, including the whitespace after the sentence-ending mark
239              if strip: sentence = sentence.strip()  # strip is a parameter of this method; when true, surrounding whitespace is removed
240              if sentence: yield sentence  # empty results are never yielded
241          remainder = text[lastmatch:]  # whatever follows the last match (all of text if there was no match)
242          if strip: remainder = remainder.strip()
243          if remainder: yield remainder
 244      sentence_iter = classmethod(sentence_iter)  # rebinds the function as a classmethod
245               
247          """Returns a list of sentences in text, i.e. everything cls.sentence_iter(text, strip=strip) yields."""
248          return [s for s in cls.sentence_iter(text, strip=strip)]  # strip is passed through unchanged
 249      sentences = classmethod(sentences)  # rebinds the function as a classmethod
250   
252          """Determines whether the text starts with a capital letter, ignoring leading whitespace and then leading punctuation."""
253          stripped = text.lstrip().lstrip(cls.punctuation)  # whitespace is stripped first, then punctuation; whitespace that follows the punctuation is not stripped
254          return stripped and stripped[0].isupper()  # NOTE(review): when nothing is left after stripping, the empty string itself is returned rather than False
 255      capsstart = classmethod(capsstart)  # rebinds the function as a classmethod
 256