1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18   
 19   
 20   
 21   
 22   
 23  """module for parsing html files for translation""" 
 24   
 25  import re 
 26  from translate.storage import base 
 27  from HTMLParser import HTMLParser 
 28   
 30      """A unit of translatable/localisable HTML content""" 
 34   
 38       
 41      source = property(getsource, setsource) 
 42   
 44          self.locations.append(location) 
  45   
  48   
 49   
 50 -class htmlfile(HTMLParser, base.TranslationStore): 
  51      UnitClass = htmlunit 
 52      markingtags = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td", "div", "li", "dt", "dd", "address", "caption"] 
 53      markingattrs = [] 
 54      includeattrs = ["alt", "summary", "standby", "abbr", "content"] 
 55   
 56 -    def __init__(self, includeuntaggeddata=None, inputfile=None): 
  57          self.units = [] 
 58          self.filename = getattr(inputfile, 'name', None)  
 59          self.currentblock = "" 
 60          self.currentblocknum = 0 
 61          self.currenttag = None 
 62          self.includeuntaggeddata = includeuntaggeddata 
 63          HTMLParser.__init__(self) 
 64   
 65          if inputfile is not None: 
 66              htmlsrc = inputfile.read() 
 67              inputfile.close() 
 68              self.parse(htmlsrc) 
  69   
 71          """Returns the encoding of the html text. 
 72           
 73          We look for 'charset=' within a meta tag to do this. 
 74          """ 
 75   
 76          pattern = '''(?i)<meta.*content.*=.*charset.*=\\s*([^\\s]*)\\s*["']''' 
 77          result = re.findall(pattern, htmlsrc) 
 78          encoding = None 
 79          if result: 
 80              encoding = result[0] 
 81          return encoding 
  82   
 84          """Return the html text properly encoded based on a charset.""" 
 85          charset = self.guess_encoding(htmlsrc) 
 86          if charset: 
 87              return htmlsrc.decode(charset) 
 88          else: 
 89              return htmlsrc 
  90   
 91 -    def parse(self, htmlsrc): 
  92          htmlsrc = self.do_encoding(htmlsrc) 
 93          self.feed(htmlsrc) 
  94   
101   
103          """Strip unnecessary html from the text. 
104           
105          HTML tags are deemed unnecessary if it fully encloses the translatable 
106          text, eg. '<a href="index.html">Home Page</a>'. 
107   
108          HTML tags that occurs within the normal flow of text will not be removed, 
109          eg. 'This is a link to the <a href="index.html">Home Page</a>.' 
110          """ 
111          text = text.strip() 
112   
113          pattern = '(?s)^<[^>]*>(.*)</.*>$' 
114          result = re.findall(pattern, text) 
115          if len(result) == 1: 
116              text = self.strip_html(result[0]) 
117          return text 
 118   
120          """Check if the supplied HTML snippet has any content that needs to be translated.""" 
121   
122          text = text.strip() 
123          result = re.findall('(?i).*(charset.*=.*)', text) 
124          if len(result) == 1: 
125              return False 
126   
127           
128          if text == ' ': 
129              return False 
130   
131          pattern = '<[^>]*>' 
132          result = re.sub(pattern, '', text).strip() 
133          if result: 
134              return True 
135          else: 
136              return False 
 137   
138   
139   
141          self.addhtmlblock(self.currentblock) 
142          self.currentblock = "" 
143          self.currenttag = tag 
 144   
146          self.addhtmlblock(self.currentblock) 
147          self.currentblock = "" 
148          self.currenttag = None 
 149   
151          newblock = 0 
152          if tag in self.markingtags: 
153              newblock = 1 
154          for attrname, attrvalue in attrs: 
155              if attrname in self.markingattrs: 
156                  newblock = 1 
157              if attrname in self.includeattrs: 
158                  self.addhtmlblock(attrvalue) 
159   
160          if newblock: 
161              self.startblock(tag) 
162          elif self.currenttag is not None: 
163              self.currentblock += self.get_starttag_text() 
 164   
166          for attrname, attrvalue in attrs: 
167              if attrname in self.includeattrs: 
168                  self.addhtmlblock(attrvalue) 
169          if self.currenttag is not None: 
170              self.currentblock += self.get_starttag_text() 
 171   
173          if tag == self.currenttag: 
174              self.endblock() 
175          elif self.currenttag is not None:  
176              self.currentblock += '</%s>' % tag 
 177   
179          if self.currenttag is not None: 
180              self.currentblock += data 
181          elif self.includeuntaggeddata: 
182              self.startblock(None) 
183              self.currentblock += data 
 184   
187   
190   
 194   
197