Package cssutils :: Package scripts :: Module csscapture
[hide private]
[frames | no frames]

Source Code for Module cssutils.scripts.csscapture

  1  #!/usr/bin/env python 
  2  """Retrieve all CSS stylesheets including embedded for a given URL. 
  3  Retrieve as StyleSheetList or save to disk - raw, parsed or minified version. 
  4   
  5  TODO: 
  6  - maybe use DOM 3 load/save? 
  7  - logger class which handles all cases when no log is given... 
  8  - saveto: why does urllib2 hang? 
  9  """ 
 10  __all__ = ['CSSCapture'] 
 11  __docformat__ = 'restructuredtext' 
 12  __version__ = '$Id: csscapture.py 1116 2008-03-05 13:52:23Z cthedot $' 
 13   
 14  import codecs 
 15  import errno 
 16  import HTMLParser 
 17  import logging 
 18  import os 
 19  import sys 
 20  import urllib2 
 21  import urlparse 
 22   
 23  import cssutils 
 24  try: 
 25      import encutils 
 26  except ImportError: 
 27      try: 
 28          import cssutils.encutils as encutils 
 29      except ImportError: 
 30          sys.exit("You need encutils from http://cthedot.de/encutils/") 
 31   
class CSSCaptureHTMLParser(HTMLParser.HTMLParser):
    """
    Parses given data for ``link`` and ``style`` elements of type
    ``text/css``.

    links
        list of attrsdict, one for each matching ``link`` element
    styles
        list of (attrsdict, data), one for each matching ``style``
        element, ``data`` being the text content of the element

    Attribute names are lower-cased. Attribute values are kept as
    written (an ``href`` may be case sensitive), attributes given
    without a value are stored as empty string.
    """

    def __init__(self):
        # explicit base class call instead of super() so this also works
        # if HTMLParser is an old-style class
        HTMLParser.HTMLParser.__init__(self)
        # per instance state: class level lists would be shared by (and
        # accumulate over) all parsers ever created
        self.curtag = u''
        self.links = []
        self.styles = []

    def _lowerattrs(self, attrs):
        """
        Return dict for ``attrs``, a list of (name, value) tuples, with
        lower-cased names. A value of None is replaced by ``u''``.
        """
        return dict((name.lower(), value if value is not None else u'')
                    for name, value in attrs)

    def _iscss(self, attrs):
        """Return True if attrsdict ``attrs`` has type ``text/css``."""
        return attrs.get(u'type', u'').strip().lower() == u'text/css'

    def handle_starttag(self, tag, attrs):
        # close as style cannot contain any elements
        self.curtag = u''
        if tag not in (u'link', u'style'):
            return

        attrs = self._lowerattrs(attrs)
        if not self._iscss(attrs):
            return

        if tag == u'link':
            self.links.append(attrs)
        else:
            # also get content of tag, collected by handle_data
            self.styles.append((attrs, u''))
            self.curtag = tag

    def handle_data(self, data):
        if self.curtag == u'style':
            # content may be delivered in more than one chunk so append
            # to what has been collected for the current style element
            attrs, text = self.styles[-1]
            self.styles[-1] = (attrs, text + data)

    def handle_comment(self, data):
        # style might have comment content, treat same as data
        self.handle_data(data)

    def handle_endtag(self, tag):
        # close as style cannot contain any elements
        self.curtag = u''
69
class CSSCapture(object):
    """
    Retrieve all CSS stylesheets including embedded for a given URL.
    Optional setting of User-Agent used for retrieval possible
    to handle browser sniffing servers.

    An ``urllib2.HTTPError`` is logged and not raised: a stylesheet which
    cannot be retrieved is skipped, ``capture`` exits with ``sys.exit(1)``
    if the document itself cannot be retrieved.
    """

    def __init__(self, ua=None, log=None, defaultloglevel=logging.INFO):
        """
        initialize a new Capture object

        ua
            init User-Agent to use for requests
        log
            supply a log object which is used instead of the default
            log which writes to sys.stderr
        defaultloglevel
            constant of logging package which defines the level of the
            default log if no explicit log given
        """
        self._ua = ua
        # set to a fresh parser for each document in ``_findStyleSheets``
        self._parser = None

        # results, (re)set by ``capture``; defined here so that ``saveto``
        # before any ``capture`` saves nothing instead of failing
        self.stylesheetlist = []
        self.docencoding = None
        self._filename = u''
        self._nonparsed = {}

        if log is not None:
            self._log = log
        else:
            self._log = logging.getLogger('CSSCapture')
            # getLogger returns the same logger for every instance, adding
            # a handler each time would multiply every message
            if not self._log.handlers:
                hdlr = logging.StreamHandler(sys.stderr)
                hdlr.setFormatter(logging.Formatter('%(message)s'))
                self._log.addHandler(hdlr)
            self._log.setLevel(defaultloglevel)
            self._log.debug(u'Using default log')

    def _doRequest(self, url):
        """
        Does an HTTP request

        Returns: (response, url) or (None, None) if an HTTPError occured
        (which is logged)

        url might have been changed by server due to redirects etc
        """
        self._log.debug(u' CSSCapture._doRequest\n * URL: %s', url)

        req = urllib2.Request(url)
        if self._ua:
            req.add_header('User-agent', self._ua)
            self._log.info(' * Using User-Agent: %s', self._ua)
        try:
            res = urllib2.urlopen(req)
        except urllib2.HTTPError as e:
            self._log.critical(' %s\n%s %s\n%s',
                               e.geturl(), e.code, e.msg, e.headers)
            return None, None

        # get real url
        realurl = res.geturl()
        if url != realurl:
            url = realurl
            self._log.info(' URL retrieved: %s', url)

        return res, url

    def _createStyleSheet(self, href=None,
                          media=None,
                          parentStyleSheet=None,
                          title=u'',
                          cssText=None,
                          encoding=None):
        """
        returns CSSStyleSheet read from href or if cssText is given use that,
        returns None if the stylesheet could not be retrieved or decoded

        encoding
            used if inline style found, same as self.docencoding
        """
        # only None means "not given", an empty inline style is valid CSS
        if cssText is None:
            if not href:
                self._log.error(u' ERROR accessing CSS\n\t%s', href)
                return None

            requested = href
            res, href = self._doRequest(requested)
            if not res:
                # href is None now so report the URL asked for
                self._log.error(u' ERROR accessing CSS\n\t%s', requested)
                return None

            try:
                if not encoding:
                    media_type, encoding = encutils.getHTTPInfo(res)
                    if media_type != u'text/css':
                        self._log.warning(
                            u' WARNING: HTTP media type is different than expected "text/css": %r',
                            media_type)
                try:
                    cssText = codecs.getreader('css')(
                        res, encoding=encoding).read()
                except UnicodeDecodeError as e:
                    self._log.error(
                        u' Error retrieving CSS, probably encoding mismatch:\n\t%s\n\t%s',
                        href, e)
                    return None
            finally:
                res.close()

        sheet = cssutils.parseString(cssText)
        sheet.href = href
        sheet.media = media
        sheet._parentStyleSheet = parentStyleSheet
        sheet.title = title
        self._log.debug(u' * title: %s', title)
        if href:
            self._log.info(u' * href : %s', href)
        # media is optional in the signature so do not assume it is set
        if media is not None:
            self._log.info(u' * media: %s', media.mediaText)
        self._log.info(u' %s\n', sheet)
        self._log.debug(u' * cssText:\n%s\n', cssText)

        self._nonparsed[sheet] = cssText
        return sheet

    def _doImports(self, parentStyleSheet, baseurl=None, _chain=None):
        """
        handle all @import CSS stylesheet recursively
        found CSS stylesheets are appended to stylesheetlist

        baseurl
            used to build a full url of the @import href
        _chain
            internal: tuple of the URLs currently being expanded, used
            to stop at circular imports
        """
        if _chain is None:
            _chain = (baseurl,)

        for rule in parentStyleSheet.cssRules:
            if rule.type != rule.IMPORT_RULE:
                continue

            self._log.info(u'\n@import FOUND -----')
            self._log.debug(u' IN: %s\n', parentStyleSheet)
            href = urlparse.urljoin(baseurl, rule.href)
            if href in _chain:
                # a.css importing b.css importing a.css would never end
                self._log.warning(u' Skipping circular @import: %s', href)
                continue

            sheet = self._createStyleSheet(
                href=href,
                media=rule.media,
                parentStyleSheet=parentStyleSheet)
            if sheet:
                self.stylesheetlist.append(sheet)
                self._doImports(sheet, baseurl=href,
                                _chain=_chain + (href,))

    def _findStyleSheets(self, docurl, doctext):
        """
        parse text for stylesheets
        fills stylesheetlist with all found StyleSheets

        docurl
            to build a full url of found StyleSheets @href
        doctext
            to parse
        """
        # a fresh parser with its own result lists for each document,
        # otherwise results of earlier documents would be handled again
        parser = self._parser = CSSCaptureHTMLParser()
        parser.links = []
        parser.styles = []
        parser.feed(doctext)

        # <link>ed stylesheets, ownerNode should be set to the <link> node
        for link in parser.links:
            self._log.info(u'\n<link> FOUND -----')
            self._log.debug(u' %s\n', link)
            if not link.get(u'href', u''):
                # joining an empty href would fetch the document itself
                self._log.warning(u' Skipping <link> without href')
                continue
            href = urlparse.urljoin(docurl, link.get(u'href', u''))
            sheet = self._createStyleSheet(
                href=href,
                media=cssutils.stylesheets.MediaList(
                    link.get(u'media', u'')),
                title=link.get(u'title', u''))
            if sheet:
                self.stylesheetlist.append(sheet)
                self._doImports(sheet, baseurl=href)

        # internal <style> sheets
        # href is None for internal stylesheets
        # ownerNode should be set to the <style> node
        for stylemeta, cssText in parser.styles:
            self._log.info(u'\n<style> FOUND -----')
            self._log.debug(u' %s\n', stylemeta)
            sheet = self._createStyleSheet(
                media=cssutils.stylesheets.MediaList(
                    stylemeta.get(u'media', u'')),
                title=stylemeta.get(u'title', u''),
                cssText=cssText,
                encoding=self.docencoding)
            if sheet:
                self.stylesheetlist.append(sheet)
                self._doImports(sheet, baseurl=docurl)

    def capture(self, url, ua=None):
        """
        Capture stylesheets for the given url.

        url
            to capture CSS from
        ua
            User-Agent to use for requests

        Returns StyleSheetList.

        NOTE: if the document at ``url`` cannot be retrieved because of an
        HTTPError the error is logged and ``sys.exit(1)`` is called.
        """
        if ua is not None:
            self._ua = ua

        self._log.info(u'\nCapturing CSS from URL: %s\n', url)
        self.stylesheetlist = cssutils.stylesheets.StyleSheetList()

        # used to save inline styles
        scheme, loc, path, query, fragment = urlparse.urlsplit(url)
        self._filename = os.path.basename(path)

        # get url content
        res, url = self._doRequest(url)
        if not res:
            sys.exit(1)
        try:
            rawdoc = res.read()
            self.docencoding = encutils.getEncodingInfo(
                res, rawdoc, log=self._log).encoding
        finally:
            res.close()
        self._log.info(u'\nUsing Encoding: %s\n', self.docencoding)

        doctext = unicode(rawdoc, self.docencoding)

        # fill list of stylesheets and list of raw css
        self._nonparsed = {}
        self._findStyleSheets(url, doctext)

        return self.stylesheetlist

    def saveto(self, dir, saveraw=False, minified=False):
        """
        saves css in "dir" in the same layout as on the server
        internal stylesheets are saved as
        "dir/<name of captured document>_INLINE_<number>.css"

        dir
            directory to save files to
        saveraw
            save literal CSS from server instead of the parsed CSS
        minified
            save minified CSS (no effect on the saved text if ``saveraw``),
            switches the global cssutils serializer to minified output

        Both parsed and minified (which is also parsed of course) will
        lose information which cssutils is unable to understand or where
        it is simply buggy. You might want to first save the raw version
        before parsing or even minifying it.

        A stylesheet whose URL would be saved outside of "dir" is not
        saved, an error is logged instead.
        """
        if minified:
            cssutils.ser.prefs.useMinified()
        # name what is actually written: raw text wins over minified
        if saveraw:
            msg = 'raw'
        elif minified:
            msg = 'minified'
        else:
            msg = 'parsed'

        # with trailing separator so "dir2" is not taken to be in "dir"
        root = os.path.join(os.path.abspath(dir), '')

        inlines = 0
        for sheet in self.stylesheetlist:
            url = sheet.href
            if not url:
                inlines += 1
                url = '%s_INLINE_%s.css' % (
                    self._filename, inlines)

            # build savepath
            scheme, loc, path, query, fragment = urlparse.urlsplit(url)
            # no absolute path, "//x" must not stay absolute either
            path = os.path.normpath(path.lstrip('/'))
            path, fn = os.path.split(path)
            savepath = os.path.join(dir, loc, path)
            savefn = os.path.join(savepath, fn)

            # URLs come from the remote document: a path or host of ".."
            # must not make us write outside of the target directory
            if not os.path.abspath(savefn).startswith(root):
                self._log.error(
                    u'Not saving "%s": target is outside of "%s"', url, dir)
                continue

            try:
                os.makedirs(savepath)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
                self._log.debug(u'Path "%s" already exists.', savepath)

            if saveraw:
                cssText = self._nonparsed[sheet]
            else:
                cssText = sheet.cssText

            self._log.info(u'Saving %s "%s"', msg, savefn)
            # close the file even if encoding or writing fails
            with open(savefn, 'wb') as sf:
                codecs.getwriter('css')(sf).write(cssText)
337
def main(args=None):
    """
    Command line entry point: capture the stylesheets of a URL and save
    them or, with option ``--notsave``, only list them on stdout.

    args
        list of command line arguments (without the program name),
        defaults to ``sys.argv[1:]`` if None

    Exits via ``parser.error`` if no URL is given.
    """
    import optparse

    usage = "usage: %prog [options] URL"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-d', '--debug', action='store_true', dest='debug',
        help='show debug messages during capturing')
    parser.add_option('-m', '--minified', action='store_true', dest='minified',
        help='saves minified version of captured files')
    parser.add_option('-n', '--notsave', action='store_true', dest='notsave',
        help='if given files are NOT saved, only log is written')
    parser.add_option('-r', '--saveraw', action='store_true', dest='saveraw',
        help='if given saves raw css otherwise cssutils\' parsed files')
    parser.add_option('-s', '--saveto', action='store', dest='saveto',
        help='saving retrieved files to "saveto", defaults to "_CSSCapture_SAVED"')
    parser.add_option('-u', '--useragent', action='store', dest='ua',
        help='useragent to use for request of URL, default is urllib2s default')
    # hand over args, otherwise the parameter of main would be ignored
    options, urls = parser.parse_args(args)

    if not urls:
        # parser.error prints the usage and exits
        parser.error('no URL given')
    url = urls[0]

    if options.debug:
        dll = logging.DEBUG
    else:
        dll = logging.INFO

    # START
    c = CSSCapture(defaultloglevel=dll)

    stylesheetlist = c.capture(url, ua=options.ua)

    if not options.notsave:
        saveto = options.saveto or '_CSSCapture_SAVED'
        c.saveto(saveto, saveraw=options.saveraw, minified=options.minified)
    else:
        for i, s in enumerate(stylesheetlist):
            # same output as the former print statement
            sys.stdout.write(u'%d %s\n' % (
                i + 1,
                u'\ttitle: "%s", \n\thref : "%s"\n' % (s.title, s.href)))
if __name__ == "__main__":
    # run as a script: the return value of main() is the exit status
    raise SystemExit(main())