1
2 """Retrieve all CSS stylesheets including embedded for a given URL.
3 Retrieve as StyleSheetList or save to disk - raw, parsed or minified version.
4
5 TODO:
6 - maybe use DOM 3 load/save?
7 - logger class which handles all cases when no log is given...
8 - saveto: why does urllib2 hang?
9 """
10 __all__ = ['CSSCapture']
11 __docformat__ = 'restructuredtext'
12 __version__ = '$Id: csscapture.py 1116 2008-03-05 13:52:23Z cthedot $'
13
14 import codecs
15 import errno
16 import HTMLParser
17 import logging
18 import os
19 import sys
20 import urllib2
21 import urlparse
22
23 import cssutils
24 try:
25 import encutils
26 except ImportError:
27 try:
28 import cssutils.encutils as encutils
29 except ImportError:
30 sys.exit("You need encutils from http://cthedot.de/encutils/")
31
33 """ parses given data for link and style elements """
34 curtag = u''
35 links = []
36
37 styles = []
38
39
41 return dict([(a.lower(), v.lower()) for a, v in attrs])
42
44 if tag == u'link':
45 attrs = self._lowerattrs(attrs)
46 if attrs.get(u'type', u'') == u'text/css':
47 self.links.append(attrs)
48
49 elif tag == u'style':
50 attrs = self._lowerattrs(attrs)
51 if attrs.get(u'type', u'') == u'text/css':
52 self.styles.append((attrs, u''))
53 self.curtag = tag
54 else:
55
56 self.curtag = u''
57
61
65
69
71 """
72 Retrieve all CSS stylesheets including embedded for a given URL.
73 Optional setting of User-Agent used for retrieval possible
74 to handle browser sniffing servers.
75
76 raises urllib2.HTTPError
77 """
def __init__(self, ua=None, log=None, defaultloglevel=logging.INFO):
    """
    Initialize a new Capture object.

    ua
        User-Agent to use for requests
    log
        log object which is used instead of the default log, which
        writes to sys.stderr
    defaultloglevel
        constant of the logging package which defines the level of the
        default log; only used if no explicit log is given
    """
    self._ua = ua
    self._parser = CSSCaptureHTMLParser()

    if log is not None:
        self._log = log
    else:
        # logging.getLogger returns the same logger object for the same
        # name on every call, so attach the stderr handler only once;
        # otherwise each new instance would duplicate every message.
        self._log = logging.getLogger('CSSCapture')
        if not self._log.handlers:
            hdlr = logging.StreamHandler(sys.stderr)
            hdlr.setFormatter(logging.Formatter('%(message)s'))
            self._log.addHandler(hdlr)
        self._log.setLevel(defaultloglevel)
        self._log.debug(u'Using default log')
104
106 """
107 Does an HTTP request
108
109 Returns: (response, url)
110
111 url might have been changed by server due to redirects etc
112 """
113 self._log.debug(u' CSSCapture._doRequest\n * URL: %s' % url)
114
115 req = urllib2.Request(url)
116 if self._ua:
117 req.add_header('User-agent', self._ua)
118 self._log.info(' * Using User-Agent: %s', self._ua)
119 try:
120 res = urllib2.urlopen(req)
121 except urllib2.HTTPError, e:
122 self._log.critical(' %s\n%s %s\n%s' % (
123 e.geturl(), e.code, e.msg, e.headers))
124 return None, None
125
126
127 if url != res.geturl():
128 url = res.geturl()
129 self._log.info(' URL retrieved: %s', url)
130
131 return res, url
132
def _createStyleSheet(self, href=None,
                      media=None,
                      parentStyleSheet=None,
                      title=u'',
                      cssText=None,
                      encoding=None):
    """
    Return a CSSStyleSheet parsed from ``cssText`` or, if no cssText is
    given, from the content retrieved from ``href``.

    href
        URL of the style sheet; requested only if no cssText is given
    media
        MediaList which is set on the resulting sheet
    parentStyleSheet
        set as parent of the resulting sheet
    title
        title which is set on the resulting sheet
    cssText
        CSS source to parse instead of requesting ``href``
    encoding
        used to decode a retrieved style sheet; if not given the
        encoding reported for the HTTP response is used

    Returns None (after logging an error) if the style sheet cannot be
    retrieved or decoded or if neither href nor cssText is given.
    """
    if not cssText:
        if href:
            # keep the requested href: a failed request returns no URL
            res, finalurl = self._doRequest(href)
            if not res:
                self._log.error(u' ERROR accessing CSS\n\t%s' % href)
                return None
            href = finalurl

            if not encoding:
                media_type, encoding = encutils.getHTTPInfo(res)
                if media_type != u'text/css':
                    self._log.warning(u' WARNING: HTTP media type is different than expected "text/css": %r' %
                                      media_type)
            try:
                cssText = codecs.getreader('css')(res,
                                                  encoding=encoding).read()
            except UnicodeDecodeError as e:
                self._log.error(u' Error retrieving CSS, probably encoding mismatch:\n\t%s\n\t%s'
                                % (href, e))
                return None
        elif cssText is None:
            self._log.error(u' ERROR: neither href nor cssText given')
            return None
        # else: an empty inline style sheet, parsed as it is

    sheet = cssutils.parseString(cssText)
    sheet.href = href
    sheet.media = media
    sheet._parentStyleSheet = parentStyleSheet
    sheet.title = title
    self._log.debug(u' * title: %s', title)
    if href:
        self._log.info(u' * href : %s', href)
        if media is not None:
            self._log.info(u' * media: %s', media.mediaText)
    self._log.info(u' %s\n' % sheet)
    self._log.debug(u' * cssText:\n%s\n', cssText)

    # remember the unparsed source of the sheet
    self._nonparsed[sheet] = cssText
    return sheet
178
179 - def _doImports(self, parentStyleSheet, baseurl=None):
196
198 """
199 parse text for stylesheets
200 fills stylesheetlist with all found StyleSheets
201
202 docurl
203 to build a full url of found StyleSheets @href
204 doctext
205 to parse
206 """
207 self._parser.feed(doctext)
208
209 for link in self._parser.links:
210 self._log.info(u'\n<link> FOUND -----')
211 self._log.debug(u' %s\n' % link)
212 href = urlparse.urljoin(docurl, link.get(u'href', u''))
213 sheet = self._createStyleSheet(
214 href=href,
215 media=cssutils.stylesheets.MediaList(
216 link.get(u'media', u'')),
217 title=link.get(u'title', u''))
218 if sheet:
219 self.stylesheetlist.append(sheet)
220 self._doImports(sheet, baseurl=href)
221
222
223
224
225 for style in self._parser.styles:
226 stylemeta, cssText = style
227 self._log.info(u'\n<style> FOUND -----' )
228 self._log.debug(u' %s\n' % stylemeta)
229 sheet = self._createStyleSheet(
230 media=cssutils.stylesheets.MediaList(
231 stylemeta.get(u'media', u'')),
232 title=stylemeta.get(u'title', u''),
233 cssText=cssText,
234 encoding=self.docencoding)
235 if sheet:
236 self.stylesheetlist.append(sheet)
237 self._doImports(sheet, baseurl=docurl)
238
240 """
241 Capture stylesheets for the given url, any HTTPError is raised to
242 caller.
243
244 url
245 to capture CSS from
246 ua
247 User-Agent to use for requests
248
249 Returns StyleSheetList.
250 """
251 if ua is not None:
252 self._ua = ua
253
254 self._log.info(u'\nCapturing CSS from URL: %s\n', url)
255 self.stylesheetlist = cssutils.stylesheets.StyleSheetList()
256
257
258 scheme, loc, path, query, fragment = urlparse.urlsplit(url)
259 self._filename = os.path.basename(path)
260
261
262 res, url = self._doRequest(url)
263 if not res:
264 sys.exit(1)
265 rawdoc = res.read()
266
267 self.docencoding = encutils.getEncodingInfo(
268 res, rawdoc, log=self._log).encoding
269 self._log.info(u'\nUsing Encoding: %s\n', self.docencoding)
270
271 doctext = unicode(rawdoc, self.docencoding)
272
273
274 self._nonparsed = {}
275 self._findStyleSheets(url, doctext)
276
277 return self.stylesheetlist
278
def saveto(self, dir, saveraw=False, minified=False):
    """
    Save the captured style sheets below "dir", using the network
    location and path of each sheet's URL as the directory layout.
    Sheets without an href are saved as
    "dir/<document filename>_INLINE_<number>.css".

    dir
        directory to save files to
    saveraw
        save the literal CSS as retrieved instead of the parsed CSS
    minified
        save minified CSS; has no effect on the saved text if saveraw
        is set

    Both parsed and minified (which is also parsed of course) will
    lose information which cssutils is unable to understand or where
    it is simply buggy. You might want to first save the raw version
    before parsing or even minifying it.
    """
    def _safeparts(text):
        # normpath collapses "a/../b"; empty, "." and any remaining ".."
        # parts are dropped so a hostile URL cannot point above "dir"
        return [part for part in os.path.normpath(text).split(os.sep)
                if part not in ('', os.curdir, os.pardir)]

    msg = 'parsed'
    if minified:
        cssutils.ser.prefs.useMinified()
        msg = 'minified'
    if saveraw:
        # the raw text is written verbatim whatever the serializer prefs
        msg = 'raw'

    inlines = 0
    for sheet in self.stylesheetlist:
        url = sheet.href
        if not url:
            inlines += 1
            url = '%s_INLINE_%s.css' % (
                self._filename, inlines)

        # build savepath
        scheme, loc, path, query, fragment = urlparse.urlsplit(url)
        segments = _safeparts(path)
        if not segments:
            self._log.error(
                u'Cannot derive a filename from "%s", not saved.', url)
            continue
        savepath = os.path.join(dir, *(_safeparts(loc) + segments[:-1]))
        savefn = os.path.join(savepath, segments[-1])
        try:
            os.makedirs(savepath)
        except OSError as e:
            if e.errno != errno.EEXIST:
                # bare raise keeps the original traceback
                raise
            self._log.debug(u'Path "%s" already exists.', savepath)

        if saveraw:
            cssText = self._nonparsed[sheet]
        else:
            cssText = sheet.cssText

        self._log.info(u'Saving %s "%s"' % (msg, savefn))
        sf = open(savefn, 'wb')
        try:
            codecs.getwriter('css')(sf).write(cssText)
        finally:
            # close the file even if encoding or writing fails
            sf.close()
337
def main(args=None):
    """
    Command line entry point: capture the style sheets of the given URL
    and save them or, with --notsave, only list them.

    args
        list of command line arguments without the program name; if
        None the arguments of the running program (sys.argv[1:]) are
        parsed

    Exits with a usage error if no URL is given.
    """
    import optparse

    usage = "usage: %prog [options] URL"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-d', '--debug', action='store_true', dest='debug',
        help='show debug messages during capturing')
    parser.add_option('-m', '--minified', action='store_true', dest='minified',
        help='saves minified version of captured files')
    parser.add_option('-n', '--notsave', action='store_true', dest='notsave',
        help='if given files are NOT saved, only log is written')
    parser.add_option('-r', '--saveraw', action='store_true', dest='saveraw',
        help='if given saves raw css otherwise cssutils\' parsed files')
    parser.add_option('-s', '--saveto', action='store', dest='saveto',
        help='saving retrieved files to "saveto", defaults to "_CSSCapture_SAVED"')
    parser.add_option('-u', '--useragent', action='store', dest='ua',
        help='useragent to use for request of URL, default is urllib2s default')
    # pass args on so callers of main() can supply their own arguments
    options, urls = parser.parse_args(args)

    if not urls:
        # parser.error prints the usage message and exits
        parser.error('no URL given')
    url = urls[0]

    if options.debug:
        dll = logging.DEBUG
    else:
        dll = logging.INFO

    # START
    c = CSSCapture(defaultloglevel=dll)

    stylesheetlist = c.capture(url, ua=options.ua)

    if not options.notsave:
        saveto = options.saveto or '_CSSCapture_SAVED'
        c.saveto(saveto, saveraw=options.saveraw, minified=options.minified)
    else:
        for i, s in enumerate(stylesheetlist):
            # single pre-formatted argument: same output as the former
            # "print number, text" statement
            print(u'%s \ttitle: "%s", \n\thref : "%s"\n' % (
                i + 1, s.title, s.href))
381
382
383 if __name__ == "__main__":
384 sys.exit(main())
385