
Source Code for Module cssutils.tokenize2

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""New CSS Tokenizer (a generator)
"""
__all__ = ['Tokenizer', 'CSSProductions']
__docformat__ = 'restructuredtext'
__version__ = '$Id: tokenize2.py 1382 2008-07-15 20:38:08Z cthedot $'

import os
import re
import string
import xml.dom
import cssutils
import util
from cssproductions import *

class Tokenizer(object):
    """
    generates a list of Token tuples:
        (Tokenname, value, startline, startcolumn)
    """
    _atkeywords = {
        u'@font-face': CSSProductions.FONT_FACE_SYM,
        u'@import': CSSProductions.IMPORT_SYM,
        u'@media': CSSProductions.MEDIA_SYM,
        u'@namespace': CSSProductions.NAMESPACE_SYM,
        u'@page': CSSProductions.PAGE_SYM
        }
    _linesep = u'\n'

    def __init__(self, macros=None, productions=None):
        """
        inits tokenizer with the given macros and productions, which
        default to cssutils' own macros and productions
        """
        self.log = cssutils.log
        if not macros:
            macros = MACROS
        if not productions:
            productions = PRODUCTIONS
        self.tokenmatches = self._compile_productions(
            self._expand_macros(macros, productions))
        self.commentmatcher = [x[1] for x in self.tokenmatches
                               if x[0] == 'COMMENT'][0]
        self.urimatcher = [x[1] for x in self.tokenmatches
                           if x[0] == 'URI'][0]
        # matches a CSS unicode escape plus one optional terminating
        # whitespace character
        self.unicodesub = re.compile(
            r'\\[0-9a-fA-F]{1,6}(?:\r\n|[\t\r\n\f\x20])?').sub

    def _expand_macros(self, macros, productions):
        """returns macro expanded productions, order of productions is kept"""
        def macro_value(m):
            return '(?:%s)' % macros[m.groupdict()['macro']]
        expanded = []
        for key, value in productions:
            while re.search(r'{[a-zA-Z][a-zA-Z0-9-]*}', value):
                value = re.sub(r'{(?P<macro>[a-zA-Z][a-zA-Z0-9-]*)}',
                               macro_value, value)
            expanded.append((key, value))
        return expanded

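    # For illustration (hypothetical macro and production, not taken from
    # cssproductions): given macros = {'h': '[0-9a-f]'} and a production
    # ('HASH', '#{h}+'), _expand_macros returns [('HASH', '#(?:[0-9a-f])+')];
    # the while loop keeps substituting until no {macro} references remain.
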
    def _compile_productions(self, expanded_productions):
        """compile productions into callable match objects, order is kept"""
        compiled = []
        for key, value in expanded_productions:
            compiled.append((key, re.compile('^(?:%s)' % value, re.U).match))
        return compiled

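    # Continuing the hypothetical example above: _compile_productions wraps
    # each expanded pattern as '^(?:...)', so every stored bound match method
    # only ever matches at the very start of the remaining text, which is
    # what tokenize() below relies on when it consumes the input.
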
    def tokenize(self, text, fullsheet=False):
        """Generator: Tokenize text and yield tokens, each token is a tuple
        of::

            (name, value, line, col)

        The token value will contain a normal string, meaning CSS unicode
        escapes have been resolved to normal characters. The serializer
        escapes needed characters back to unicode escapes depending on
        the stylesheet target encoding.

        text
            to be tokenized
        fullsheet
            if ``True`` appends an EOF token as the last one and completes
            incomplete COMMENT or INVALID (to STRING) tokens
        """
        def repl(m):
            "used by unicodesub"
            num = int(m.group(0)[1:], 16)
            if num < 0x10000:
                return unichr(num)
            else:
                return m.group(0)

        def normalize(value):
            "normalize and do unicodesub"
            return util.Base._normalize(self.unicodesub(repl, value))

        line = col = 1

        # check for BOM first as it should only be max one at the start
        (BOM, matcher), productions = self.tokenmatches[0], self.tokenmatches[1:]
        match = matcher(text)
        if match:
            found = match.group(0)
            yield (BOM, found, line, col)
            text = text[len(found):]

        # check for @charset which is valid only at start of CSS
        if text.startswith('@charset '):
            found = '@charset ' # production has trailing S!
            yield (CSSProductions.CHARSET_SYM, found, line, col)
            text = text[len(found):]
            col += len(found)

        while text:
            # speed test for most used CHARs
            c = text[0]
            if c in '{}:;,':
                yield ('CHAR', c, line, col)
                col += 1
                text = text[1:]

            else:
                # check all other productions, at least CHAR must match
                for name, matcher in productions:
                    if fullsheet and name == 'CHAR' and text.startswith(u'/*'):
                        # before CHAR production test for incomplete comment
                        possiblecomment = u'%s*/' % text
                        match = self.commentmatcher(possiblecomment)
                        if match:
                            yield ('COMMENT', possiblecomment, line, col)
                            text = None # eats all remaining text
                            break

                    match = matcher(text) # if no match try next production
                    if match:
                        found = match.group(0) # needed later for line/col
                        if fullsheet:
                            # check if found may be completed into a full token
                            if 'INVALID' == name and text == found:
                                # complete INVALID to STRING with start char " or '
                                name, found = 'STRING', '%s%s' % (found, found[0])

                            elif 'FUNCTION' == name and\
                                 u'url(' == normalize(found):
                                # FUNCTION url( is fixed to URI if fullsheet
                                # FUNCTION production MUST BE after URI production!
                                for end in (u"')", u'")', u')'):
                                    possibleuri = '%s%s' % (text, end)
                                    match = self.urimatcher(possibleuri)
                                    if match:
                                        name, found = 'URI', match.group(0)
                                        break

                        if name in ('DIMENSION', 'IDENT', 'STRING', 'URI',
                                    'HASH', 'COMMENT', 'FUNCTION', 'INVALID'):
                            # may contain unicode escape, replace with normal
                            # char but do not normalize (?)
                            value = self.unicodesub(repl, found)

                        else:
                            if 'ATKEYWORD' == name:
                                # get actual ATKEYWORD SYM
                                if '@charset' == found and \
                                   ' ' == text[len(found):len(found) + 1]:
                                    # only this syntax!
                                    name = CSSProductions.CHARSET_SYM
                                    found += ' '
                                else:
                                    name = self._atkeywords.get(normalize(found),
                                                                'ATKEYWORD')

                            value = found # should not contain unicode escape (?)

                        yield (name, value, line, col)
                        text = text[len(found):]
                        nls = found.count(self._linesep)
                        line += nls
                        if nls:
                            col = len(found[found.rfind(self._linesep):])
                        else:
                            col += len(found)
                        break

        if fullsheet:
            yield ('EOF', u'', line, col)

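A minimal usage sketch (not part of the module source; Python 2 to match
the unichr/u'' usage above, and assuming the module is importable as
cssutils.tokenize2):

    from cssutils.tokenize2 import Tokenizer

    tokenizer = Tokenizer()  # defaults to cssutils' MACROS and PRODUCTIONS
    for name, value, line, col in tokenizer.tokenize(
            u'a { color: red }', fullsheet=True):
        print '%s %r %d,%d' % (name, value, line, col)

With fullsheet=True the last token yielded is ('EOF', u'', line, col), and
an unterminated COMMENT or INVALID token at the end of the input is
completed as described in tokenize() above.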