3 """New CSS Tokenizer (a generator)
4 """
5 __all__ = ['Tokenizer', 'CSSProductions']
6 __docformat__ = 'restructuredtext'
7 __version__ = '$Id: tokenize2.py 1382 2008-07-15 20:38:08Z cthedot $'
8
9 import os
10 import re
11 import string
12 import xml.dom
13 import cssutils
14 import util
15 from cssproductions import *
16
18 """
19 generates a list of Token tuples:
20 (Tokenname, value, startline, startcolumn)
21 """
    _atkeywords = {
        u'@font-face': CSSProductions.FONT_FACE_SYM,
        u'@import': CSSProductions.IMPORT_SYM,
        u'@media': CSSProductions.MEDIA_SYM,
        u'@namespace': CSSProductions.NAMESPACE_SYM,
        u'@page': CSSProductions.PAGE_SYM
        }
    _linesep = u'\n'

    def __init__(self, macros=None, productions=None):
        """
        Inits the tokenizer with the given macros and productions, which
        default to cssutils' own MACROS and PRODUCTIONS.
        """
        self.log = cssutils.log
        if not macros:
            macros = MACROS
        if not productions:
            productions = PRODUCTIONS
        self.tokenmatches = self._compile_productions(
            self._expand_macros(macros, productions))
        self.commentmatcher = [x[1] for x in self.tokenmatches if x[0] == 'COMMENT'][0]
        self.urimatcher = [x[1] for x in self.tokenmatches if x[0] == 'URI'][0]
        # substitutes CSS unicode escapes like "\26 " (an optional single
        # whitespace terminates the escape)
        self.unicodesub = re.compile(r'\\[0-9a-fA-F]{1,6}(?:\r\n|[\t\r\n\f\x20])?').sub
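        # Sketch of the substitution (example value, not from this module):
        # in u'\\26 b' the escape "\26 " encodes "&", so substituting every
        # escape via repl() in tokenize() yields u'&b'.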

    def _expand_macros(self, macros, productions):
        """returns macro expanded productions, order of productions is kept"""
        def macro_value(m):
            return '(?:%s)' % macros[m.groupdict()['macro']]
        expanded = []
        for key, value in productions:
            while re.search(r'{[a-zA-Z][a-zA-Z0-9-]*}', value):
                value = re.sub(r'{(?P<macro>[a-zA-Z][a-zA-Z0-9-]*)}',
                               macro_value, value)
            expanded.append((key, value))
        return expanded
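        # For example, with a hypothetical macro and production (not part of
        # cssutils' MACROS/PRODUCTIONS):
        #     macros = {'num': r'[0-9]+'}
        #     productions = [('NUMBER', '{num}')]
        # the expanded result would be [('NUMBER', '(?:[0-9]+)')].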

    def _compile_productions(self, expanded_productions):
        """compile productions into callable match objects, order is kept"""
        compiled = []
        for key, value in expanded_productions:
            compiled.append((key, re.compile('^(?:%s)' % value, re.U).match))
        return compiled
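        # Continuing the sketch above, each entry becomes a (name, match)
        # pair where match is the bound match method of a pattern anchored
        # at the string start:
        #     name, matcher = self._compile_productions([('NUMBER', '(?:[0-9]+)')])[0]
        #     matcher(u'12px').group(0)  # would give u'12'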

    def tokenize(self, text, fullsheet=False):
        """Generator: Tokenize text and yield tokens, each token is a tuple
        of::

            (name, value, line, col)

        The token value will contain a normal string, meaning CSS unicode
        escapes have been resolved to normal characters. The serializer
        escapes needed characters back to unicode escapes depending on
        the stylesheet target encoding.

        text
            to be tokenized
        fullsheet
            if ``True`` appends an EOF token as the last one and completes
            incomplete COMMENT or INVALID (to STRING) tokens
        """
        def repl(m):
            "used by unicodesub"
            num = int(m.group(0)[1:], 16)
            if num < 0x10000:
                return unichr(num)
            else:
                return m.group(0)
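        # e.g. a match for the escape u'\\26 ' makes repl() return u'&'
        # (0x26); escapes of 0x10000 and above are left untouched as
        # unichr() may not accept them on a narrow Python build.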

        def normalize(value):
            "normalize and do unicodesub"
            return util.Base._normalize(self.unicodesub(repl, value))

        line = col = 1

        # the first tokenmatch handles a possible BOM, all others follow
        (BOM, matcher), productions = self.tokenmatches[0], self.tokenmatches[1:]
        match = matcher(text)
        if match:
            found = match.group(0)
            yield (BOM, found, line, col)
            text = text[len(found):]

        # @charset is only valid at the very start of a stylesheet
        if text.startswith('@charset '):
            found = '@charset ' # the CHARSET_SYM production has a trailing space
            yield (CSSProductions.CHARSET_SYM, found, line, col)
            text = text[len(found):]
            col += len(found)

        while text:
            # handle single delimiter characters directly for speed
            c = text[0]
            if c in '{}:;,':
                yield ('CHAR', c, line, col)
                col += 1
                text = text[1:]

            else:
                # check all other productions, at least CHAR must match
                for name, matcher in productions:
                    if fullsheet and name == 'CHAR' and text.startswith(u'/*'):
                        # before CHAR test for an incomplete COMMENT
                        possiblecomment = u'%s*/' % text
                        match = self.commentmatcher(possiblecomment)
                        if match:
                            yield ('COMMENT', possiblecomment, line, col)
                            text = None # ate all remaining text
                            break

                    match = matcher(text)
                    if match:
                        found = match.group(0)
                        if fullsheet:
                            # check if the token may be completed
                            if 'INVALID' == name and text == found:
                                # complete INVALID to STRING with the start quote
                                name, found = 'STRING', '%s%s' % (found, found[0])

                            elif 'FUNCTION' == name and\
                                 u'url(' == normalize(found):
                                # url( is only a FUNCTION in an incomplete sheet,
                                # try to complete it to a URI
                                for end in (u"')", u'")', u')'):
                                    possibleuri = '%s%s' % (text, end)
                                    match = self.urimatcher(possibleuri)
                                    if match:
                                        name, found = 'URI', match.group(0)
                                        break

                        if name in ('DIMENSION', 'IDENT', 'STRING', 'URI',
                                    'HASH', 'COMMENT', 'FUNCTION', 'INVALID'):
                            # value may contain unicode escapes,
                            # resolve them to normal characters
                            value = self.unicodesub(repl, found)

                        else:
                            if 'ATKEYWORD' == name:
                                # get the specific AT-KEYWORD symbol
                                if '@charset' == found and ' ' == text[len(found):len(found)+1]:
                                    # only "@charset " is the CHARSET_SYM
                                    name = CSSProductions.CHARSET_SYM
                                    found += ' '
                                else:
                                    name = self._atkeywords.get(normalize(found), 'ATKEYWORD')

                            value = found

                        yield (name, value, line, col)
                        text = text[len(found):]
                        # update line and column for the next token
                        nls = found.count(self._linesep)
                        line += nls
                        if nls:
                            col = len(found[found.rfind(self._linesep):])
                        else:
                            col += len(found)
                        break

        if fullsheet:
            yield ('EOF', u'', line, col)
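

# Minimal usage sketch (not part of the original module): running this file
# directly tokenizes a small inline stylesheet. It assumes the sibling
# imports above (cssproductions, util, cssutils) resolve, e.g. when run
# from the cssutils source directory.
if __name__ == '__main__':
    for _token in Tokenizer().tokenize(u'a { color: red }', fullsheet=True):
        print(_token)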