1
2
3 """Python codec for CSS."""
4 __docformat__ = 'restructuredtext'
5 __docformat__ = 'restructuredtext'
6 __author__ = 'Walter Doerwald'
7 __version__ = '$Id: util.py 1114 2008-03-05 13:22:59Z cthedot $'
8
9 import codecs, marshal
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
31 """
32 Detect the encoding of the byte string ``input``, which contains the
33 beginning of a CSS file. To detect the encoding the first few bytes are
34 used (or if ``input`` is ASCII compatible and starts with a charset rule
35 the encoding name from the rule).
36
37 If the encoding can't be detected yet, ``None`` is returned. ``final``
38 specifies whether more data is available in later calls or not. If ``final``
39 is true, ``_detectencoding_str()`` will never return ``None``.
40 """
41
42
43 CANDIDATE_UTF_8_SIG = 1
44 CANDIDATE_UTF_16_AS_LE = 2
45 CANDIDATE_UTF_16_AS_BE = 4
46 CANDIDATE_UTF_16_LE = 8
47 CANDIDATE_UTF_16_BE = 16
48 CANDIDATE_UTF_32_AS_LE = 32
49 CANDIDATE_UTF_32_AS_BE = 64
50 CANDIDATE_UTF_32_LE = 128
51 CANDIDATE_UTF_32_BE = 256
52 CANDIDATE_CHARSET = 512
53
54 candidates = 1023
55
56 li = len(input)
57 if li>=1:
58
59 c = input[0]
60 if c != "\xef":
61 candidates &= ~CANDIDATE_UTF_8_SIG
62 if c != "\xff":
63 candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_16_AS_LE)
64 if c != "\xfe":
65 candidates &= ~CANDIDATE_UTF_16_AS_BE
66 if c != "@":
67 candidates &= ~(CANDIDATE_UTF_32_LE|CANDIDATE_UTF_16_LE|CANDIDATE_CHARSET)
68 if c != "\x00":
69 candidates &= ~(CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_BE|CANDIDATE_UTF_16_BE)
70 if li>=2:
71
72 c = input[1]
73 if c != "\xbb":
74 candidates &= ~CANDIDATE_UTF_8_SIG
75 if c != "\xfe":
76 candidates &= ~(CANDIDATE_UTF_16_AS_LE|CANDIDATE_UTF_32_AS_LE)
77 if c != "\xff":
78 candidates &= ~CANDIDATE_UTF_16_AS_BE
79 if c != "\x00":
80 candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE)
81 if c != "@":
82 candidates &= ~CANDIDATE_UTF_16_BE
83 if c != "c":
84 candidates &= ~CANDIDATE_CHARSET
85 if li>=3:
86
87 c = input[2]
88 if c != "\xbf":
89 candidates &= ~CANDIDATE_UTF_8_SIG
90 if c != "c":
91 candidates &= ~CANDIDATE_UTF_16_LE
92 if c != "\x00":
93 candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE)
94 if c != "\xfe":
95 candidates &= ~CANDIDATE_UTF_32_AS_BE
96 if c != "h":
97 candidates &= ~CANDIDATE_CHARSET
98 if li>=4:
99
100 c = input[3]
101 if input[2:4] == "\x00\x00":
102 candidates &= ~CANDIDATE_UTF_16_AS_LE
103 if c != "\x00":
104 candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE)
105 if c != "\xff":
106 candidates &= ~CANDIDATE_UTF_32_AS_BE
107 if c != "@":
108 candidates &= ~CANDIDATE_UTF_32_BE
109 if c != "a":
110 candidates &= ~CANDIDATE_CHARSET
111 if candidates == 0:
112 return "utf-8"
113 if not (candidates & (candidates-1)):
114 if candidates == CANDIDATE_UTF_8_SIG and li >= 3:
115 return "utf-8-sig"
116 elif candidates == CANDIDATE_UTF_16_AS_LE and li >= 2:
117 return "utf-16"
118 elif candidates == CANDIDATE_UTF_16_AS_BE and li >= 2:
119 return "utf-16"
120 elif candidates == CANDIDATE_UTF_16_LE and li >= 4:
121 return "utf-16-le"
122 elif candidates == CANDIDATE_UTF_16_BE and li >= 2:
123 return "utf-16-be"
124 elif candidates == CANDIDATE_UTF_32_AS_LE and li >= 4:
125 return "utf-32"
126 elif candidates == CANDIDATE_UTF_32_AS_BE and li >= 4:
127 return "utf-32"
128 elif candidates == CANDIDATE_UTF_32_LE and li >= 4:
129 return "utf-32-le"
130 elif candidates == CANDIDATE_UTF_32_BE and li >= 4:
131 return "utf-32-be"
132 elif candidates == CANDIDATE_CHARSET and li >= 4:
133 prefix = '@charset "'
134 if input[:len(prefix)] == prefix:
135 pos = input.find('"', len(prefix))
136 if pos >= 0:
137 return input[len(prefix):pos]
138
139
140 if final:
141 return "utf-8"
142 return None
143
144
146 """
147 Detect the encoding of the unicode string ``input``, which contains the
148 beginning of a CSS file. The encoding is detected from the charset rule
149 at the beginning of ``input``. If there is no charset rule, ``"utf-8"``
150 will be returned.
151
152 If the encoding can't be detected yet, ``None`` is returned. ``final``
153 specifies whether more data will be available in later calls or not. If
154 ``final`` is true, ``_detectencoding_unicode()`` will never return ``None``.
155 """
156 prefix = u'@charset "'
157 if input.startswith(prefix):
158 pos = input.find(u'"', len(prefix))
159 if pos >= 0:
160 return input[len(prefix):pos]
161 elif final or not prefix.startswith(input):
162
163
164 return "utf-8"
165 return None
166
167
169 """
170 Replace the name of the encoding in the charset rule at the beginning of
171 ``input`` with ``encoding``. If ``input`` doesn't starts with a charset
172 rule, ``input`` will be returned unmodified.
173
174 If the encoding can't be found yet, ``None`` is returned. ``final``
175 specifies whether more data will be available in later calls or not.
176 If ``final`` is true, ``_fixencoding()`` will never return ``None``.
177 """
178 prefix = u'@charset "'
179 if len(input) > len(prefix):
180 if input.startswith(prefix):
181 pos = input.find(u'"', len(prefix))
182 if pos >= 0:
183 if encoding.replace("_", "-").lower() == "utf-8-sig":
184 encoding = u"utf-8"
185 return prefix + encoding + input[pos:]
186
187 else:
188 return input
189 elif not prefix.startswith(input) or final:
190
191 return input
192 if final:
193 return input
194 return None
195
196
197 -def decode(input, errors="strict", encoding=None):
204
205
206 -def encode(input, errors="strict", encoding=None):
218
219
221
222 i = 0
223 for byte in bytes:
224 i = (i<<8) + ord(byte)
225 return i
226
227
229
230 v = []
231 while i:
232 v.insert(0, chr(i&0xff))
233 i >>= 8
234 return "".join(v)
235
236
237 if hasattr(codecs, "IncrementalDecoder"):
239 - def __init__(self, errors="strict", encoding=None):
248
250 for part in input:
251 result = self.decode(part, False)
252 if result:
253 yield result
254 result = self.decode("", True)
255 if result:
256 yield result
257
def decode(self, input, final=False):
    """
    Decode ``input`` and return the decoded text.

    As long as ``_detectencoding_str()`` cannot name an encoding, the raw
    input is collected in ``self.buffer`` and ``u""`` is returned. After the
    wrapped decoder has been created, decoded text is held back in
    ``self.buffer`` until ``_fixencoding()`` has processed the start of the
    output; from then on (``self.headerfixed``) input is passed straight to
    the wrapped decoder.

    Raises ``ValueError`` if the detected encoding name is ``"css"``.
    """
    if self.decoder is None:
        # No wrapped decoder yet: prepend what earlier calls left over and
        # try to detect the encoding from the combined input.
        input = self.buffer + input
        self.encoding = _detectencoding_str(input, final)
        if self.encoding is None:
            # Still undecided: keep the complete input for the next call.
            self.buffer = input
            return u""
        if self.encoding == "css":
            # NOTE(review): presumably guards against this codec selecting
            # itself -- confirm against the codec registration.
            raise ValueError("css not allowed as encoding name")
        # The raw bytes are handed to the wrapped decoder below, so the
        # buffer is emptied; it is reused for decoded text afterwards.
        self.buffer = ""
        decoder = codecs.getincrementaldecoder(self.encoding)
        self.decoder = decoder(self._errors)
    if self.headerfixed:
        return self.decoder.decode(input, final)
    # The header has not been fixed yet: ``self.buffer`` holds decoded text
    # from earlier calls (or is empty).
    output = self.buffer + self.decoder.decode(input, final)
    encoding = self.encoding
    if encoding.replace("_", "-").lower() == "utf-8-sig":
        encoding = "utf-8"
    newoutput = _fixencoding(output, unicode(encoding), final)
    if newoutput is None:
        # The charset rule is still incomplete: keep the decoded text and
        # retry on the next call.
        self.buffer = output
        return u""
    self.headerfixed = True
    return newoutput
290
def reset(self):
    """
    Reset the base class state and forget the wrapped decoder, the buffered
    data and the header-fixed flag.
    """
    codecs.IncrementalDecoder.reset(self)
    self.decoder = None
    self.buffer = ""
    self.headerfixed = False
296
299
301
302 if self.decoder is not None:
303 self.decoder.errors = errors
304 self._errors = errors
305 errors = property(_geterrors, _seterrors)
306
def getstate(self):
    """
    Return the decoder state as ``("", number)``.

    ``number`` is the marshalled tuple ``(encoding, buffer, headerfixed,
    has_decoder, wrapped_decoder_state)`` packed into an integer with
    ``_bytes2int()``; the last item is ``None`` when no wrapped decoder
    exists yet.
    """
    wrapped = self.decoder
    if wrapped is None:
        snapshot = (self.encoding, self.buffer, self.headerfixed, False, None)
    else:
        snapshot = (self.encoding, self.buffer, self.headerfixed, True, wrapped.getstate())
    packed = marshal.dumps(snapshot)
    return ("", _bytes2int(packed))
313
def setstate(self, state):
    """
    Restore a state previously returned by ``getstate()``.

    ``state`` is a ``(buffered_input, number)`` pair; ``number`` is the
    integer form of the marshalled tuple ``(encoding, buffer, headerfixed,
    has_decoder, wrapped_decoder_state)`` that ``getstate()`` builds.
    """
    # Undo ``getstate()`` in reverse order: integer -> bytes -> tuple.
    state = marshal.loads(_int2bytes(state[1]))
    self.encoding = state[0]
    self.buffer = state[1]
    self.headerfixed = state[2]
    # ``state[3]`` is a boolean flag (never ``None``); only recreate the
    # wrapped decoder if one existed when the state was taken.
    if state[3]:
        self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
        self.decoder.setstate(state[4])
    else:
        self.decoder = None
324
325
326 if hasattr(codecs, "IncrementalEncoder"):
328 - def __init__(self, errors="strict", encoding=None):
336
338 for part in input:
339 result = self.encode(part, False)
340 if result:
341 yield result
342 result = self.encode(u"", True)
343 if result:
344 yield result
345
def encode(self, input, final=False):
    """
    Encode ``input`` and return the encoded bytes.

    Until the wrapped encoder exists, input is collected in ``self.buffer``
    and ``""`` is returned. If ``self.encoding`` is already set, the charset
    rule at the start of the text is rewritten to name it; otherwise the
    encoding is taken from the text via ``_detectencoding_unicode()``.

    Raises ``ValueError`` if the encoding name is ``"css"``.
    """
    if self.encoder is None:
        # No wrapped encoder yet: work on everything seen so far.
        input = self.buffer + input
        if self.encoding is not None:
            # An encoding is already known: make the charset rule name it.
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newinput = _fixencoding(input, unicode(encoding), final)
            if newinput is None:
                # Charset rule incomplete => retry with more data.
                self.buffer = input
                return ""
            input = newinput
        else:
            # No encoding known: take it from the charset rule in the text.
            self.encoding = _detectencoding_unicode(input, final)
        if self.encoding is not None:
            if self.encoding == "css":
                raise ValueError("css not allowed as encoding name")
            info = codecs.lookup(self.encoding)
            # NOTE(review): this local is not read again in this method.
            encoding = self.encoding
            if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                # Write "utf-8" into the rule; ``final`` is passed as true
                # so that ``_fixencoding()`` cannot return ``None`` here.
                input = _fixencoding(input, u"utf-8", True)
            self.encoder = info.incrementalencoder(self._errors)
            self.buffer = u""
        else:
            # Encoding still undecided: keep the text for the next call.
            self.buffer = input
            return ""
    return self.encoder.encode(input, final)
375
380
383
385
386 if self.encoder is not None:
387 self.encoder.errors = errors
388 self._errors = errors
389 errors = property(_geterrors, _seterrors)
390
def getstate(self):
    """
    Return the encoder state as an integer.

    The integer is the marshalled tuple ``(encoding, buffer, has_encoder,
    wrapped_encoder_state)`` packed with ``_bytes2int()``; the last item is
    ``None`` when no wrapped encoder exists yet.
    """
    wrapped = self.encoder
    if wrapped is None:
        snapshot = (self.encoding, self.buffer, False, None)
    else:
        snapshot = (self.encoding, self.buffer, True, wrapped.getstate())
    packed = marshal.dumps(snapshot)
    return _bytes2int(packed)
397
def setstate(self, state):
    """
    Restore a state previously returned by ``getstate()``.

    ``state`` is the integer form of the marshalled tuple ``(encoding,
    buffer, has_encoder, wrapped_encoder_state)`` that ``getstate()`` builds.
    """
    # Undo ``getstate()`` in reverse order: integer -> bytes -> tuple.
    state = marshal.loads(_int2bytes(state))
    self.encoding = state[0]
    self.buffer = state[1]
    # ``state[2]`` is a boolean flag (never ``None``); the wrapped encoder's
    # own state is the fourth and last item of the tuple.
    if state[2]:
        self.encoder = codecs.getincrementalencoder(self.encoding)(self._errors)
        self.encoder.setstate(state[3])
    else:
        self.encoder = None
407
408
410 - def __init__(self, stream, errors="strict", encoding=None, header=False):
416
def encode(self, input, errors='strict'):
    """
    Encode ``input`` and return ``(bytes, length)``.

    Until the wrapped stream writer exists, text is collected in
    ``self.buffer`` and ``("", 0)`` is returned. ``length`` is the length of
    the text handed over in this call, including previously buffered text
    when the writer is created in this call.

    Raises ``ValueError`` if the encoding name is ``"css"``.
    """
    li = len(input)
    if self.streamwriter is None:
        # No wrapped writer yet: work on everything seen so far.
        input = self.buffer + input
        li = len(input)
        if self.encoding is not None:
            # An encoding is already known: make the charset rule name it.
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newinput = _fixencoding(input, unicode(encoding), False)
            if newinput is None:
                # Charset rule incomplete => retry with more data.
                self.buffer = input
                return ("", 0)
            input = newinput
        else:
            # No encoding known: take it from the charset rule in the text.
            self.encoding = _detectencoding_unicode(input, False)
        if self.encoding is not None:
            if self.encoding == "css":
                raise ValueError("css not allowed as encoding name")
            self.streamwriter = codecs.getwriter(self.encoding)(self.stream, self._errors)
            # NOTE(review): this local is not read again in this method.
            encoding = self.encoding
            if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                # Write "utf-8" into the rule; ``final`` is passed as true
                # so that ``_fixencoding()`` cannot return ``None`` here.
                input = _fixencoding(input, u"utf-8", True)
            self.buffer = u""
        else:
            # Encoding still undecided: keep the text for the next call.
            self.buffer = input
            return ("", 0)
    return (self.streamwriter.encode(input, errors)[0], li)
447
450
452
453 if self.streamwriter is not None:
454 self.streamwriter.errors = errors
455 self._errors = errors
456 errors = property(_geterrors, _seterrors)
457
458
460 - def __init__(self, stream, errors="strict", encoding=None):
465
def decode(self, input, errors='strict'):
    """
    Decode ``input`` and return ``(text, consumed)``.

    Until a wrapped stream reader has been accepted, every call detects the
    encoding (unless ``self.encoding`` is already set), decodes with a fresh
    reader and runs ``_fixencoding()`` on the result; ``(u"", 0)`` is
    returned while either step is undecided.

    Raises ``ValueError`` if the encoding name is ``"css"``.
    """
    if self.streamreader is None:
        if self.encoding is None:
            self.encoding = _detectencoding_str(input, False)
        # NOTE(review): the nesting of the next two checks is not
        # recoverable from the flattened listing; they are shown at this
        # level so that a preset encoding is checked too -- confirm.
        if self.encoding is None:
            # No encoding determined yet, so no output.
            return (u"", 0)
        if self.encoding == "css":
            raise ValueError("css not allowed as encoding name")
        streamreader = codecs.getreader(self.encoding)
        streamreader = streamreader(self.stream, self._errors)
        (output, consumed) = streamreader.decode(input, errors)
        encoding = self.encoding
        if encoding.replace("_", "-").lower() == "utf-8-sig":
            encoding = "utf-8"
        newoutput = _fixencoding(output, unicode(encoding), False)
        if newoutput is not None:
            # Header handled: keep this reader for all later calls.
            self.streamreader = streamreader
            return (newoutput, consumed)
        # Charset rule incomplete: nothing is consumed and the local reader
        # is dropped, so the next call starts over with a new one.
        return (u"", 0)
    return self.streamreader.decode(input, errors)
486
489
491
492 if self.streamreader is not None:
493 self.streamreader.errors = errors
494 self._errors = errors
495 errors = property(_geterrors, _seterrors)
496
497
498 if hasattr(codecs, "CodecInfo"):
499
511 else:
512
514 return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
515
def utf8sig_decode(input, errors='strict'):
    """
    Decode the UTF-8 byte string ``input``, skipping a leading UTF-8 BOM if
    there is one. Returns ``(text, consumed)`` where ``consumed`` counts
    the BOM bytes as well.
    """
    skipped = 0
    if input[:3] == codecs.BOM_UTF8:
        skipped = 3
        input = input[skipped:]
    decoded = codecs.utf_8_decode(input, errors, True)
    return (decoded[0], decoded[1] + skipped)
523
531
532 - def encode(self, input, errors='strict'):
535
543
def decode(self, input, errors='strict'):
    """
    Decode ``input``, dropping a leading UTF-8 BOM.

    While fewer than three bytes have been supplied and they could still be
    the beginning of a BOM, ``(u"", 0)`` is returned so that the caller
    retries with more data. Otherwise ``self.decode`` is replaced by
    ``codecs.utf_8_decode`` for later calls and this input is decoded with
    ``utf8sig_decode()``.
    """
    undecided = len(input) < 3 and codecs.BOM_UTF8.startswith(input)
    if not undecided:
        # The BOM question is settled by this call; later calls go
        # directly to the plain UTF-8 decoder.
        self.decode = codecs.utf_8_decode
        return utf8sig_decode(input, errors)
    return (u"", 0)
551
559
560
561 codecs.register(search_function)
562
563
564
565
def cssescape(exc):
    """
    Error handler that replaces every unencodable character with a
    backslash followed by its code point as six lowercase hex digits.

    Returns ``(replacement, position)`` with ``position`` being ``exc.end``.
    Raises ``TypeError`` if ``exc`` is not a ``UnicodeEncodeError``.
    """
    if isinstance(exc, UnicodeEncodeError):
        failed = exc.object[exc.start:exc.end]
        escaped = [u"\\%06x" % ord(char) for char in failed]
        return (u"".join(escaped), exc.end)
    raise TypeError("don't know how to handle %r" % exc)
570
571 codecs.register_error("cssescape", cssescape)
572