Package cssutils :: Module codec
[hide private]
[frames] | no frames]

Source Code for Module cssutils.codec

  1  #!/usr/bin/env python 
  2   
  3  """Python codec for CSS.""" 
  4  __docformat__ = 'restructuredtext' 
  5  __docformat__ = 'restructuredtext' 
  6  __author__ = 'Walter Doerwald' 
  7  __version__ = '$Id: util.py 1114 2008-03-05 13:22:59Z cthedot $' 
  8   
  9  import codecs, marshal 
 10   
 11   
 12  # We're using bits to store all possible candidate encodings (or variants, i.e. 
 13  # we have two bits for the variants of UTF-16 and two for the 
 14  # variants of UTF-32). 
 15  # 
 16  # Prefixes for various CSS encodings 
 17  # UTF-8-SIG   xEF  xBB  xBF 
 18  # UTF-16 (LE) xFF  xFE ~x00|~x00 
 19  # UTF-16 (BE) xFE  xFF 
 20  # UTF-16-LE    @   x00   @   x00 
 21  # UTF-16-BE   x00   @ 
 22  # UTF-32 (LE) xFF  xFE  x00  x00 
 23  # UTF-32 (BE) x00  x00  xFE  xFF 
 24  # UTF-32-LE    @   x00  x00  x00 
 25  # UTF-32-BE   x00  x00  x00   @ 
 26  # CHARSET      @    c    h    a  ... 
 27   
 28   
 29   
30 -def _detectencoding_str(input, final=False):
31 """ 32 Detect the encoding of the byte string ``input``, which contains the 33 beginning of a CSS file. To detect the encoding the first few bytes are 34 used (or if ``input`` is ASCII compatible and starts with a charset rule 35 the encoding name from the rule). 36 37 If the encoding can't be detected yet, ``None`` is returned. ``final`` 38 specifies whether more data is available in later calls or not. If ``final`` 39 is true, ``_detectencoding_str()`` will never return ``None``. 40 """ 41 42 # A bit for every candidate 43 CANDIDATE_UTF_8_SIG = 1 44 CANDIDATE_UTF_16_AS_LE = 2 45 CANDIDATE_UTF_16_AS_BE = 4 46 CANDIDATE_UTF_16_LE = 8 47 CANDIDATE_UTF_16_BE = 16 48 CANDIDATE_UTF_32_AS_LE = 32 49 CANDIDATE_UTF_32_AS_BE = 64 50 CANDIDATE_UTF_32_LE = 128 51 CANDIDATE_UTF_32_BE = 256 52 CANDIDATE_CHARSET = 512 53 54 candidates = 1023 # all candidates 55 56 li = len(input) 57 if li>=1: 58 # Check first byte 59 c = input[0] 60 if c != "\xef": 61 candidates &= ~CANDIDATE_UTF_8_SIG 62 if c != "\xff": 63 candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_16_AS_LE) 64 if c != "\xfe": 65 candidates &= ~CANDIDATE_UTF_16_AS_BE 66 if c != "@": 67 candidates &= ~(CANDIDATE_UTF_32_LE|CANDIDATE_UTF_16_LE|CANDIDATE_CHARSET) 68 if c != "\x00": 69 candidates &= ~(CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_BE|CANDIDATE_UTF_16_BE) 70 if li>=2: 71 # Check second byte 72 c = input[1] 73 if c != "\xbb": 74 candidates &= ~CANDIDATE_UTF_8_SIG 75 if c != "\xfe": 76 candidates &= ~(CANDIDATE_UTF_16_AS_LE|CANDIDATE_UTF_32_AS_LE) 77 if c != "\xff": 78 candidates &= ~CANDIDATE_UTF_16_AS_BE 79 if c != "\x00": 80 candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE) 81 if c != "@": 82 candidates &= ~CANDIDATE_UTF_16_BE 83 if c != "c": 84 candidates &= ~CANDIDATE_CHARSET 85 if li>=3: 86 # Check third byte 87 c = input[2] 88 if c != "\xbf": 89 candidates &= ~CANDIDATE_UTF_8_SIG 90 if c != "c": 91 candidates &= ~CANDIDATE_UTF_16_LE 92 if c != "\x00": 93 
candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE) 94 if c != "\xfe": 95 candidates &= ~CANDIDATE_UTF_32_AS_BE 96 if c != "h": 97 candidates &= ~CANDIDATE_CHARSET 98 if li>=4: 99 # Check fourth byte 100 c = input[3] 101 if input[2:4] == "\x00\x00": 102 candidates &= ~CANDIDATE_UTF_16_AS_LE 103 if c != "\x00": 104 candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE) 105 if c != "\xff": 106 candidates &= ~CANDIDATE_UTF_32_AS_BE 107 if c != "@": 108 candidates &= ~CANDIDATE_UTF_32_BE 109 if c != "a": 110 candidates &= ~CANDIDATE_CHARSET 111 if candidates == 0: 112 return "utf-8" 113 if not (candidates & (candidates-1)): # only one candidate remaining 114 if candidates == CANDIDATE_UTF_8_SIG and li >= 3: 115 return "utf-8-sig" 116 elif candidates == CANDIDATE_UTF_16_AS_LE and li >= 2: 117 return "utf-16" 118 elif candidates == CANDIDATE_UTF_16_AS_BE and li >= 2: 119 return "utf-16" 120 elif candidates == CANDIDATE_UTF_16_LE and li >= 4: 121 return "utf-16-le" 122 elif candidates == CANDIDATE_UTF_16_BE and li >= 2: 123 return "utf-16-be" 124 elif candidates == CANDIDATE_UTF_32_AS_LE and li >= 4: 125 return "utf-32" 126 elif candidates == CANDIDATE_UTF_32_AS_BE and li >= 4: 127 return "utf-32" 128 elif candidates == CANDIDATE_UTF_32_LE and li >= 4: 129 return "utf-32-le" 130 elif candidates == CANDIDATE_UTF_32_BE and li >= 4: 131 return "utf-32-be" 132 elif candidates == CANDIDATE_CHARSET and li >= 4: 133 prefix = '@charset "' 134 if input[:len(prefix)] == prefix: 135 pos = input.find('"', len(prefix)) 136 if pos >= 0: 137 return input[len(prefix):pos] 138 # if this is the last call, and we haven't determined an encoding yet, 139 # we default to UTF-8 140 if final: 141 return "utf-8" 142 return None # dont' know yet
143 144
145 -def _detectencoding_unicode(input, final=False):
146 """ 147 Detect the encoding of the unicode string ``input``, which contains the 148 beginning of a CSS file. The encoding is detected from the charset rule 149 at the beginning of ``input``. If there is no charset rule, ``"utf-8"`` 150 will be returned. 151 152 If the encoding can't be detected yet, ``None`` is returned. ``final`` 153 specifies whether more data will be available in later calls or not. If 154 ``final`` is true, ``_detectencoding_unicode()`` will never return ``None``. 155 """ 156 prefix = u'@charset "' 157 if input.startswith(prefix): 158 pos = input.find(u'"', len(prefix)) 159 if pos >= 0: 160 return input[len(prefix):pos] 161 elif final or not prefix.startswith(input): 162 # if this is the last call, and we haven't determined an encoding yet, 163 # (or the string definitely doesn't start with prefix) we default to UTF-8 164 return "utf-8" 165 return None # don't know yet
166 167
168 -def _fixencoding(input, encoding, final=False):
169 """ 170 Replace the name of the encoding in the charset rule at the beginning of 171 ``input`` with ``encoding``. If ``input`` doesn't starts with a charset 172 rule, ``input`` will be returned unmodified. 173 174 If the encoding can't be found yet, ``None`` is returned. ``final`` 175 specifies whether more data will be available in later calls or not. 176 If ``final`` is true, ``_fixencoding()`` will never return ``None``. 177 """ 178 prefix = u'@charset "' 179 if len(input) > len(prefix): 180 if input.startswith(prefix): 181 pos = input.find(u'"', len(prefix)) 182 if pos >= 0: 183 if encoding.replace("_", "-").lower() == "utf-8-sig": 184 encoding = u"utf-8" 185 return prefix + encoding + input[pos:] 186 # we haven't seen the end of the encoding name yet => fall through 187 else: 188 return input # doesn't start with prefix, so nothing to fix 189 elif not prefix.startswith(input) or final: 190 # can't turn out to be a @charset rule later (or there is no "later") 191 return input 192 if final: 193 return input 194 return None # don't know yet
195 196
def decode(input, errors="strict", encoding=None):
    """
    Decode the complete byte string ``input`` and return a
    ``(unicode, consumed)`` tuple.

    If ``encoding`` is ``None`` it is detected from ``input`` via
    ``_detectencoding_str()``. The charset rule of the decoded text (if any)
    is rewritten with ``_fixencoding()`` to name the encoding that was used.
    Raises ``ValueError`` if the encoding is ``"css"``.
    """
    if encoding is None:
        encoding = _detectencoding_str(input, True)
    if encoding == "css":
        raise ValueError("css not allowed as encoding name")
    decoder = codecs.getdecoder(encoding)
    (text, consumed) = decoder(input, errors)
    text = _fixencoding(text, unicode(encoding), True)
    return (text, consumed)
204 205
def encode(input, errors="strict", encoding=None):
    """
    Encode the complete unicode string ``input`` and return a
    ``(bytes, consumed)`` tuple, where ``consumed`` is the length of the
    original ``input``.

    With an explicit ``encoding`` the charset rule is rewritten to name that
    encoding. Otherwise the encoding is taken from the charset rule via
    ``_detectencoding_unicode()``; a detected ``utf-8-sig`` is rewritten to
    ``utf-8`` in the rule. Raises ``ValueError`` if the encoding is ``"css"``.
    """
    consumed = len(input)
    if encoding is not None:
        input = _fixencoding(input, unicode(encoding), True)
    else:
        encoding = _detectencoding_unicode(input, True)
        if encoding.replace("_", "-").lower() == "utf-8-sig":
            input = _fixencoding(input, u"utf-8", True)
    if encoding == "css":
        raise ValueError("css not allowed as encoding name")
    encoded = codecs.getencoder(encoding)(input, errors)[0]
    return (encoded, consumed)
218 219
220 -def _bytes2int(bytes):
221 # Helper: convert an 8 bit string into an ``int``. 222 i = 0 223 for byte in bytes: 224 i = (i<<8) + ord(byte) 225 return i
226 227
228 -def _int2bytes(i):
229 # Helper: convert an ``int`` into an 8-bit string. 230 v = [] 231 while i: 232 v.insert(0, chr(i&0xff)) 233 i >>= 8 234 return "".join(v)
if hasattr(codecs, "IncrementalDecoder"):
    class IncrementalDecoder(codecs.IncrementalDecoder):
        """
        Incremental decoder for the ``css`` codec.

        Input is buffered until ``_detectencoding_str()`` can name the
        encoding; from then on decoding is delegated to the incremental
        decoder of that encoding. The decoded output is held back until
        ``_fixencoding()`` has rewritten the charset rule (or has decided
        that there is none).
        """

        def __init__(self, errors="strict", encoding=None):
            # NOTE(review): ``encoding`` is stored here, but ``decode()``
            # always overwrites ``self.encoding`` with the detected value --
            # confirm whether an explicit encoding should take precedence.
            self.decoder = None
            self.encoding = encoding
            codecs.IncrementalDecoder.__init__(self, errors)
            # Store ``errors`` somewhere else,
            # because we have to hide it in a property
            self._errors = errors
            self.buffer = ""
            self.headerfixed = False

        def iterdecode(self, input):
            """Decode the chunks yielded by ``input``; empty results are skipped."""
            for part in input:
                result = self.decode(part, False)
                if result:
                    yield result
            result = self.decode("", True)
            if result:
                yield result

        def decode(self, input, final=False):
            """
            Decode ``input`` and return the unicode output that is available
            so far (``u""`` while the encoding or the charset rule is still
            undecided). Raises ``ValueError`` if the detected encoding is
            ``"css"``.
            """
            # We're doing basically the same as a ``BufferedIncrementalDecoder``,
            # but since the buffer is only relevant until the encoding has been
            # detected (in which case the buffer of the underlying codec might
            # kick in), we're implementing buffering ourselves to avoid some
            # overhead.
            if self.decoder is None:
                input = self.buffer + input
                self.encoding = _detectencoding_str(input, final)
                if self.encoding is None:
                    self.buffer = input # retry the complete input on the next call
                    return u"" # no encoding determined yet, so no output
                if self.encoding == "css":
                    raise ValueError("css not allowed as encoding name")
                self.buffer = "" # drop buffer, as the decoder might keep its own
                decoder = codecs.getincrementaldecoder(self.encoding)
                self.decoder = decoder(self._errors)
            if self.headerfixed:
                return self.decoder.decode(input, final)
            # If we haven't fixed the header yet,
            # the content of ``self.buffer`` is a ``unicode`` object
            output = self.buffer + self.decoder.decode(input, final)
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newoutput = _fixencoding(output, unicode(encoding), final)
            if newoutput is None:
                # retry fixing the @charset rule (but keep the decoded stuff)
                self.buffer = output
                return u""
            self.headerfixed = True
            return newoutput

        def reset(self):
            """Forget the underlying decoder, the buffer and the header flag."""
            codecs.IncrementalDecoder.reset(self)
            self.decoder = None
            self.buffer = ""
            self.headerfixed = False

        def _geterrors(self):
            return self._errors

        def _seterrors(self, errors):
            # Setting ``errors`` must be done on the real decoder too
            if self.decoder is not None:
                self.decoder.errors = errors
            self._errors = errors
        errors = property(_geterrors, _seterrors)

        def getstate(self):
            """
            Return ``("", number)`` where ``number`` is the marshalled tuple
            ``(encoding, buffer, headerfixed, hasdecoder, decoderstate)``
            packed into an ``int`` by ``_bytes2int()``.
            """
            if self.decoder is not None:
                state = (self.encoding, self.buffer, self.headerfixed, True, self.decoder.getstate())
            else:
                state = (self.encoding, self.buffer, self.headerfixed, False, None)
            return ("", _bytes2int(marshal.dumps(state)))

        def setstate(self, state):
            """Restore a state that has been returned by ``getstate()``."""
            # Undo ``getstate()`` in reverse order: first turn the number back
            # into the marshalled string, then unmarshal it.
            # ``state[0]`` (the buffered input) is ignored.
            state = marshal.loads(_int2bytes(state[1]))
            self.encoding = state[0]
            self.buffer = state[1]
            self.headerfixed = state[2]
            # ``state[3]`` is a boolean telling whether a decoder existed
            if state[3]:
                self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
                self.decoder.setstate(state[4])
            else:
                self.decoder = None
if hasattr(codecs, "IncrementalEncoder"):
    class IncrementalEncoder(codecs.IncrementalEncoder):
        """
        Incremental encoder for the ``css`` codec.

        Input is buffered until the charset rule at its start is complete
        (or it is clear that there is none). With an explicit ``encoding``
        the rule is rewritten to name it; otherwise the encoding is taken
        from the rule. Afterwards encoding is delegated to the incremental
        encoder of that encoding.
        """

        def __init__(self, errors="strict", encoding=None):
            self.encoder = None
            self.encoding = encoding
            codecs.IncrementalEncoder.__init__(self, errors)
            # Store ``errors`` somewhere else,
            # because we have to hide it in a property
            self._errors = errors
            self.buffer = u""

        def iterencode(self, input):
            """Encode the chunks yielded by ``input``; empty results are skipped."""
            for part in input:
                result = self.encode(part, False)
                if result:
                    yield result
            result = self.encode(u"", True)
            if result:
                yield result

        def encode(self, input, final=False):
            """
            Encode ``input`` and return the bytes that are available so far
            (``""`` while the charset rule is still incomplete). Raises
            ``ValueError`` if the encoding is ``"css"``.
            """
            if self.encoder is None:
                input = self.buffer + input
                if self.encoding is not None:
                    # Replace encoding in the @charset rule with the specified one
                    encoding = self.encoding
                    if encoding.replace("_", "-").lower() == "utf-8-sig":
                        encoding = "utf-8"
                    newinput = _fixencoding(input, unicode(encoding), final)
                    if newinput is None: # @charset rule incomplete => Retry next time
                        self.buffer = input
                        return ""
                    input = newinput
                else:
                    # Use encoding from the @charset declaration
                    self.encoding = _detectencoding_unicode(input, final)
                if self.encoding is not None:
                    if self.encoding == "css":
                        raise ValueError("css not allowed as encoding name")
                    info = codecs.lookup(self.encoding)
                    if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                        input = _fixencoding(input, u"utf-8", True)
                    self.encoder = info.incrementalencoder(self._errors)
                    self.buffer = u""
                else:
                    self.buffer = input
                    return ""
            return self.encoder.encode(input, final)

        def reset(self):
            """Forget the underlying encoder and the buffer."""
            codecs.IncrementalEncoder.reset(self)
            self.encoder = None
            self.buffer = u""

        def _geterrors(self):
            return self._errors

        def _seterrors(self, errors):
            # Setting ``errors`` must be done on the real encoder too
            if self.encoder is not None:
                self.encoder.errors = errors
            self._errors = errors
        errors = property(_geterrors, _seterrors)

        def getstate(self):
            """
            Return the marshalled tuple
            ``(encoding, buffer, hasencoder, encoderstate)`` packed into an
            ``int`` by ``_bytes2int()``.
            """
            if self.encoder is not None:
                state = (self.encoding, self.buffer, True, self.encoder.getstate())
            else:
                state = (self.encoding, self.buffer, False, None)
            return _bytes2int(marshal.dumps(state))

        def setstate(self, state):
            """Restore a state that has been returned by ``getstate()``."""
            # Undo ``getstate()`` in reverse order: first turn the number back
            # into the marshalled string, then unmarshal it.
            state = marshal.loads(_int2bytes(state))
            self.encoding = state[0]
            self.buffer = state[1]
            # ``state[2]`` is a boolean telling whether an encoder existed;
            # its state is the fourth (and last) item of the tuple
            if state[2]:
                self.encoder = codecs.getincrementalencoder(self.encoding)(self._errors)
                self.encoder.setstate(state[3])
            else:
                self.encoder = None
407 408
class StreamWriter(codecs.StreamWriter):
    """
    Stream writer for the ``css`` codec.

    Text is buffered until the charset rule at its start is complete (or it
    is clear that there is none); then a stream writer for the real encoding
    is created and used for all further output.
    """

    def __init__(self, stream, errors="strict", encoding=None, header=False):
        # ``header`` is accepted but not used by this class
        codecs.StreamWriter.__init__(self, stream, errors)
        self.streamwriter = None
        self.encoding = encoding
        self._errors = errors
        self.buffer = u""

    def encode(self, input, errors='strict'):
        """
        Encode ``input`` and return an ``(output, length)`` tuple. While the
        charset rule is still incomplete ``("", 0)`` is returned and the text
        is kept in ``self.buffer``. Raises ``ValueError`` if the encoding is
        ``"css"``.
        """
        if self.streamwriter is not None:
            # The real stream writer already exists: simply delegate to it
            return (self.streamwriter.encode(input, errors)[0], len(input))

        input = self.buffer + input
        total = len(input)
        if self.encoding is None:
            # Use encoding from the @charset declaration
            self.encoding = _detectencoding_unicode(input, False)
        else:
            # Replace encoding in the @charset rule with the specified one
            rulename = self.encoding
            if rulename.replace("_", "-").lower() == "utf-8-sig":
                rulename = "utf-8"
            fixed = _fixencoding(input, unicode(rulename), False)
            if fixed is None: # @charset rule incomplete => Retry next time
                self.buffer = input
                return ("", 0)
            input = fixed

        if self.encoding is None:
            # Still undecided => keep everything for the next call
            self.buffer = input
            return ("", 0)

        if self.encoding == "css":
            raise ValueError("css not allowed as encoding name")
        writerclass = codecs.getwriter(self.encoding)
        self.streamwriter = writerclass(self.stream, self._errors)
        if self.encoding.replace("_", "-").lower() == "utf-8-sig":
            input = _fixencoding(input, u"utf-8", True)
        self.buffer = u""
        return (self.streamwriter.encode(input, errors)[0], total)

    def _geterrors(self):
        return self._errors

    def _seterrors(self, errors):
        # Setting ``errors`` must be done on the streamwriter too
        if self.streamwriter is not None:
            self.streamwriter.errors = errors
        self._errors = errors
    errors = property(_geterrors, _seterrors)
457 458
class StreamReader(codecs.StreamReader):
    """
    Stream reader for the ``css`` codec.

    The encoding is taken from the constructor argument or detected from the
    input with ``_detectencoding_str()``. Once the charset rule of the
    decoded text has been rewritten, the stream reader for the real encoding
    is kept and used for all further input.
    """

    def __init__(self, stream, errors="strict", encoding=None):
        codecs.StreamReader.__init__(self, stream, errors)
        self.streamreader = None
        self.encoding = encoding
        self._errors = errors

    def decode(self, input, errors='strict'):
        """
        Decode ``input`` and return an ``(output, consumed)`` tuple.
        ``(u"", 0)`` is returned while the encoding or the charset rule is
        still undecided. Raises ``ValueError`` if the encoding is ``"css"``.
        """
        if self.streamreader is not None:
            # The real stream reader already exists: simply delegate to it
            return self.streamreader.decode(input, errors)

        if self.encoding is None:
            self.encoding = _detectencoding_str(input, False)
            if self.encoding is None:
                return (u"", 0) # no encoding determined yet, so no output
        if self.encoding == "css":
            raise ValueError("css not allowed as encoding name")

        readerclass = codecs.getreader(self.encoding)
        candidate = readerclass(self.stream, self._errors)
        (output, consumed) = candidate.decode(input, errors)

        rulename = self.encoding
        if rulename.replace("_", "-").lower() == "utf-8-sig":
            rulename = "utf-8"
        fixed = _fixencoding(output, unicode(rulename), False)
        if fixed is None:
            # we will create a new streamreader on the next call
            return (u"", 0)
        self.streamreader = candidate
        return (fixed, consumed)

    def _geterrors(self):
        return self._errors

    def _seterrors(self, errors):
        # Setting ``errors`` must be done on the streamreader too
        if self.streamreader is not None:
            self.streamreader.errors = errors
        self._errors = errors
    errors = property(_geterrors, _seterrors)
if hasattr(codecs, "CodecInfo"):
    # We're running on Python 2.5 or better
    def search_function(name):
        """Return the ``CodecInfo`` for ``"css"``; ``None`` for any other name."""
        if name != "css":
            return None
        return codecs.CodecInfo(
            name="css",
            encode=encode,
            decode=decode,
            incrementalencoder=IncrementalEncoder,
            incrementaldecoder=IncrementalDecoder,
            streamwriter=StreamWriter,
            streamreader=StreamReader,
        )
else:
    # If we're running on Python 2.4, define the utf-8-sig codec here
    def utf8sig_encode(input, errors='strict'):
        """Encode ``input`` as UTF-8 with a leading BOM."""
        encoded = codecs.utf_8_encode(input, errors)[0]
        return (codecs.BOM_UTF8 + encoded, len(input))

    def utf8sig_decode(input, errors='strict'):
        """Decode UTF-8 ``input``, dropping a leading BOM if there is one."""
        skipped = 0
        if input[:3] == codecs.BOM_UTF8:
            input = input[3:]
            skipped = 3
        (output, consumed) = codecs.utf_8_decode(input, errors, True)
        # the BOM counts as consumed input too
        return (output, consumed + skipped)

    class UTF8SigStreamWriter(codecs.StreamWriter):
        """Stream writer that puts a BOM in front of the first output only."""

        def reset(self):
            codecs.StreamWriter.reset(self)
            # Remove the instance attribute that ``encode()`` installed, so
            # that the next call writes a BOM again
            try:
                del self.encode
            except AttributeError:
                pass

        def encode(self, input, errors='strict'):
            # All later calls go directly to the plain UTF-8 encoder
            self.encode = codecs.utf_8_encode
            return utf8sig_encode(input, errors)

    class UTF8SigStreamReader(codecs.StreamReader):
        """Stream reader that skips a BOM at the start of the input."""

        def reset(self):
            codecs.StreamReader.reset(self)
            # Remove the instance attribute that ``decode()`` installed, so
            # that the next call looks for a BOM again
            try:
                del self.decode
            except AttributeError:
                pass

        def decode(self, input, errors='strict'):
            if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return (u"", 0)
            # All later calls go directly to the plain UTF-8 decoder
            self.decode = codecs.utf_8_decode
            return utf8sig_decode(input, errors)

    def search_function(name):
        """
        Return the codec tuple for ``css`` or ``utf_8_sig`` (after
        normalizing ``name``); ``None`` for any other name.
        """
        import encodings
        name = encodings.normalize_encoding(name)
        if name == "css":
            return (encode, decode, StreamReader, StreamWriter)
        if name == "utf_8_sig":
            return (utf8sig_encode, utf8sig_decode, UTF8SigStreamReader, UTF8SigStreamWriter)
        return None
559 560 561 codecs.register(search_function) 562 563 564 # Error handler for CSS escaping 565
def cssescape(exc):
    """
    Error handler that replaces every unencodable character with a CSS
    escape: a backslash followed by the six digit hexadecimal code point.

    Returns the ``(replacement, position to continue at)`` tuple expected
    from an error handler. Raises ``TypeError`` if ``exc`` is not a
    ``UnicodeEncodeError``.
    """
    if not isinstance(exc, UnicodeEncodeError):
        raise TypeError("don't know how to handle %r" % exc)
    offending = exc.object[exc.start:exc.end]
    escaped = [u"\\%06x" % ord(char) for char in offending]
    return (u"".join(escaped), exc.end)
570 571 codecs.register_error("cssescape", cssescape) 572