[cairo] Unicode error causing Cairo to crash.

Bill Spitzak spitzak at d2.com
Mon May 2 11:55:54 PDT 2005


Though I posted it before, the attached code is what I use to step 
through a UTF-8 string and convert it to Unicode, plus convert the 
errors as though this is a misidentified CP1252/ISO8859-1 string.

Some simple changes will make it return the "error" Unicode for each 
byte in a bad string; I have put in (untested) #ifdef code to do 
this. (I still prefer the ISO8859-1 results.)

Jost Boekemeier wrote:
>>request is to make it possible to use a UTF-8 API
>>only, and *encourage* 
> 
> 
> I think such a feature is useful.  It is always good
> to be backwards compatible.  
> 
> However, I don't know of any libraries which provide
> the functionality you're requesting.  Can you please
> give some examples? existing code/libraries?


/* Set to 1 to turn bad UTF-8 bytes into ISO-8859-1. If this is set to
    zero they are instead turned into the Unicode REPLACEMENT CHARACTER,
    of value 0xfffd.
    If this is on, utf8decode will correctly map most (perhaps all)
    human-readable text that is in ISO-8859-1. This may allow you
    to completely ignore character sets in your code because virtually
    everything is either ISO-8859-1 or UTF-8.
*/
#define ERRORS_TO_ISO8859_1 1

/* Set to 1 to turn bad UTF-8 bytes in the 0x80-0x9f range into the
    Unicode index for Microsoft's CP1252 character set. You should
    also set ERRORS_TO_ISO8859_1. With this a huge amount more
    available text (such as all web pages) is correctly converted
    to Unicode.
*/
#define ERRORS_TO_CP1252 1

/* A number of Unicode code points are in fact illegal and should not
    be produced by a UTF-8 converter. Turning this on will replace the
    bytes in those encodings with errors. If you do this then converting
    arbitrary 16-bit data to UTF-8 and then back is not an identity,
    which will probably break a lot of software.
*/
#define STRICT_RFC3629 0

#if ERRORS_TO_CP1252
// Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
// to Unicode:
static unsigned short cp1252[32] = {
   0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
   0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
   0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
   0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
};
#endif

/*! Decode a single UTF-8 encoded character starting at \e p. The
     resulting Unicode value (in the range 0-0x10ffff) is returned,
     and \e len is set to the number of bytes in the UTF-8 encoding
     (adding \e len to \e p will point at the next character).

     NOTE(review): the caller must guarantee p < end; the first byte
     is read unconditionally.

     If \a p points at an illegal UTF-8 encoding, including one that
     would go past \e end, or where a code uses more bytes than
     necessary, then *(unsigned char*)p is translated as though it is
     in the Microsoft CP1252 character set and \e len is set to 1.
     Treating errors this way allows this to decode almost any
     ISO-8859-1 or CP1252 text that has been mistakenly placed where
     UTF-8 is expected, and has proven very useful.

     If you want errors to be converted to error characters (as the
     standards recommend), adding a test to see if the length is
     unexpectedly 1 will work:

\code
     if (*p & 0x80) { // what should be a multibyte encoding
       code = utf8decode(p,end,&len);
       if (len<2) code = 0xFFFD; // Turn errors into REPLACEMENT CHARACTER
     } else { // handle the 1-byte utf8 encoding:
       code = *p;
       len = 1;
     }
\endcode

     Direct testing for the 1-byte case (as shown above) will also
     speed up the scanning of strings where the majority of characters
     are ASCII. If you don't care for the CP1252 translation you
     should use *p if it is not in the range 0xc2 through 0xf4.
*/
unsigned utf8decode(const char* p, const char* end, int* len)
{
   unsigned char c = *(unsigned char*)p;
   if (c < 0x80) {
     // Plain ASCII byte.
     *len = 1;
     return c;
#if ERRORS_TO_CP1252
   } else if (c < 0xa0) {
     // 0x80-0x9f can never start a UTF-8 sequence; translate through
     // the CP1252 table instead of failing.
     *len = 1;
     return cp1252[c-0x80];
#endif
   } else if (c < 0xc2) {
     // 0xa0-0xc1: either a stray continuation byte or the start of an
     // overlong 2-byte encoding (0xc0/0xc1). Both are errors.
     goto FAIL;
   }
   // All remaining cases need at least one valid continuation byte.
   if (p+1 >= end || (p[1]&0xc0) != 0x80) goto FAIL;
   if (c < 0xe0) {
     // 2-byte sequence: U+0080..U+07FF.
     *len = 2;
     return
       ((p[0] & 0x1f) << 6) +
       ((p[1] & 0x3f));
   } else if (c == 0xe0) {
     // Reject overlong 3-byte encodings of U+0000..U+07FF.
     if (((unsigned char*)p)[1] < 0xa0) goto FAIL;
     goto UTF8_3;
#if STRICT_RFC3629
   } else if (c == 0xed) {
     // RFC 3629 says surrogate chars are illegal.
     if (((unsigned char*)p)[1] >= 0xa0) goto FAIL;
     goto UTF8_3;
   } else if (c == 0xef) {
     // 0xfffe and 0xffff are also illegal characters.
     // Bounds-check before reading p[2]; the UTF8_3 check has not
     // run yet at this point (original code read past end here).
     if (p+2 < end &&
	((unsigned char*)p)[1]==0xbf &&
	((unsigned char*)p)[2]>=0xbe) goto FAIL;
     goto UTF8_3;
#endif
   } else if (c < 0xf0) {
   UTF8_3:
     // 3-byte sequence: U+0800..U+FFFF.
     if (p+2 >= end || (p[2]&0xc0) != 0x80) goto FAIL;
     *len = 3;
     return
       ((p[0] & 0x0f) << 12) +
       ((p[1] & 0x3f) << 6) +
       ((p[2] & 0x3f));
   } else if (c == 0xf0) {
     // Reject overlong 4-byte encodings of U+0000..U+FFFF.
     if (((unsigned char*)p)[1] < 0x90) goto FAIL;
     goto UTF8_4;
   } else if (c < 0xf4) {
   UTF8_4:
     // 4-byte sequence: U+10000..U+10FFFF.
     if (p+3 >= end || (p[2]&0xc0) != 0x80 || (p[3]&0xc0) != 0x80) goto FAIL;
     *len = 4;
#if STRICT_RFC3629
     // RFC 3629 says all codes ending in fffe or ffff are illegal:
     if ((p[1]&0xf)==0xf &&
	((unsigned char*)p)[2] == 0xbf &&
	((unsigned char*)p)[3] >= 0xbe) goto FAIL;
#endif
     return
       ((p[0] & 0x07) << 18) +
       ((p[1] & 0x3f) << 12) +
       ((p[2] & 0x3f) << 6) +
       ((p[3] & 0x3f));
   } else if (c == 0xf4) {
     if (((unsigned char*)p)[1] > 0x8f) goto FAIL; // after 0x10ffff
     goto UTF8_4;
   } else {
   FAIL:
     *len = 1;
#if ERRORS_TO_ISO8859_1
     // The bad byte, interpreted as ISO-8859-1 (identity mapping).
     return c;
#else
     return 0xfffd; // Unicode REPLACEMENT CHARACTER
#endif
   }
}



More information about the cairo mailing list