[cairo] Unicode error causing Cairo to crash.

Tue Apr 26 14:29:57 PDT 2005

Owen Taylor wrote:
> 
> cairo_text_extents() takes a UTF-8 string ... for anything > 256,
> what you are putting into 'c' isn't valid UTF-8 character.
> 
> So, it shouldn't work. But it shouldn't crash either ... if it does,
> file a bug.
> 
> Regards,

Can we *please* make Cairo (and Pango or any other interface) draw 
illegal parts of UTF-8 as though the bytes are individual Unicode 
characters? This makes converting existing APIs to UTF-8 trivial and 
would greatly encourage I18N. I have never seen (and I challenge anybody 
to come up with) an ISO-8859-1 text in any language that does not draw 
correctly when this is done.

Also please make sure that any UTF-8 encoded with "more bytes than 
necessary" is illegal.

Attached is my code for converting UTF-8 to Unicode indices that I hope 
can be used. It also converts Microsoft CP1252 (like Word writes to HTML 
pages) but that may be deleted if you want. I also accept a few 
sequences that RFC 3629 says are illegal, because rejecting them would 
make some Unicode indices impossible to represent.

// Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
// to Unicode. Index the table with (byte - 0x80).
// The five bytes that CP1252 leaves unassigned (0x81, 0x8d, 0x8f, 0x90
// and 0x9d) map to the code point with the same numeric value.
// The table is read-only lookup data, hence const.
static const unsigned short cp1252[32] = {
   0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
   0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
   0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
   0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
};

/*! Decode a single UTF-8 encoded character starting at \e p. The
     resulting Unicode value (in the range 0-0x10ffff) is returned,
     and \e len is set to the number of bytes in the UTF-8 encoding
     (adding \e len to \e p will point at the next character).

     \e p must point at a readable byte (p < end): the first byte is
     always read, and \e end is only consulted before any further
     byte is examined.

     If \e p points at an illegal UTF-8 encoding, including one that
     would go past \e end, or one that uses more bytes than
     necessary, then *(unsigned char*)p is translated as though it is
     in the Microsoft CP1252 character set and \e len is set to 1.
     Treating errors this way allows this to decode almost any
     ISO-8859-1 or CP1252 text that has been mistakenly placed where
     UTF-8 is expected, and has proven very useful.

     Encodings of the surrogates (0xd800-0xdfff), which RFC 3629
     forbids, and of values ending in fffe or ffff are deliberately
     accepted so that every 16-bit value survives a round trip through
     utf8encode/utf8decode. The checks that would reject them are kept
     under "#if 0" in the body.

     If you want errors to be converted to error characters (as the
     standards recommend), adding a test to see if the length is
     unexpectedly 1 will work:

\code
     if (*p & 0x80) { // what should be a multibyte encoding
       code = utf8decode(p,end,&len);
       if (len<2) code = 0xFFFD; // Turn errors into REPLACEMENT CHARACTER
     } else { // handle the 1-byte utf8 encoding:
       code = *p;
       len = 1;
     }
\endcode

     Direct testing for the 1-byte case (as shown above) will also
     speed up the scanning of strings where the majority of characters
     are ASCII. If you don't care for the CP1252 translation you
     should use *p if it is not in the range 0xc2 through 0xf4.
*/
unsigned utf8decode(const char* p, const char* end, int* len)
{
   // View the input as unsigned bytes once, without casting away const.
   const unsigned char* s = (const unsigned char*)p;
   const unsigned char c = s[0];

   if (c < 0x80) { // plain ASCII
     *len = 1;
     return c;
   }
   if (c < 0xa0) { // can never start a sequence: map through CP1252
     *len = 1;
     return cp1252[c-0x80];
   }
   if (c < 0xc2) {
     // 0xa0-0xbf are bare continuation bytes and 0xc0/0xc1 could only
     // begin an overlong 2-byte form; both pass through unchanged.
     *len = 1;
     return c;
   }

   // Every remaining lead byte must be followed by a continuation byte.
   // Bounds are tested as a byte count (end - p) rather than by forming
   // p+2 or p+3, which could point further than one past the buffer and
   // is undefined behavior in C.
   if (end - p < 2 || (s[1]&0xc0) != 0x80) goto FAIL;

   if (c < 0xe0) { // 2 bytes: 0x80-0x7ff
     *len = 2;
     return
       ((unsigned)(c & 0x1f) << 6) +
       ((unsigned)(s[1] & 0x3f));
   }

   if (c < 0xf0) { // 3 bytes: 0x800-0xffff
     // 0xe0 followed by less than 0xa0 is an overlong form of a value
     // below 0x800.
     if (c == 0xe0 && s[1] < 0xa0) goto FAIL;
     if (end - p < 3 || (s[2]&0xc0) != 0x80) goto FAIL;
#if 0
     // RFC 3629 says surrogate chars are illegal.
     // Not checked so that all 16-bit values are preserved
     // when going through utf8encode/utf8decode.
     if (c == 0xed && s[1] >= 0xa0) goto FAIL;
     // 0xfffe and 0xffff are also illegal characters.
     // Again not checked so 16-bit values are preserved.
     if (c == 0xef && s[1] == 0xbf && s[2] >= 0xbe) goto FAIL;
#endif
     *len = 3;
     return
       ((unsigned)(c & 0x0f) << 12) +
       ((unsigned)(s[1] & 0x3f) << 6) +
       ((unsigned)(s[2] & 0x3f));
   }

   if (c <= 0xf4) { // 4 bytes: 0x10000-0x10ffff
     // 0xf0 followed by less than 0x90 is an overlong form of a value
     // below 0x10000.
     if (c == 0xf0 && s[1] < 0x90) goto FAIL;
     if (c == 0xf4 && s[1] > 0x8f) goto FAIL; // after 0x10ffff
     if (end - p < 4 || (s[2]&0xc0) != 0x80 || (s[3]&0xc0) != 0x80)
       goto FAIL;
#if 0
     // Reject all codes ending in fffe or ffff:
     if ((s[1]&0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe) goto FAIL;
#endif
     *len = 4;
     return
       ((unsigned)(c & 0x07) << 18) +
       ((unsigned)(s[1] & 0x3f) << 12) +
       ((unsigned)(s[2] & 0x3f) << 6) +
       ((unsigned)(s[3] & 0x3f));
   }

   // Lead bytes 0xf5-0xff fall through to here: they cannot begin any
   // encoding of a value up to 0x10ffff.
 FAIL:
   // Illegal or truncated sequence: consume one byte and return it as is
   // (c is at least 0xc2 here, so the cp1252 table does not apply).
   *len = 1;
   return c;
}