Implement UCN support for C string literals (C99 6.4.3) and add some very basic tests. Chris Goller has graciously offered to write some tests to help validate UCN support.

From a front-end perspective, I believe this code should also work for ObjC @-strings. At the moment, I believe we need to tweak the code generation for @-strings (which doesn't appear to handle UCNs). Will be investigating.

llvm-svn: 68076
This commit is contained in:
Steve Naroff 2009-03-30 23:46:03 +00:00
parent 29f80c343b
commit 7b753d21b5
3 changed files with 120 additions and 14 deletions

View File

@ -52,6 +52,10 @@ def ext_nonstandard_escape : Extension<
"use of non-standard escape character '\\%0'">;
def ext_unknown_escape : Extension<"unknown escape sequence '\\%0'">;
def err_hex_escape_no_digits : Error<"\\x used with no following hex digits">;
// Diagnostics for universal character names (\u and \U escapes, C99 6.4.3)
// appearing in string literals.
// No hex digit follows the escape introducer; emitted for both \u and \U.
def err_ucn_escape_no_digits : Error<"\\u used with no following hex digits">;
// The code point violates the constraints placed on universal character names.
def err_ucn_escape_invalid : Error<"invalid universal character">;
// Fewer hex digits than the escape requires (4 for \u, 8 for \U).
def err_ucn_escape_incomplete : Error<"incomplete universal character name">;
// The encoded form of the character does not fit in the result buffer.
def err_ucn_escape_too_big : Error<"universal character name is too long">;
def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">;
def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">;
def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">;

View File

@ -71,8 +71,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
case 'v':
ResultChar = 11;
break;
//case 'u': case 'U': // FIXME: UCNs.
case 'x': { // Hex escape.
ResultChar = 0;
if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
@ -151,7 +149,90 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
return ResultChar;
}
/// ProcessUCNEscape - Read a Universal Character Name (\uXXXX or \UXXXXXXXX),
/// check its constraints and append its UTF-8 encoding to the result buffer.
/// This is a subroutine of StringLiteralParser.
/// When we decide to implement UCN's for character constants and identifiers,
/// we will likely rework our support for UCN's.
///
/// \param ThisTokBuf   On entry, points at the backslash that starts the
///                     escape. It is advanced past every character consumed,
///                     including on the error paths.
/// \param ThisTokEnd   One past the last character that may be read.
/// \param ResultBuf    Write cursor into the output buffer. On success it is
///                     advanced past the encoded bytes; on error it is left
///                     untouched.
/// \param ResultBufEnd One past the end of the output buffer.
/// \param HadError     Set to true when a diagnostic is emitted; never
///                     cleared here.
/// \param Loc          Location used for any diagnostic.
/// \param PP           Preprocessor used to emit diagnostics.
static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
                             char *&ResultBuf, const char *ResultBufEnd,
                             bool &HadError,
                             SourceLocation Loc, Preprocessor &PP) {
  // FIXME: Add a warning - UCN's are only valid in C++ & C99.

  // Skip the '\u' char's.
  ThisTokBuf += 2;

  // isxdigit requires a value representable as unsigned char (or EOF), so
  // convert explicitly rather than passing a possibly negative plain char.
  if (ThisTokBuf == ThisTokEnd ||
      !isxdigit(static_cast<unsigned char>(*ThisTokBuf))) {
    PP.Diag(Loc, diag::err_ucn_escape_no_digits);
    HadError = true;
    return;
  }

  typedef unsigned int UTF32;
  UTF32 UcnVal = 0;
  // '\u' is followed by exactly 4 hex digits, '\U' by exactly 8.
  unsigned short UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
  for (; ThisTokBuf != ThisTokEnd && UcnLen; ++ThisTokBuf, UcnLen--) {
    int CharVal = HexDigitValue(ThisTokBuf[0]);
    if (CharVal == -1) break;
    UcnVal <<= 4;
    UcnVal |= CharVal;
  }

  // If we didn't consume the proper number of digits, there is a problem.
  if (UcnLen) {
    PP.Diag(Loc, diag::err_ucn_escape_incomplete);
    HadError = true;
    return;
  }

  // Check UCN constraints (C99 6.4.3p2): nothing below 0xA0 other than
  // $, @ and `, and nothing in the surrogate range. In addition, reject
  // anything above 0x10FFFF: such a value is not a Unicode code point and
  // cannot be represented by the 4-byte UTF-8 form produced below.
  const bool IsBelowA0 = UcnVal < 0xa0;
  const bool IsAllowedBasicChar =
      UcnVal == 0x24 || UcnVal == 0x40 || UcnVal == 0x60; // $, @, `
  const bool IsSurrogate = UcnVal >= 0xD800 && UcnVal <= 0xDFFF;
  const bool IsOutOfRange = UcnVal > 0x10FFFF;
  if ((IsBelowA0 && !IsAllowedBasicChar) || IsSurrogate || IsOutOfRange) {
    PP.Diag(Loc, diag::err_ucn_escape_invalid);
    HadError = true;
    return;
  }

  // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
  // The conversion below was inspired by:
  //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
  // First, we determine how many bytes the result will require.
  typedef unsigned char UTF8;

  unsigned short bytesToWrite = 0;
  if (UcnVal < (UTF32)0x80)
    bytesToWrite = 1;
  else if (UcnVal < (UTF32)0x800)
    bytesToWrite = 2;
  else if (UcnVal < (UTF32)0x10000)
    bytesToWrite = 3;
  else
    bytesToWrite = 4;

  // If the buffer isn't big enough, bail. Note that this check is
  // conservative: it demands at least one spare byte beyond the encoded
  // character.
  if ((ResultBuf + bytesToWrite) >= ResultBufEnd) {
    PP.Diag(Loc, diag::err_ucn_escape_too_big);
    HadError = true;
    return;
  }

  // Continuation bytes have the form 10xxxxxx: OR-ing in byteMark sets the
  // top bit and AND-ing with byteMask clears the bit below it.
  const unsigned byteMask = 0xBF;
  const unsigned byteMark = 0x80;

  // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
  // into the first byte, depending on how many bytes follow. There are
  // as many entries in this table as there are UTF8 sequence types.
  static const UTF8 firstByteMark[7] = {
    0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
  };

  // Finally, we write the bytes into ResultBuf. The bytes are emitted from
  // last to first, so move the cursor to the end of the sequence and walk
  // back to its start.
  ResultBuf += bytesToWrite;
  switch (bytesToWrite) { // note: everything falls through.
    case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
    case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
    case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
    case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
  }

  // The cursor is back at the first byte written; advance it past the whole
  // sequence so the caller continues after it.
  ResultBuf += bytesToWrite;
}
/// integer-constant: [C99 6.4.4.1]
@ -757,23 +838,29 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
*ResultPtr++ = InStart[0];
// Add zeros at the end.
for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
*ResultPtr++ = 0;
*ResultPtr++ = 0;
}
}
continue;
}
// Otherwise, this is an escape character. Process it.
unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
StringToks[i].getLocation(),
ThisIsWide, PP);
// Note: our internal rep of wide char tokens is always little-endian.
*ResultPtr++ = ResultChar & 0xFF;
if (AnyWide) {
for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
*ResultPtr++ = ResultChar >> i*8;
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
ProcessUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
GetString() + ResultBuf.size(),
hadError, StringToks[i].getLocation(), PP);
} else {
// Otherwise, this is a non-UCN escape character. Process it.
unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
StringToks[i].getLocation(),
ThisIsWide, PP);
// Note: our internal rep of wide char tokens is always little-endian.
*ResultPtr++ = ResultChar & 0xFF;
if (AnyWide) {
for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
*ResultPtr++ = ResultChar >> i*8;
}
}
}
}

View File

@ -0,0 +1,15 @@
// RUN: clang-cc %s -verify -fsyntax-only -pedantic
#include <stdio.h>
int main(void) {
  // Well-formed UCNs. sizeof yields a size_t, so it is printed with %zu
  // rather than %d.
  printf("%s (%zu)\n", "hello \u2192 \u2603 \u2190 world", sizeof("hello \u2192 \u2603 \u2190 world"));
  printf("%s (%zu)\n", "\U00010400\U0001D12B", sizeof("\U00010400\U0001D12B"));
  // Some error conditions...
  // No hex digits at all after the escape introducer.
  printf("%s\n", "\U"); // expected-error{{\u used with no following hex digits}}
  // The upper-case form needs eight hex digits; two and four are too few.
  printf("%s\n", "\U00"); // expected-error{{incomplete universal character name}}
  printf("%s\n", "\U0001"); // expected-error{{incomplete universal character name}}
  // A value below 0xA0 that is not one of $, @ or ` is rejected.
  printf("%s\n", "\u0001"); // expected-error{{invalid universal character}}
  return 0;
}