From a9cfcf6bdef8738d0746f0b0d76fac8a95a84eed Mon Sep 17 00:00:00 2001 From: "Ryan C. Gordon" Date: Wed, 26 Jun 2024 21:32:45 -0400 Subject: [PATCH] stdinc: Drastically improve SDL_StepUTF8() and make it a public API. Fixes #10105. --- include/SDL3/SDL_stdinc.h | 61 +++++++++++++++++ src/dynapi/SDL_dynapi.sym | 1 + src/dynapi/SDL_dynapi_overrides.h | 1 + src/dynapi/SDL_dynapi_procs.h | 1 + src/filesystem/SDL_filesystem.c | 2 +- src/stdlib/SDL_string.c | 107 ++++++++++++++++++++---------- src/stdlib/SDL_sysstdlib.h | 2 - test/testiconv.c | 94 ++++++++++++++++++++------ 8 files changed, 211 insertions(+), 58 deletions(-) diff --git a/include/SDL3/SDL_stdinc.h b/include/SDL3/SDL_stdinc.h index a45958b29..532fc97fc 100644 --- a/include/SDL3/SDL_stdinc.h +++ b/include/SDL3/SDL_stdinc.h @@ -1250,6 +1250,67 @@ extern SDL_DECLSPEC int SDLCALL SDL_strcasecmp(const char *str1, const char *str */ extern SDL_DECLSPEC int SDLCALL SDL_strncasecmp(const char *str1, const char *str2, size_t maxlen); +/** + * The Unicode REPLACEMENT CHARACTER codepoint. + * + * SDL_StepUTF8() reports this codepoint when it encounters a UTF-8 string + * with encoding errors. + * + * This tends to render as something like a question mark in most places. + * + * \since This macro is available since SDL 3.0.0. + * + * \sa SDL_StepUTF8 + */ +#define SDL_INVALID_UNICODE_CODEPOINT 0xFFFD + +/** + * Decode a UTF-8 string, one Unicode codepoint at a time. + * + * This will return the first Unicode codepoint in the UTF-8 encoded + * string in `*pstr`, and then advance `*pstr` past any consumed bytes + * before returning. + * + * It will not access more than `*pslen` bytes from the string. + * `*pslen` will be adjusted, as well, subtracting the number of + * bytes consumed. + * + * `pslen` is allowed to be NULL, in which case the string _must_ be + * NULL-terminated, as the function will blindly read until it sees + * the NULL char. 
+ * + * If `*pslen` is zero, it assumes the end of string is reached and + * returns a zero codepoint regardless of the contents of the string + * buffer. + * + * If the resulting codepoint is zero (a NULL terminator), or `*pslen` + * is zero, it will not advance `*pstr` or `*pslen` at all. + * + * Generally this function is called in a loop until it returns zero, + * adjusting its parameters each iteration. + * + * If an invalid UTF-8 sequence is encountered, this function returns + * SDL_INVALID_UNICODE_CODEPOINT and advances the string/length by one + * byte (which is to say, a multibyte sequence might produce several + * SDL_INVALID_UNICODE_CODEPOINT returns before it syncs to the next + * valid UTF-8 sequence). + * + * Several things can generate invalid UTF-8 sequences, including + * overlong encodings, the use of UTF-16 surrogate values, and + * truncated data. Please refer to + * [RFC3629](https://www.ietf.org/rfc/rfc3629.txt) for details. + * + * \param pstr a pointer to a UTF-8 string pointer to be read and adjusted. + * \param pslen a pointer to the number of bytes in the string, to be read + * and adjusted. NULL is allowed. + * \returns the first Unicode codepoint in the string. + * + * \threadsafety It is safe to call this function from any thread. + * + * \since This function is available since SDL 3.0.0. + */ +extern SDL_DECLSPEC Uint32 SDLCALL SDL_StepUTF8(const char **pstr, size_t *pslen); + extern SDL_DECLSPEC int SDLCALL SDL_sscanf(const char *text, SDL_SCANF_FORMAT_STRING const char *fmt, ...) SDL_SCANF_VARARG_FUNC(2); extern SDL_DECLSPEC int SDLCALL SDL_vsscanf(const char *text, SDL_SCANF_FORMAT_STRING const char *fmt, va_list ap) SDL_SCANF_VARARG_FUNCV(2); extern SDL_DECLSPEC int SDLCALL SDL_snprintf(SDL_OUT_Z_CAP(maxlen) char *text, size_t maxlen, SDL_PRINTF_FORMAT_STRING const char *fmt, ... 
) SDL_PRINTF_VARARG_FUNC(3); diff --git a/src/dynapi/SDL_dynapi.sym b/src/dynapi/SDL_dynapi.sym index e3bcc9d44..f6ce6ba8b 100644 --- a/src/dynapi/SDL_dynapi.sym +++ b/src/dynapi/SDL_dynapi.sym @@ -788,6 +788,7 @@ SDL3_0.0.0 { SDL_SignalCondition; SDL_SoftStretch; SDL_StartTextInput; + SDL_StepUTF8; SDL_StopHapticEffect; SDL_StopHapticEffects; SDL_StopHapticRumble; diff --git a/src/dynapi/SDL_dynapi_overrides.h b/src/dynapi/SDL_dynapi_overrides.h index 2d71ca64a..8e238965b 100644 --- a/src/dynapi/SDL_dynapi_overrides.h +++ b/src/dynapi/SDL_dynapi_overrides.h @@ -813,6 +813,7 @@ #define SDL_SignalCondition SDL_SignalCondition_REAL #define SDL_SoftStretch SDL_SoftStretch_REAL #define SDL_StartTextInput SDL_StartTextInput_REAL +#define SDL_StepUTF8 SDL_StepUTF8_REAL #define SDL_StopHapticEffect SDL_StopHapticEffect_REAL #define SDL_StopHapticEffects SDL_StopHapticEffects_REAL #define SDL_StopHapticRumble SDL_StopHapticRumble_REAL diff --git a/src/dynapi/SDL_dynapi_procs.h b/src/dynapi/SDL_dynapi_procs.h index 9206f567b..4780080bb 100644 --- a/src/dynapi/SDL_dynapi_procs.h +++ b/src/dynapi/SDL_dynapi_procs.h @@ -823,6 +823,7 @@ SDL_DYNAPI_PROC(int,SDL_ShowWindowSystemMenu,(SDL_Window *a, int b, int c),(a,b, SDL_DYNAPI_PROC(int,SDL_SignalCondition,(SDL_Condition *a),(a),return) SDL_DYNAPI_PROC(int,SDL_SoftStretch,(SDL_Surface *a, const SDL_Rect *b, SDL_Surface *c, const SDL_Rect *d, SDL_ScaleMode e),(a,b,c,d,e),return) SDL_DYNAPI_PROC(int,SDL_StartTextInput,(SDL_Window *a),(a),return) +SDL_DYNAPI_PROC(Uint32,SDL_StepUTF8,(const char **a, size_t *b),(a,b),return) SDL_DYNAPI_PROC(int,SDL_StopHapticEffect,(SDL_Haptic *a, int b),(a,b),return) SDL_DYNAPI_PROC(int,SDL_StopHapticEffects,(SDL_Haptic *a),(a),return) SDL_DYNAPI_PROC(int,SDL_StopHapticRumble,(SDL_Haptic *a),(a),return) diff --git a/src/filesystem/SDL_filesystem.c b/src/filesystem/SDL_filesystem.c index d038d2840..4310bbb6a 100644 --- a/src/filesystem/SDL_filesystem.c +++ b/src/filesystem/SDL_filesystem.c @@ -185,7 
+185,7 @@ static char *CaseFoldUtf8String(const char *fname) Uint32 codepoint; char *ptr = retval; size_t remaining = allocation; - while ((codepoint = SDL_StepUTF8(&fname, 4)) != 0) { + while ((codepoint = SDL_StepUTF8(&fname, NULL)) != 0) { Uint32 folded[3]; const int num_folded = SDL_CaseFoldUnicode(codepoint, folded); SDL_assert(num_folded > 0); diff --git a/src/stdlib/SDL_string.c b/src/stdlib/SDL_string.c index fd494fdcb..554a746ff 100644 --- a/src/stdlib/SDL_string.c +++ b/src/stdlib/SDL_string.c @@ -32,9 +32,6 @@ #include "SDL_casefolding.h" -// this is the Unicode REPLACEMENT CHARACTER, used for invalid codepoint values. -#define INVALID_UNICODE_CODEPOINT 0xFFFD - #if defined(__SIZEOF_WCHAR_T__) #define SDL_SIZEOF_WCHAR_T __SIZEOF_WCHAR_T__ #elif defined(SDL_PLATFORM_WINDOWS) @@ -129,7 +126,7 @@ int SDL_CaseFoldUnicode(const Uint32 from, Uint32 *to) cp1 = folded1[tail1++]; \ } else { \ const Uint##bits *str1start = (const Uint##bits *) str1; \ - head1 = SDL_CaseFoldUnicode(SDL_StepUTF##bits(&str1, slen1), folded1); \ + head1 = SDL_CaseFoldUnicode(StepUTF##bits(&str1, slen1), folded1); \ update_slen1; \ cp1 = folded1[0]; \ tail1 = 1; \ @@ -138,7 +135,7 @@ int SDL_CaseFoldUnicode(const Uint32 from, Uint32 *to) cp2 = folded2[tail2++]; \ } else { \ const Uint##bits *str2start = (const Uint##bits *) str2; \ - head2 = SDL_CaseFoldUnicode(SDL_StepUTF##bits(&str2, slen2), folded2); \ + head2 = SDL_CaseFoldUnicode(StepUTF##bits(&str2, slen2), folded2); \ update_slen2; \ cp2 = folded2[0]; \ tail2 = 1; \ @@ -154,12 +151,23 @@ int SDL_CaseFoldUnicode(const Uint32 from, Uint32 *to) return 0 -Uint32 SDL_StepUTF8(const char **_str, const size_t slen) +static Uint32 StepUTF8(const char **_str, const size_t slen) { - const char *str = *_str; - const Uint32 octet = (Uint32) (slen ? ((Uint8) *str) : 0); + /* + * From rfc3629, the UTF-8 spec: + * https://www.ietf.org/rfc/rfc3629.txt + * + * Char. 
number range | UTF-8 octet sequence + * (hexadecimal) | (binary) + * --------------------+--------------------------------------------- + * 0000 0000-0000 007F | 0xxxxxxx + * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx + * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx + * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + */ - // !!! FIXME: this could have _way_ more error checking! Illegal surrogate codepoints, unexpected bit patterns, etc. + const Uint8 *str = (const Uint8 *) *_str; + const Uint32 octet = (Uint32) (slen ? *str : 0); if (octet == 0) { // null terminator, end of string. return 0; // don't advance `*_str`. @@ -167,41 +175,73 @@ Uint32 SDL_StepUTF8(const char **_str, const size_t slen) (*_str)++; return octet; } else if (((octet & 0xE0) == 0xC0) && (slen >= 2)) { // 110xxxxx 10xxxxxx: two byte codepoint. - if (slen >= 2) { - *_str += 2; - return ((octet & 0x1F) << 6) | (((Uint8) str[1]) & 0x3F); + const Uint8 str1 = str[1]; + if ((str1 & 0xC0) == 0x80) { // If trailing bytes aren't 10xxxxxx, sequence is bogus. + const Uint32 retval = ((octet & 0x1F) << 6) | (str1 & 0x3F); + if (retval >= 0x0080) { // rfc3629 says you can't use overlong sequences for smaller values. + *_str += 2; + return retval; + } } } else if (((octet & 0xF0) == 0xE0) && (slen >= 3)) { // 1110xxxx 10xxxxxx 10xxxxxx: three byte codepoint. - *_str += 3; - const Uint32 octet2 = ((Uint32) (((Uint8) str[1]) & 0x1F)) << 6; - const Uint32 octet3 = (Uint32) (((Uint8) str[2]) & 0x3F); - return ((octet & 0x0F) << 12) | octet2 | octet3; + const Uint8 str1 = str[1]; + const Uint8 str2 = ((str1 & 0xC0) == 0x80) ? str[2] : 0; // only read further once the previous byte is a continuation byte, so a NULL-terminated string is never read past its terminator. + if (((str1 & 0xC0) == 0x80) && ((str2 & 0xC0) == 0x80)) { // If trailing bytes aren't 10xxxxxx, sequence is bogus. + const Uint32 octet2 = ((Uint32) (str1 & 0x3F)) << 6; + const Uint32 octet3 = ((Uint32) (str2 & 0x3F)); + const Uint32 retval = ((octet & 0x0F) << 12) | octet2 | octet3; + if (retval >= 0x800) { // rfc3629 says you can't use overlong sequences for smaller values. 
+ if ((retval < 0xD800) || (retval > 0xDFFF)) { // UTF-16 surrogate values are illegal in UTF-8. + *_str += 3; + return retval; + } + } + } } else if (((octet & 0xF8) == 0xF0) && (slen >= 4)) { // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx: four byte codepoint. - *_str += 4; - const Uint32 octet2 = ((Uint32) (((Uint8) str[1]) & 0x1F)) << 12; - const Uint32 octet3 = ((Uint32) (((Uint8) str[2]) & 0x3F)) << 6; - const Uint32 octet4 = (Uint32) (((Uint8) str[3]) & 0x3F); - return ((octet & 0x07) << 18) | octet2 | octet3 | octet4; + const Uint8 str1 = str[1]; + const Uint8 str2 = ((str1 & 0xC0) == 0x80) ? str[2] : 0; // only read further once the previous byte is a continuation byte, so a NULL-terminated string is never read past its terminator. + const Uint8 str3 = ((str2 & 0xC0) == 0x80) ? str[3] : 0; + if (((str1 & 0xC0) == 0x80) && ((str2 & 0xC0) == 0x80) && ((str3 & 0xC0) == 0x80)) { // If trailing bytes aren't 10xxxxxx, sequence is bogus. + const Uint32 octet2 = ((Uint32) (str1 & 0x3F)) << 12; // continuation bytes carry six payload bits (10xxxxxx); masking with 0x1F would drop one. + const Uint32 octet3 = ((Uint32) (str2 & 0x3F)) << 6; + const Uint32 octet4 = ((Uint32) (str3 & 0x3F)); + const Uint32 retval = ((octet & 0x07) << 18) | octet2 | octet3 | octet4; + if ((retval >= 0x10000) && (retval <= 0x10FFFF)) { // rfc3629 says you can't use overlong sequences for smaller values, and caps codepoints at U+10FFFF. + *_str += 4; + return retval; + } + } } // bogus byte, skip ahead, return a REPLACEMENT CHARACTER. (*_str)++; - return INVALID_UNICODE_CODEPOINT; + return SDL_INVALID_UNICODE_CODEPOINT; +} + +Uint32 SDL_StepUTF8(const char **pstr, size_t *pslen) +{ + if (!pslen) { + return StepUTF8(pstr, 4); // 4 == max codepoint size. + } + const char *origstr = *pstr; + const Uint32 retval = StepUTF8(pstr, *pslen); + *pslen -= (size_t) (*pstr - origstr); + return retval; } #if (SDL_SIZEOF_WCHAR_T == 2) -static Uint32 SDL_StepUTF16(const Uint16 **_str, const size_t slen) +static Uint32 StepUTF16(const Uint16 **_str, const size_t slen) { const Uint16 *str = *_str; Uint32 cp = (Uint32) *(str++); if (cp == 0) { return 0; // don't advance string pointer. 
} else if ((cp >= 0xDC00) && (cp <= 0xDFFF)) { - cp = INVALID_UNICODE_CODEPOINT; // Orphaned second half of surrogate pair + cp = SDL_INVALID_UNICODE_CODEPOINT; // Orphaned second half of surrogate pair } else if ((cp >= 0xD800) && (cp <= 0xDBFF)) { // start of surrogate pair! const Uint32 pair = (Uint32) *str; if ((pair == 0) || ((pair < 0xDC00) || (pair > 0xDFFF))) { - cp = INVALID_UNICODE_CODEPOINT; + cp = SDL_INVALID_UNICODE_CODEPOINT; } else { str++; // eat the other surrogate. cp = 0x10000 + (((cp - 0xD800) << 10) | (pair - 0xDC00)); @@ -209,10 +249,10 @@ static Uint32 SDL_StepUTF16(const Uint16 **_str, const size_t slen) } *_str = str; - return (cp > 0x10FFFF) ? INVALID_UNICODE_CODEPOINT : cp; + return (cp > 0x10FFFF) ? SDL_INVALID_UNICODE_CODEPOINT : cp; } #elif (SDL_SIZEOF_WCHAR_T == 4) -static Uint32 SDL_StepUTF32(const Uint32 **_str, const size_t slen) +static Uint32 StepUTF32(const Uint32 **_str, const size_t slen) { if (!slen) { return 0; @@ -225,7 +265,7 @@ static Uint32 SDL_StepUTF32(const Uint32 **_str, const size_t slen) } (*_str)++; - return (cp > 0x10FFFF) ? INVALID_UNICODE_CODEPOINT : cp; + return (cp > 0x10FFFF) ? 
SDL_INVALID_UNICODE_CODEPOINT : cp; } #endif @@ -816,7 +856,7 @@ size_t SDL_utf8strlcpy(SDL_OUT_Z_CAP(dst_bytes) char *dst, const char *src, size size_t SDL_utf8strlen(const char *str) { size_t retval = 0; - while (SDL_StepUTF8(&str, 4)) { + while (SDL_StepUTF8(&str, NULL)) { retval++; } return retval; @@ -825,14 +865,9 @@ size_t SDL_utf8strlen(const char *str) size_t SDL_utf8strnlen(const char *str, size_t bytes) { size_t retval = 0; - const char *strstart = str; - - while (SDL_StepUTF8(&str, bytes)) { - bytes -= (size_t) (str - strstart); - strstart = str; + while (SDL_StepUTF8(&str, &bytes)) { retval++; } - return retval; } @@ -983,7 +1018,7 @@ char *SDL_strcasestr(const char *haystack, const char *needle) if (SDL_strncasecmp(haystack, needle, length) == 0) { return (char *)haystack; } - } while (SDL_StepUTF8(&haystack, 4)); // move ahead by a full codepoint at a time, regardless of bytes. + } while (SDL_StepUTF8(&haystack, NULL)); // move ahead by a full codepoint at a time, regardless of bytes. return NULL; } diff --git a/src/stdlib/SDL_sysstdlib.h b/src/stdlib/SDL_sysstdlib.h index ef7ec0748..305fd31d3 100644 --- a/src/stdlib/SDL_sysstdlib.h +++ b/src/stdlib/SDL_sysstdlib.h @@ -25,8 +25,6 @@ // most things you might need internally in here are public APIs, this is // just a few special pieces right now. -Uint32 SDL_StepUTF8(const char **_str, const size_t slen); - // this expects `from` to be a Unicode codepoint, and `to` to point to AT LEAST THREE Uint32s. int SDL_CaseFoldUnicode(const Uint32 from, Uint32 *to); diff --git a/test/testiconv.c b/test/testiconv.c index b7e71edbd..54eb90245 100644 --- a/test/testiconv.c +++ b/test/testiconv.c @@ -10,13 +10,6 @@ freely. 
*/ -/* quiet windows compiler warnings */ -#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS) -#define _CRT_SECURE_NO_WARNINGS -#endif - -#include - #include #include #include @@ -33,6 +26,34 @@ widelen(char *data) return len; } +static char *get_next_line(Uint8 **fdataptr, size_t *fdatalen) +{ + char *retval = (char *) *fdataptr; + Uint8 *ptr = *fdataptr; + size_t len = *fdatalen; + + if (len == 0) { + return NULL; + } + + while (len > 0) { + if (*ptr == '\r') { + *ptr = '\0'; + } else if (*ptr == '\n') { + *ptr = '\0'; + ptr++; + len--; + break; + } + ptr++; + len--; + } + + *fdataptr = ptr; + *fdatalen = len; + return retval; +} + int main(int argc, char *argv[]) { const char *formats[] = { @@ -51,13 +72,15 @@ int main(int argc, char *argv[]) }; char *fname = NULL; - char buffer[BUFSIZ]; char *ucs4; char *test[2]; int i; - FILE *file; int errors = 0; SDLTest_CommonState *state; + Uint8 *fdata = NULL; + Uint8 *fdataptr = NULL; + char *line = NULL; + size_t fdatalen = 0; /* Initialize test framework */ state = SDLTest_CommonCreateState(argv, 0); @@ -89,20 +112,19 @@ int main(int argc, char *argv[]) } fname = GetResourceFilename(fname, "utf8.txt"); - file = fopen(fname, "rb"); - if (!file) { - SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Unable to open %s\n", fname); + fdata = (Uint8 *) (fname ? 
SDL_LoadFile(fname, &fdatalen) : NULL); + if (!fdata) { + SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Unable to load %s\n", fname); return 1; } - SDL_free(fname); - while (fgets(buffer, sizeof(buffer), file)) { + fdataptr = fdata; + while ((line = get_next_line(&fdataptr, &fdatalen)) != NULL) { /* Convert to UCS-4 */ size_t len; - ucs4 = - SDL_iconv_string("UCS-4", "UTF-8", buffer, - SDL_strlen(buffer) + 1); + ucs4 = SDL_iconv_string("UCS-4", "UTF-8", line, SDL_strlen(line) + 1); len = (widelen(ucs4) + 1) * 4; + for (i = 0; i < SDL_arraysize(formats); ++i) { test[0] = SDL_iconv_string(formats[i], "UCS-4", ucs4, len); test[1] = SDL_iconv_string("UCS-4", formats[i], test[0], len); @@ -115,10 +137,44 @@ int main(int argc, char *argv[]) } test[0] = SDL_iconv_string("UTF-8", "UCS-4", ucs4, len); SDL_free(ucs4); - (void)fputs(test[0], stdout); + SDL_Log("%s", test[0]); SDL_free(test[0]); } - (void)fclose(file); + SDL_free(fdata); + + #if 0 + { + Uint32 *ucs4buf; + Uint32 *ucs4ptr; + char *utf8out; + Uint32 cp; + SDL_IOStream *io; + + fdata = (Uint8 *) (fname ? SDL_LoadFile(fname, &fdatalen) : NULL); + if (!fdata) { + SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Unable to load %s\n", fname); + return 1; + } + + ucs4buf = (Uint32 *) SDL_malloc(fdatalen * 4); + ucs4ptr = ucs4buf; + + fdataptr = fdata; + while ((cp = SDL_StepUTF8((const char **) &fdataptr, &fdatalen)) != 0) { + *(ucs4ptr++) = SDL_Swap32BE(cp); + } + *(ucs4ptr++) = 0; + utf8out = SDL_iconv_string("UTF-8", "UCS-4", (const char *) ucs4buf, (size_t) ((ucs4ptr - ucs4buf)) * 4); + io = SDL_IOFromFile("test_steputf8.txt", "wb"); + SDL_WriteIO(io, utf8out, SDL_strlen(utf8out)); + SDL_CloseIO(io); + SDL_free(ucs4buf); + SDL_free(utf8out); + SDL_free(fdata); + } + #endif + + SDL_free(fname); SDL_LogInfo(SDL_LOG_CATEGORY_APPLICATION, "Total errors: %d\n", errors); SDL_Quit();