stdinc: Drastically improve SDL_StepUTF8() and make it a public API.

Fixes #10105.
This commit is contained in:
Ryan C. Gordon 2024-06-26 21:32:45 -04:00
parent 9b8c5f642f
commit a9cfcf6bde
8 changed files with 211 additions and 58 deletions

View File

@ -1250,6 +1250,67 @@ extern SDL_DECLSPEC int SDLCALL SDL_strcasecmp(const char *str1, const char *str
*/
extern SDL_DECLSPEC int SDLCALL SDL_strncasecmp(const char *str1, const char *str2, size_t maxlen);
/**
* The Unicode REPLACEMENT CHARACTER codepoint.
*
* SDL_StepUTF8() reports this codepoint when it encounters a UTF-8 string
* with encoding errors.
*
* This tends to render as something like a question mark in most places.
*
* \since This macro is available since SDL 3.0.0.
*
* \sa SDL_StepUTF8
*/
#define SDL_INVALID_UNICODE_CODEPOINT 0xFFFD
/**
* Decode a UTF-8 string, one Unicode codepoint at a time.
*
* This will return the first Unicode codepoint in the UTF-8 encoded
* string in `*pstr`, and then advance `*pstr` past any consumed bytes
* before returning.
*
* It will not access more than `*pslen` bytes from the string.
* `*pslen` will be adjusted, as well, subtracting the number of
* bytes consumed.
*
* `pslen` is allowed to be NULL, in which case the string _must_ be
* NULL-terminated, as the function will blindly read until it sees
* the NULL char.
*
* If `*pslen` is zero, it assumes the end of string is reached and
* returns a zero codepoint regardless of the contents of the string
* buffer.
*
* If the resulting codepoint is zero (a NULL terminator), or `*pslen`
* is zero, it will not advance `*pstr` or `*pslen` at all.
*
* Generally this function is called in a loop until it returns zero,
* adjusting its parameters each iteration.
*
* If an invalid UTF-8 sequence is encountered, this function returns
* SDL_INVALID_UNICODE_CODEPOINT and advances the string/length by one
* byte (which is to say, a multibyte sequence might produce several
* SDL_INVALID_UNICODE_CODEPOINT returns before it syncs to the next
* valid UTF-8 sequence).
*
* Several things can generate invalid UTF-8 sequences, including
* overlong encodings, the use of UTF-16 surrogate values, and
* truncated data. Please refer to
* [RFC3629](https://www.ietf.org/rfc/rfc3629.txt) for details.
*
* \param pstr a pointer to a UTF-8 string pointer to be read and adjusted.
* \param pslen a pointer to the number of bytes in the string, to be read
* and adjusted. NULL is allowed.
* \returns the first Unicode codepoint in the string.
*
* \threadsafety It is safe to call this function from any thread.
*
* \since This function is available since SDL 3.0.0.
*/
extern SDL_DECLSPEC Uint32 SDLCALL SDL_StepUTF8(const char **pstr, size_t *pslen);
extern SDL_DECLSPEC int SDLCALL SDL_sscanf(const char *text, SDL_SCANF_FORMAT_STRING const char *fmt, ...) SDL_SCANF_VARARG_FUNC(2);
extern SDL_DECLSPEC int SDLCALL SDL_vsscanf(const char *text, SDL_SCANF_FORMAT_STRING const char *fmt, va_list ap) SDL_SCANF_VARARG_FUNCV(2);
extern SDL_DECLSPEC int SDLCALL SDL_snprintf(SDL_OUT_Z_CAP(maxlen) char *text, size_t maxlen, SDL_PRINTF_FORMAT_STRING const char *fmt, ... ) SDL_PRINTF_VARARG_FUNC(3);

View File

@ -788,6 +788,7 @@ SDL3_0.0.0 {
SDL_SignalCondition;
SDL_SoftStretch;
SDL_StartTextInput;
SDL_StepUTF8;
SDL_StopHapticEffect;
SDL_StopHapticEffects;
SDL_StopHapticRumble;

View File

@ -813,6 +813,7 @@
#define SDL_SignalCondition SDL_SignalCondition_REAL
#define SDL_SoftStretch SDL_SoftStretch_REAL
#define SDL_StartTextInput SDL_StartTextInput_REAL
#define SDL_StepUTF8 SDL_StepUTF8_REAL
#define SDL_StopHapticEffect SDL_StopHapticEffect_REAL
#define SDL_StopHapticEffects SDL_StopHapticEffects_REAL
#define SDL_StopHapticRumble SDL_StopHapticRumble_REAL

View File

@ -823,6 +823,7 @@ SDL_DYNAPI_PROC(int,SDL_ShowWindowSystemMenu,(SDL_Window *a, int b, int c),(a,b,
SDL_DYNAPI_PROC(int,SDL_SignalCondition,(SDL_Condition *a),(a),return)
SDL_DYNAPI_PROC(int,SDL_SoftStretch,(SDL_Surface *a, const SDL_Rect *b, SDL_Surface *c, const SDL_Rect *d, SDL_ScaleMode e),(a,b,c,d,e),return)
SDL_DYNAPI_PROC(int,SDL_StartTextInput,(SDL_Window *a),(a),return)
SDL_DYNAPI_PROC(Uint32,SDL_StepUTF8,(const char **a, size_t *b),(a,b),return)
SDL_DYNAPI_PROC(int,SDL_StopHapticEffect,(SDL_Haptic *a, int b),(a,b),return)
SDL_DYNAPI_PROC(int,SDL_StopHapticEffects,(SDL_Haptic *a),(a),return)
SDL_DYNAPI_PROC(int,SDL_StopHapticRumble,(SDL_Haptic *a),(a),return)

View File

@ -185,7 +185,7 @@ static char *CaseFoldUtf8String(const char *fname)
Uint32 codepoint;
char *ptr = retval;
size_t remaining = allocation;
while ((codepoint = SDL_StepUTF8(&fname, 4)) != 0) {
while ((codepoint = SDL_StepUTF8(&fname, NULL)) != 0) {
Uint32 folded[3];
const int num_folded = SDL_CaseFoldUnicode(codepoint, folded);
SDL_assert(num_folded > 0);

View File

@ -32,9 +32,6 @@
#include "SDL_casefolding.h"
// this is the Unicode REPLACEMENT CHARACTER, used for invalid codepoint values.
#define INVALID_UNICODE_CODEPOINT 0xFFFD
#if defined(__SIZEOF_WCHAR_T__)
#define SDL_SIZEOF_WCHAR_T __SIZEOF_WCHAR_T__
#elif defined(SDL_PLATFORM_WINDOWS)
@ -129,7 +126,7 @@ int SDL_CaseFoldUnicode(const Uint32 from, Uint32 *to)
cp1 = folded1[tail1++]; \
} else { \
const Uint##bits *str1start = (const Uint##bits *) str1; \
head1 = SDL_CaseFoldUnicode(SDL_StepUTF##bits(&str1, slen1), folded1); \
head1 = SDL_CaseFoldUnicode(StepUTF##bits(&str1, slen1), folded1); \
update_slen1; \
cp1 = folded1[0]; \
tail1 = 1; \
@ -138,7 +135,7 @@ int SDL_CaseFoldUnicode(const Uint32 from, Uint32 *to)
cp2 = folded2[tail2++]; \
} else { \
const Uint##bits *str2start = (const Uint##bits *) str2; \
head2 = SDL_CaseFoldUnicode(SDL_StepUTF##bits(&str2, slen2), folded2); \
head2 = SDL_CaseFoldUnicode(StepUTF##bits(&str2, slen2), folded2); \
update_slen2; \
cp2 = folded2[0]; \
tail2 = 1; \
@ -154,12 +151,23 @@ int SDL_CaseFoldUnicode(const Uint32 from, Uint32 *to)
return 0
Uint32 SDL_StepUTF8(const char **_str, const size_t slen)
static Uint32 StepUTF8(const char **_str, const size_t slen)
{
const char *str = *_str;
const Uint32 octet = (Uint32) (slen ? ((Uint8) *str) : 0);
/*
* From rfc3629, the UTF-8 spec:
* https://www.ietf.org/rfc/rfc3629.txt
*
* Char. number range | UTF-8 octet sequence
* (hexadecimal) | (binary)
* --------------------+---------------------------------------------
* 0000 0000-0000 007F | 0xxxxxxx
* 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
* 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
* 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
// !!! FIXME: this could have _way_ more error checking! Illegal surrogate codepoints, unexpected bit patterns, etc.
const Uint8 *str = (const Uint8 *) *_str;
const Uint32 octet = (Uint32) (slen ? *str : 0);
if (octet == 0) { // null terminator, end of string.
return 0; // don't advance `*_str`.
@ -167,41 +175,73 @@ Uint32 SDL_StepUTF8(const char **_str, const size_t slen)
(*_str)++;
return octet;
} else if (((octet & 0xE0) == 0xC0) && (slen >= 2)) { // 110xxxxx 10xxxxxx: two byte codepoint.
if (slen >= 2) {
*_str += 2;
return ((octet & 0x1F) << 6) | (((Uint8) str[1]) & 0x3F);
const Uint8 str1 = str[1];
if ((str1 & 0xC0) == 0x80) { // If trailing bytes aren't 10xxxxxx, sequence is bogus.
const Uint32 retval = ((octet & 0x1F) << 6) | (str1 & 0x3F);
if (retval >= 0x0080) { // rfc3629 says you can't use overlong sequences for smaller values.
*_str += 2;
return retval;
}
}
} else if (((octet & 0xF0) == 0xE0) && (slen >= 3)) { // 1110xxxx 10xxxxxx 10xxxxxx: three byte codepoint.
*_str += 3;
const Uint32 octet2 = ((Uint32) (((Uint8) str[1]) & 0x1F)) << 6;
const Uint32 octet3 = (Uint32) (((Uint8) str[2]) & 0x3F);
return ((octet & 0x0F) << 12) | octet2 | octet3;
const Uint8 str1 = str[1];
const Uint8 str2 = str[2];
if (((str1 & 0xC0) == 0x80) && ((str2 & 0xC0) == 0x80)) { // If trailing bytes aren't 10xxxxxx, sequence is bogus.
const Uint32 octet2 = ((Uint32) (str1 & 0x3F)) << 6;
const Uint32 octet3 = ((Uint32) (str2 & 0x3F));
const Uint32 retval = ((octet & 0x0F) << 12) | octet2 | octet3;
if (retval >= 0x800) { // rfc3629 says you can't use overlong sequences for smaller values.
if ((retval < 0xD800) || (retval > 0xDFFF)) { // UTF-16 surrogate values are illegal in UTF-8.
*_str += 3;
return retval;
}
}
}
} else if (((octet & 0xF8) == 0xF0) && (slen >= 4)) { // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx: four byte codepoint.
*_str += 4;
const Uint32 octet2 = ((Uint32) (((Uint8) str[1]) & 0x1F)) << 12;
const Uint32 octet3 = ((Uint32) (((Uint8) str[2]) & 0x3F)) << 6;
const Uint32 octet4 = (Uint32) (((Uint8) str[3]) & 0x3F);
return ((octet & 0x07) << 18) | octet2 | octet3 | octet4;
const Uint8 str1 = str[1];
const Uint8 str2 = str[2];
const Uint8 str3 = str[3];
if (((str1 & 0xC0) == 0x80) && ((str2 & 0xC0) == 0x80) && ((str3 & 0xC0) == 0x80)) { // If trailing bytes aren't 10xxxxxx, sequence is bogus.
    // Every 10xxxxxx continuation byte carries six payload bits, so all three are masked with 0x3F.
    // (Masking the second byte with 0x1F drops bit 17 and rejects valid codepoints like U+20000.)
    const Uint32 octet2 = ((Uint32) (str1 & 0x3F)) << 12;
    const Uint32 octet3 = ((Uint32) (str2 & 0x3F)) << 6;
    const Uint32 octet4 = ((Uint32) (str3 & 0x3F));
    const Uint32 retval = ((octet & 0x07) << 18) | octet2 | octet3 | octet4;
    // rfc3629 says you can't use overlong sequences for smaller values, and caps the range at U+10FFFF.
    if ((retval >= 0x10000) && (retval <= 0x10FFFF)) {
        *_str += 4;
        return retval;
    }
}
}
// bogus byte, skip ahead, return a REPLACEMENT CHARACTER.
(*_str)++;
return INVALID_UNICODE_CODEPOINT;
return SDL_INVALID_UNICODE_CODEPOINT;
}
/*
 * Public entry point: decode one codepoint from `*pstr` and advance it.
 *
 * pstr   pointer to the UTF-8 string pointer; moved past the consumed bytes.
 * pslen  pointer to the number of readable bytes, reduced by the number of
 *        bytes consumed; may be NULL, in which case the string must be
 *        null-terminated.
 *
 * Returns whatever StepUTF8() reports for the bytes made available to it.
 */
Uint32 SDL_StepUTF8(const char **pstr, size_t *pslen)
{
    if (!pslen) {
        // Without a caller-supplied length, only the bytes up to the null
        // terminator are readable. Handing StepUTF8() a fixed length of 4
        // would let it load str[2]/str[3] beyond the terminator when the
        // string ends in a truncated multibyte lead byte, so count the
        // bytes that are really there first.
        const char *str = *pstr;
        size_t avail = 0;
        while ((avail < 4) && (str[avail] != '\0')) { // 4 == max codepoint size.
            avail++;
        }
        return StepUTF8(pstr, avail);
    }

    const char *origstr = *pstr;
    const Uint32 retval = StepUTF8(pstr, *pslen);
    *pslen -= (size_t) (*pstr - origstr); // charge the caller for the bytes consumed.
    return retval;
}
#if (SDL_SIZEOF_WCHAR_T == 2)
static Uint32 SDL_StepUTF16(const Uint16 **_str, const size_t slen)
static Uint32 StepUTF16(const Uint16 **_str, const size_t slen)
{
const Uint16 *str = *_str;
Uint32 cp = (Uint32) *(str++);
if (cp == 0) {
return 0; // don't advance string pointer.
} else if ((cp >= 0xDC00) && (cp <= 0xDFFF)) {
cp = INVALID_UNICODE_CODEPOINT; // Orphaned second half of surrogate pair
cp = SDL_INVALID_UNICODE_CODEPOINT; // Orphaned second half of surrogate pair
} else if ((cp >= 0xD800) && (cp <= 0xDBFF)) { // start of surrogate pair!
const Uint32 pair = (Uint32) *str;
if ((pair == 0) || ((pair < 0xDC00) || (pair > 0xDFFF))) {
cp = INVALID_UNICODE_CODEPOINT;
cp = SDL_INVALID_UNICODE_CODEPOINT;
} else {
str++; // eat the other surrogate.
cp = 0x10000 + (((cp - 0xD800) << 10) | (pair - 0xDC00));
@ -209,10 +249,10 @@ static Uint32 SDL_StepUTF16(const Uint16 **_str, const size_t slen)
}
*_str = str;
return (cp > 0x10FFFF) ? INVALID_UNICODE_CODEPOINT : cp;
return (cp > 0x10FFFF) ? SDL_INVALID_UNICODE_CODEPOINT : cp;
}
#elif (SDL_SIZEOF_WCHAR_T == 4)
static Uint32 SDL_StepUTF32(const Uint32 **_str, const size_t slen)
static Uint32 StepUTF32(const Uint32 **_str, const size_t slen)
{
if (!slen) {
return 0;
@ -225,7 +265,7 @@ static Uint32 SDL_StepUTF32(const Uint32 **_str, const size_t slen)
}
(*_str)++;
return (cp > 0x10FFFF) ? INVALID_UNICODE_CODEPOINT : cp;
return (cp > 0x10FFFF) ? SDL_INVALID_UNICODE_CODEPOINT : cp;
}
#endif
@ -816,7 +856,7 @@ size_t SDL_utf8strlcpy(SDL_OUT_Z_CAP(dst_bytes) char *dst, const char *src, size
size_t SDL_utf8strlen(const char *str)
{
size_t retval = 0;
while (SDL_StepUTF8(&str, 4)) {
while (SDL_StepUTF8(&str, NULL)) {
retval++;
}
return retval;
@ -825,14 +865,9 @@ size_t SDL_utf8strlen(const char *str)
size_t SDL_utf8strnlen(const char *str, size_t bytes)
{
size_t retval = 0;
const char *strstart = str;
while (SDL_StepUTF8(&str, bytes)) {
bytes -= (size_t) (str - strstart);
strstart = str;
while (SDL_StepUTF8(&str, &bytes)) {
retval++;
}
return retval;
}
@ -983,7 +1018,7 @@ char *SDL_strcasestr(const char *haystack, const char *needle)
if (SDL_strncasecmp(haystack, needle, length) == 0) {
return (char *)haystack;
}
} while (SDL_StepUTF8(&haystack, 4)); // move ahead by a full codepoint at a time, regardless of bytes.
} while (SDL_StepUTF8(&haystack, NULL)); // move ahead by a full codepoint at a time, regardless of bytes.
return NULL;
}

View File

@ -25,8 +25,6 @@
// Most things you might need internally in here are public APIs; this is
// just a few special pieces right now.
Uint32 SDL_StepUTF8(const char **_str, const size_t slen);
// this expects `from` to be a Unicode codepoint, and `to` to point to AT LEAST THREE Uint32s.
int SDL_CaseFoldUnicode(const Uint32 from, Uint32 *to);

View File

@ -10,13 +10,6 @@
freely.
*/
/* quiet windows compiler warnings */
#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS
#endif
#include <stdio.h>
#include <SDL3/SDL.h>
#include <SDL3/SDL_main.h>
#include <SDL3/SDL_test.h>
@ -33,6 +26,34 @@ widelen(char *data)
return len;
}
/*
 * Split the next line off an in-memory buffer, in place.
 *
 * Every '\r' seen is overwritten with '\0'. Scanning stops after the first
 * '\n', which is overwritten with '\0' as well. On return, *fdataptr and
 * *fdatalen describe whatever follows the consumed bytes.
 *
 * Returns a pointer to the start of the line (the original *fdataptr), or
 * NULL once *fdatalen is zero. If the data runs out before a '\n' is found,
 * no terminator is written at the end of the scanned bytes.
 */
static char *get_next_line(Uint8 **fdataptr, size_t *fdatalen)
{
    Uint8 *const start = *fdataptr;
    Uint8 *cursor = start;
    size_t remaining = *fdatalen;

    if (remaining == 0) {
        return NULL;
    }

    for (; remaining > 0; cursor++, remaining--) {
        const Uint8 ch = *cursor;
        if (ch == '\n') {
            *cursor = '\0';
            /* consume the newline itself; `break` skips the loop's own step. */
            cursor++;
            remaining--;
            break;
        }
        if (ch == '\r') {
            *cursor = '\0';
        }
    }

    *fdataptr = cursor;
    *fdatalen = remaining;
    return (char *) start;
}
int main(int argc, char *argv[])
{
const char *formats[] = {
@ -51,13 +72,15 @@ int main(int argc, char *argv[])
};
char *fname = NULL;
char buffer[BUFSIZ];
char *ucs4;
char *test[2];
int i;
FILE *file;
int errors = 0;
SDLTest_CommonState *state;
Uint8 *fdata = NULL;
Uint8 *fdataptr = NULL;
char *line = NULL;
size_t fdatalen = 0;
/* Initialize test framework */
state = SDLTest_CommonCreateState(argv, 0);
@ -89,20 +112,19 @@ int main(int argc, char *argv[])
}
fname = GetResourceFilename(fname, "utf8.txt");
file = fopen(fname, "rb");
if (!file) {
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Unable to open %s\n", fname);
fdata = (Uint8 *) (fname ? SDL_LoadFile(fname, &fdatalen) : NULL);
if (!fdata) {
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Unable to load %s\n", fname);
return 1;
}
SDL_free(fname);
while (fgets(buffer, sizeof(buffer), file)) {
fdataptr = fdata;
while ((line = get_next_line(&fdataptr, &fdatalen)) != NULL) {
/* Convert to UCS-4 */
size_t len;
ucs4 =
SDL_iconv_string("UCS-4", "UTF-8", buffer,
SDL_strlen(buffer) + 1);
ucs4 = SDL_iconv_string("UCS-4", "UTF-8", line, SDL_strlen(line) + 1);
len = (widelen(ucs4) + 1) * 4;
for (i = 0; i < SDL_arraysize(formats); ++i) {
test[0] = SDL_iconv_string(formats[i], "UCS-4", ucs4, len);
test[1] = SDL_iconv_string("UCS-4", formats[i], test[0], len);
@ -115,10 +137,44 @@ int main(int argc, char *argv[])
}
test[0] = SDL_iconv_string("UTF-8", "UCS-4", ucs4, len);
SDL_free(ucs4);
(void)fputs(test[0], stdout);
SDL_Log("%s", test[0]);
SDL_free(test[0]);
}
(void)fclose(file);
SDL_free(fdata);
#if 0
{
Uint32 *ucs4buf;
Uint32 *ucs4ptr;
char *utf8out;
Uint32 cp;
SDL_IOStream *io;
fdata = (Uint8 *) (fname ? SDL_LoadFile(fname, &fdatalen) : NULL);
if (!fdata) {
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Unable to load %s\n", fname);
return 1;
}
ucs4buf = (Uint32 *) SDL_malloc(fdatalen * 4);
ucs4ptr = ucs4buf;
fdataptr = fdata;
while ((cp = SDL_StepUTF8((const char **) &fdataptr, &fdatalen)) != 0) {
*(ucs4ptr++) = SDL_Swap32BE(cp);
}
*(ucs4ptr++) = 0;
utf8out = SDL_iconv_string("UTF-8", "UCS-4", (const char *) ucs4buf, (size_t) ((ucs4ptr - ucs4buf)) * 4);
io = SDL_IOFromFile("test_steputf8.txt", "wb");
SDL_WriteIO(io, utf8out, SDL_strlen(utf8out));
SDL_CloseIO(io);
SDL_free(ucs4buf);
SDL_free(utf8out);
SDL_free(fdata);
}
#endif
SDL_free(fname);
SDL_LogInfo(SDL_LOG_CATEGORY_APPLICATION, "Total errors: %d\n", errors);
SDL_Quit();