Add support for the whole BMP in the JSON parser
The previous implementation only supported codepoints up to 0x7f as characters, and all remaining codepoints up to 0xff as integers. The new implementation supports all codepoints in the BMP (Basic Multilingual Plane), i.e. up to 0xffff.
This commit is contained in:
parent
a847d8c3bf
commit
0a3ddf8c0f
|
@ -18,6 +18,8 @@
|
|||
%{
|
||||
#include "json_parser.h"
|
||||
|
||||
#include <util/unicode.h>
|
||||
|
||||
int yyjsonlex();
|
||||
extern char *yyjsontext;
|
||||
extern int yyjsonleng; // really an int, not a size_t
|
||||
|
@ -51,7 +53,7 @@ static std::string convert_TOK_STRING()
|
|||
// \uABCD, i.e. the following four digits are part of this character.
|
||||
assert(p + 4 < yyjsontext + len - 1);
|
||||
std::string hex(++p, 4);
|
||||
result += std::stoi(hex, nullptr, 16);
|
||||
result += codepoint_hex_to_utf8(hex);
|
||||
p += 3;
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -8,11 +8,14 @@ Author: Daniel Kroening, kroening@kroening.com
|
|||
|
||||
#include "unicode.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <locale>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <codecvt>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <iomanip>
|
||||
#include <locale>
|
||||
#include <sstream>
|
||||
|
||||
#include "invariant.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <util/pragma_push.def>
|
||||
|
@ -315,3 +318,33 @@ std::string utf16_native_endian_to_java(const std::wstring &in)
|
|||
utf16_native_endian_to_java(ch, result, loc);
|
||||
return result.str();
|
||||
}
|
||||
|
||||
std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
|
||||
{
|
||||
return utf16_native_endian_to_utf8(std::u16string(1, utf16_char));
|
||||
}
|
||||
|
||||
/// Convert a UTF-16 string to UTF-8 using the standard library's
/// std::codecvt_utf8_utf16 facet via std::wstring_convert.
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str)
{
#ifdef _MSC_VER
  // Workaround for Visual Studio bug, see
  // https://stackoverflow.com/questions/32055357
  // The code units are copied into a wchar_t string and converted from there.
  using code_unitt = wchar_t;
  const std::wstring source(utf16_str.begin(), utf16_str.end());
#else
  using code_unitt = char16_t;
  const std::u16string &source = utf16_str;
#endif

  std::wstring_convert<std::codecvt_utf8_utf16<code_unitt>, code_unitt>
    converter;
  return converter.to_bytes(source);
}
|
||||
|
||||
char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
|
||||
{
|
||||
PRECONDITION(hex.length() == 4);
|
||||
return std::strtol(hex.c_str(), nullptr, 16);
|
||||
}
|
||||
|
||||
std::string codepoint_hex_to_utf8(const std::string &hex)
|
||||
{
|
||||
return utf16_native_endian_to_utf8(codepoint_hex_to_utf16_native_endian(hex));
|
||||
}
|
||||
|
|
|
@ -31,6 +31,26 @@ std::string utf16_native_endian_to_java(const std::wstring &in);
|
|||
|
||||
std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide);
|
||||
|
||||
/// \param utf16_char: UTF-16 character in architecture-native endianness
|
||||
/// encoding
|
||||
/// \return UTF-8 encoding of the same codepoint
|
||||
std::string utf16_native_endian_to_utf8(char16_t utf16_char);
|
||||
|
||||
/// \param utf16_str: UTF-16 string in architecture-native endianness encoding
|
||||
/// \return UTF-8 encoding of the string
|
||||
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str);
|
||||
|
||||
/// \param hex: representation of a BMP codepoint as a four-hex-digit string
|
||||
/// (e.g.\ "0041" for \\u0041)
|
||||
/// \return encoding of the codepoint as a single UTF-16 character in
|
||||
/// architecture-native endianness encoding
|
||||
char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex);
|
||||
|
||||
/// \param hex: representation of a BMP codepoint as a four-hex-digit string
|
||||
/// (e.g.\ "0041" for \\u0041)
|
||||
/// \return UTF-8 encoding of the codepoint
|
||||
std::string codepoint_hex_to_utf8(const std::string &hex);
|
||||
|
||||
template <typename It>
|
||||
std::vector<const char *> to_c_str_array(It b, It e)
|
||||
{
|
||||
|
|
|
@ -84,17 +84,22 @@ SCENARIO("Loading JSON files")
|
|||
}
|
||||
}
|
||||
}
|
||||
GIVEN("A JSON file containing a hexadecimal Unicode character")
|
||||
GIVEN("A JSON file containing hexadecimal Unicode symbols")
|
||||
{
|
||||
temporary_filet unicode_json_file("cbmc_unit_json_parser_unicode", ".json");
|
||||
const std::string unicode_json_path = unicode_json_file();
|
||||
{
|
||||
std::ofstream unicode_json_out(unicode_json_path);
|
||||
unicode_json_out << "{\n"
|
||||
<< " \"special character\": \"\\u0001\"\n"
|
||||
<< " \"one\": \"\\u0001\",\n"
|
||||
<< " \"latin\": \"\\u0042\",\n"
|
||||
<< " \"grave\": \"\\u00E0\",\n"
|
||||
<< " \"trema\": \"\\u00FF\",\n"
|
||||
<< " \"high\": \"\\uFFFF\",\n"
|
||||
<< " \"several\": \"a\\u0041b\\u2FC3\\uFFFF\"\n"
|
||||
<< "}\n";
|
||||
}
|
||||
WHEN("Loading the JSON file with the special character")
|
||||
WHEN("Loading the JSON file with the Unicode symbols")
|
||||
{
|
||||
jsont unicode_json;
|
||||
const auto unicode_parse_error =
|
||||
|
@ -105,9 +110,25 @@ SCENARIO("Loading JSON files")
|
|||
REQUIRE(unicode_json.is_object());
|
||||
|
||||
const json_objectt &json_object = to_json_object(unicode_json);
|
||||
REQUIRE(json_object.find("special character") != json_object.end());
|
||||
REQUIRE(json_object["special character"].value.size() == 1);
|
||||
REQUIRE(json_object["special character"].value == "\u0001");
|
||||
|
||||
REQUIRE(json_object.find("one") != json_object.end());
|
||||
REQUIRE(json_object["one"].value.size() == 1);
|
||||
REQUIRE(json_object["one"].value == u8"\u0001");
|
||||
|
||||
REQUIRE(json_object.find("latin") != json_object.end());
|
||||
REQUIRE(json_object["latin"].value == "B");
|
||||
|
||||
REQUIRE(json_object.find("grave") != json_object.end());
|
||||
REQUIRE(json_object["grave"].value == "à");
|
||||
|
||||
REQUIRE(json_object.find("trema") != json_object.end());
|
||||
REQUIRE(json_object["trema"].value == "ÿ");
|
||||
|
||||
REQUIRE(json_object.find("high") != json_object.end());
|
||||
REQUIRE(json_object["high"].value == u8"\uFFFF");
|
||||
|
||||
REQUIRE(json_object.find("several") != json_object.end());
|
||||
REQUIRE(json_object["several"].value == u8"aAb\u2FC3\uFFFF");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue