Add support for whole BMP in the JSON parser

The previous implementation only supported codepoints up to 0x7f as
characters, and all remaining codepoints up to 0xff as integers.
The new implementation supports all codepoints in the BMP, i.e. up to
0xffff.
This commit is contained in:
Antonia Lechner 2019-04-30 18:10:57 +01:00
parent a847d8c3bf
commit 0a3ddf8c0f
4 changed files with 87 additions and 11 deletions

View File

@ -18,6 +18,8 @@
%{
#include "json_parser.h"
#include <util/unicode.h>
int yyjsonlex();
extern char *yyjsontext;
extern int yyjsonleng; // really an int, not a size_t
@ -51,7 +53,7 @@ static std::string convert_TOK_STRING()
// \uABCD, i.e. the following four digits are part of this character.
assert(p + 4 < yyjsontext + len - 1);
std::string hex(++p, 4);
result += std::stoi(hex, nullptr, 16);
result += codepoint_hex_to_utf8(hex);
p += 3;
break;
}

View File

@ -8,11 +8,14 @@ Author: Daniel Kroening, kroening@kroening.com
#include "unicode.h"
#include <cstring>
#include <locale>
#include <iomanip>
#include <sstream>
#include <codecvt>
#include <cstdint>
#include <cstring>
#include <iomanip>
#include <locale>
#include <sstream>
#include "invariant.h"
#ifdef _WIN32
#include <util/pragma_push.def>
@ -315,3 +318,33 @@ std::string utf16_native_endian_to_java(const std::wstring &in)
utf16_native_endian_to_java(ch, result, loc);
return result.str();
}
std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
{
return utf16_native_endian_to_utf8(std::u16string(1, utf16_char));
}
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str)
{
#ifdef _MSC_VER
// Workaround for Visual Studio bug, see
// https://stackoverflow.com/questions/32055357
std::wstring wide_string(utf16_str.begin(), utf16_str.end());
return std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t>{}
.to_bytes(wide_string);
#else
return std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}
.to_bytes(utf16_str);
#endif
}
char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
{
PRECONDITION(hex.length() == 4);
return std::strtol(hex.c_str(), nullptr, 16);
}
std::string codepoint_hex_to_utf8(const std::string &hex)
{
return utf16_native_endian_to_utf8(codepoint_hex_to_utf16_native_endian(hex));
}

View File

@ -31,6 +31,26 @@ std::string utf16_native_endian_to_java(const std::wstring &in);
std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide);
/// \param utf16_char: UTF-16 character in architecture-native endianness
/// encoding
/// \return UTF-8 encoding of the same codepoint
std::string utf16_native_endian_to_utf8(char16_t utf16_char);
/// \param utf16_str: UTF-16 string in architecture-native endianness encoding
/// \return UTF-8 encoding of the string
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str);
/// \param hex: representation of a BMP codepoint as a four-digit string
/// (e.g.\ "0041" for \\u0041)
/// \return encoding of the codepoint as a single UTF-16 character in
/// architecture-native endianness encoding
char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex);
/// \param hex: representation of a BMP codepoint as a four-digit string
/// (e.g.\ "0041" for \\u0041)
/// \return UTF-8 encoding of the codepoint
std::string codepoint_hex_to_utf8(const std::string &hex);
template <typename It>
std::vector<const char *> to_c_str_array(It b, It e)
{

View File

@ -84,17 +84,22 @@ SCENARIO("Loading JSON files")
}
}
}
GIVEN("A JSON file containing a hexadecimal Unicode character")
GIVEN("A JSON file containing hexadecimal Unicode symbols")
{
temporary_filet unicode_json_file("cbmc_unit_json_parser_unicode", ".json");
const std::string unicode_json_path = unicode_json_file();
{
std::ofstream unicode_json_out(unicode_json_path);
unicode_json_out << "{\n"
<< " \"special character\": \"\\u0001\"\n"
<< " \"one\": \"\\u0001\",\n"
<< " \"latin\": \"\\u0042\",\n"
<< " \"grave\": \"\\u00E0\",\n"
<< " \"trema\": \"\\u00FF\",\n"
<< " \"high\": \"\\uFFFF\",\n"
<< " \"several\": \"a\\u0041b\\u2FC3\\uFFFF\"\n"
<< "}\n";
}
WHEN("Loading the JSON file with the special character")
WHEN("Loading the JSON file with the Unicode symbols")
{
jsont unicode_json;
const auto unicode_parse_error =
@ -105,9 +110,25 @@ SCENARIO("Loading JSON files")
REQUIRE(unicode_json.is_object());
const json_objectt &json_object = to_json_object(unicode_json);
REQUIRE(json_object.find("special character") != json_object.end());
REQUIRE(json_object["special character"].value.size() == 1);
REQUIRE(json_object["special character"].value == "\u0001");
REQUIRE(json_object.find("one") != json_object.end());
REQUIRE(json_object["one"].value.size() == 1);
REQUIRE(json_object["one"].value == u8"\u0001");
REQUIRE(json_object.find("latin") != json_object.end());
REQUIRE(json_object["latin"].value == "B");
REQUIRE(json_object.find("grave") != json_object.end());
REQUIRE(json_object["grave"].value == "à");
REQUIRE(json_object.find("trema") != json_object.end());
REQUIRE(json_object["trema"].value == "ÿ");
REQUIRE(json_object.find("high") != json_object.end());
REQUIRE(json_object["high"].value == u8"\uFFFF");
REQUIRE(json_object.find("several") != json_object.end());
REQUIRE(json_object["several"].value == u8"aAb\u2FC3\uFFFF");
}
}
}