Merge pull request #2543 from tautschnig/vs-unsigned-byte-swap

Explicit unsigned -> uint16_t casts to avoid conversion warnings
This commit is contained in:
Daniel Kroening 2018-07-24 08:54:09 +01:00 committed by GitHub
commit c7457fbe6f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 75 additions and 110 deletions

View File

@ -202,7 +202,7 @@ std::string expr2javat::convert_constant(
if(to_integer(src, int_value))
UNREACHABLE;
dest += "(char)'" + utf16_little_endian_to_java(int_value.to_long()) + '\'';
dest += "(char)'" + utf16_native_endian_to_java(int_value.to_long()) + '\'';
return dest;
}
else if(src.type()==java_byte_type())

View File

@ -106,7 +106,7 @@ symbol_exprt get_or_create_string_literal_symbol(
if(string_refinement_enabled)
{
const array_exprt data =
utf16_to_array(utf8_to_utf16(id2string(value), false));
utf16_to_array(utf8_to_utf16_native_endian(id2string(value)));
struct_exprt literal_init(new_symbol.type);
literal_init.operands().resize(jls_struct.components().size());

View File

@ -34,7 +34,7 @@ std::basic_string<unsigned int> convert_one_string_literal(
unescape_wide_string(std::string(src, 3, src.size()-4));
// turn into utf-8
std::string utf8_value=utf32_to_utf8(value);
const std::string utf8_value = utf32_native_endian_to_utf8(value);
// pad into wide string
value.resize(utf8_value.size());

View File

@ -23,7 +23,7 @@ static void append_universal_char(
std::basic_string<unsigned int> value_str(1, value);
// turn into utf-8
std::string utf8_value=utf32_to_utf8(value_str);
const std::string utf8_value = utf32_native_endian_to_utf8(value_str);
dest.append(utf8_value);
}

View File

@ -67,7 +67,7 @@ int make_identifier()
utf32+=letter;
// turn into utf-8
std::string utf8_value=utf32_to_utf8(utf32);
const std::string utf8_value = utf32_native_endian_to_utf8(utf32);
final_base_name+=utf8_value;
}
else

View File

@ -444,7 +444,7 @@ utf16_constant_array_to_java(const array_exprt &arr, std::size_t length)
INVARIANT(!conversion_failed, "constant should be convertible to unsigned");
out[i]=c;
}
return utf16_little_endian_to_java(out);
return utf16_native_endian_to_java(out);
}
/// Formatted string using a format string and list of arguments

View File

@ -95,7 +95,7 @@ void delete_directory_utf16(const std::wstring &path)
void delete_directory(const std::string &path)
{
#ifdef _WIN32
delete_directory_utf16(utf8_to_utf16_little_endian(path));
delete_directory_utf16(utf8_to_utf16_native_endian(path));
#else
DIR *dir=opendir(path.c_str());
if(dir!=nullptr)

View File

@ -18,16 +18,6 @@ Author: Daniel Kroening, kroening@kroening.com
#include <windows.h>
#endif
/// Determine endianness of the architecture
/// \return True if the architecture is little_endian
bool is_little_endian_arch()
{
uint32_t i=1;
return reinterpret_cast<uint8_t &>(i) != 0;
}
#define BUFSIZE 100
std::string narrow(const wchar_t *s)
{
#ifdef _WIN32
@ -138,9 +128,10 @@ static void utf8_append_code(unsigned int c, std::string &result)
}
}
/// \param utf32:encoded wide string
/// \param s UTF-32 encoded wide string
/// \return utf8-encoded string with the same unicode characters as the input.
std::string utf32_to_utf8(const std::basic_string<unsigned int> &s)
std::string
utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s)
{
std::string result;
@ -166,27 +157,15 @@ std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
return argv_narrow;
}
/// A helper function for dealing with different UTF16 endians
/// \par parameters: A 16-bit integer
/// \return A 16-bit integer with bytes swapped
uint16_t do_swap_bytes(uint16_t x)
{
uint16_t b1=x & 0xFF;
uint16_t b2=x & 0xFF00;
return (b1 << 8) | (b2 >> 8);
}
void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result)
static void utf16_append_code(unsigned int code, std::wstring &result)
{
// we do not treat 0xD800 to 0xDFFF, although
// they are not valid unicode symbols
if(code<0xFFFF)
{ // code is encoded as one UTF16 character
// we just take the code and possibly swap the bytes
unsigned int a=(swap_bytes)?do_swap_bytes(code):code;
result+=static_cast<wchar_t>(a);
{
// code is encoded as one UTF16 character
result += static_cast<wchar_t>(code);
}
else // code is encoded as two UTF16 characters
{
@ -194,23 +173,21 @@ void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result)
// code<0x10FFFF
// but let's not check it programmatically
// encode the code in UTF16, possibly swapping bytes.
// encode the code in UTF16
code=code-0x10000;
unsigned int i1=((code>>10) & 0x3ff) | 0xD800;
unsigned int a1=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i1)):i1;
result+=static_cast<wchar_t>(a1);
unsigned int i2=(code & 0x3ff) | 0xDC00;
unsigned int a2=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i2)):i2;
result+=static_cast<wchar_t>(a2);
const uint16_t i1 = static_cast<uint16_t>(((code >> 10) & 0x3ff) | 0xD800);
result += static_cast<wchar_t>(i1);
const uint16_t i2 = static_cast<uint16_t>((code & 0x3ff) | 0xDC00);
result += static_cast<wchar_t>(i2);
}
}
/// \par parameters: String in UTF-8 format, bool value indicating whether the
/// endianness should be different from the architecture one.
/// Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
/// \par parameters: String in UTF-8 format
/// \return String in UTF-16 format, encoded with the endianness of the
/// architecture.
std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
std::wstring utf8_to_utf16_native_endian(const std::string &in)
{
std::wstring result;
result.reserve(in.size());
@ -263,33 +240,17 @@ std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
code=32;
}
utf16_append_code(code, swap_bytes, result);
utf16_append_code(code, result);
}
return result;
}
/// \par parameters: String in UTF-8 format
/// \return String in UTF-16BE format
std::wstring utf8_to_utf16_big_endian(const std::string &in)
{
bool swap_bytes=is_little_endian_arch();
return utf8_to_utf16(in, swap_bytes);
}
/// \par parameters: String in UTF-8 format
/// \return String in UTF-16LE format
std::wstring utf8_to_utf16_little_endian(const std::string &in)
{
bool swap_bytes=!is_little_endian_arch();
return utf8_to_utf16(in, swap_bytes);
}
/// \param ch: UTF-16LE character
/// \param ch: UTF-16 character in architecture-native endianness encoding
/// \param result: stream to receive string in US-ASCII format, with \\uxxxx
/// escapes for other characters
/// \param loc: locale to check for printable characters
static void utf16_little_endian_to_java(
static void utf16_native_endian_to_java(
const wchar_t ch,
std::ostringstream &result,
const std::locale &loc)
@ -326,23 +287,23 @@ static void utf16_little_endian_to_java(
}
}
/// \param ch: UTF-16LE character
/// \param ch: UTF-16 character in architecture-native endianness encoding
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
std::string utf16_little_endian_to_java(const wchar_t ch)
std::string utf16_native_endian_to_java(const wchar_t ch)
{
std::ostringstream result;
const std::locale loc;
utf16_little_endian_to_java(ch, result, loc);
utf16_native_endian_to_java(ch, result, loc);
return result.str();
}
/// \param in: String in UTF-16LE format
/// \param in: String in UTF-16 (native endianness) format
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
std::string utf16_little_endian_to_java(const std::wstring &in)
std::string utf16_native_endian_to_java(const std::wstring &in)
{
std::ostringstream result;
const std::locale loc;
for(const auto ch : in)
utf16_little_endian_to_java(ch, result, loc);
utf16_native_endian_to_java(ch, result, loc);
return result.str();
}

View File

@ -22,13 +22,12 @@ std::wstring widen(const char *s);
std::string narrow(const std::wstring &s);
std::wstring widen(const std::string &s);
std::string utf32_to_utf8(const std::basic_string<unsigned int> &s);
std::string
utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s);
std::wstring utf8_to_utf16(const std::string &in, bool swap_bytes);
std::wstring utf8_to_utf16_big_endian(const std::string &);
std::wstring utf8_to_utf16_little_endian(const std::string &);
std::string utf16_little_endian_to_java(const wchar_t ch);
std::string utf16_little_endian_to_java(const std::wstring &in);
std::wstring utf8_to_utf16_native_endian(const std::string &in);
std::string utf16_native_endian_to_java(const wchar_t ch);
std::string utf16_native_endian_to_java(const std::wstring &in);
std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide);

View File

@ -13,7 +13,6 @@ list(REMOVE_ITEM sources
${CMAKE_CURRENT_SOURCE_DIR}/json.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp_parser.cpp
${CMAKE_CURRENT_SOURCE_DIR}/osx_fat_reader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/unicode.cpp
${CMAKE_CURRENT_SOURCE_DIR}/wp.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp_scanner.cpp
${CMAKE_CURRENT_SOURCE_DIR}/float_utils.cpp

View File

@ -37,6 +37,7 @@ SRC += unit_tests.cpp \
util/string_utils/split_string.cpp \
util/string_utils/strip_string.cpp \
util/symbol_table.cpp \
util/unicode.cpp \
catch_example.cpp \
# Empty last line

View File

@ -6,20 +6,22 @@ Author: Vojtech Forejt, forejtv@diffblue.com
\*******************************************************************/
#include <cassert>
#include <testing-utils/catch.hpp>
#include <vector>
#include <string>
#include <codecvt>
#include <iomanip>
#include <iostream>
#include <locale>
#include <util/unicode.h>
// the u8 prefix is only available from VS 2015 onwards
#if !defined(_MSC_VER) || _MSC_VER >= 1900
// This unit test compares our implementation with the codecvt implementation,
// checking bit-by-bit equivalence of results.
bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b)
static bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b)
{
if(a.size() != b.size())
return false;
@ -35,7 +37,10 @@ bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b)
}
// helper print function, can be called for debugging problems
void wstr_print(const std::wstring &a, const std::wstring &b)
#if 0
#include <iostream>
static void wstr_print(const std::wstring &a, const std::wstring &b)
{
int endi=(a.size()>b.size())?a.size():b.size();
const unsigned char
@ -49,46 +54,46 @@ void wstr_print(const std::wstring &a, const std::wstring &b)
}
std::cout << '\n';
}
#endif
void compare_utf8_to_utf16_big_endian(std::string& in)
static bool compare_utf8_to_utf16(const std::string &in)
{
std::wstring s1=utf8_to_utf16_big_endian(in);
const std::wstring s1 = utf8_to_utf16_native_endian(in);
typedef std::codecvt_utf8_utf16<wchar_t> codecvt_utf8_utf16t;
std::wstring_convert<codecvt_utf8_utf16t> converter;
std::wstring s2=converter.from_bytes(in);
assert(paranoid_wstr_equals(s1, s2));
return paranoid_wstr_equals(s1, s2);
}
void compare_utf8_to_utf16_little_endian(std::string& in)
TEST_CASE("unicode0", "[core][util][unicode]")
{
std::wstring s1=utf8_to_utf16_little_endian(in);
const std::codecvt_mode mode=std::codecvt_mode::little_endian;
const unsigned long maxcode=0x10ffff;
typedef std::codecvt_utf8_utf16<wchar_t, maxcode, mode> codecvt_utf8_utf16t;
std::wstring_convert<codecvt_utf8_utf16t> converter;
std::wstring s2=converter.from_bytes(in);
assert(paranoid_wstr_equals(s1, s2));
const std::string s = u8"abc";
REQUIRE(compare_utf8_to_utf16(s));
}
int main()
TEST_CASE("unicode1", "[core][util][unicode]")
{
std::string s;
s=u8"\u0070\u00DF\u00E0\u00EF\u00F0\u00F7\u00F8";
compare_utf8_to_utf16_big_endian(s);
compare_utf8_to_utf16_little_endian(s);
s=u8"$¢€𐍈";
compare_utf8_to_utf16_big_endian(s);
compare_utf8_to_utf16_little_endian(s);
s=u8"𐐏𤭢";
compare_utf8_to_utf16_big_endian(s);
compare_utf8_to_utf16_little_endian(s);
s=u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ";
compare_utf8_to_utf16_big_endian(s);
compare_utf8_to_utf16_little_endian(s);
const std::string s = u8"\u0070\u00DF\u00E0\u00EF\u00F0\u00F7\u00F8";
REQUIRE(compare_utf8_to_utf16(s));
}
TEST_CASE("unicode2", "[core][util][unicode]")
{
const std::string s = u8"$¢€𐍈";
REQUIRE(compare_utf8_to_utf16(s));
}
TEST_CASE("unicode3", "[core][util][unicode]")
{
const std::string s = u8"𐐏𤭢";
REQUIRE(compare_utf8_to_utf16(s));
}
TEST_CASE("unicode4", "[core][util][unicode]")
{
const std::string s = u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ";
REQUIRE(compare_utf8_to_utf16(s));
}
#endif