#include <codecvt>
#include <iomanip>
#include <locale>
#include <sstream>
#include <string>

/*******************************************************************\

Function: utf8_to_utf16_big_endian

  Inputs: String in UTF-8 format

 Outputs: String in UTF-16BE format

 Purpose: Converts the input with std::wstring_convert over
          std::codecvt_utf8_utf16<wchar_t> using the default
          codecvt mode (little_endian flag not set).
          Declared in unicode.h.

          from_bytes() throws std::range_error when the input
          cannot be converted (e.g. malformed UTF-8).

          Note this requires g++-5 libstdc++ / libc++ / MSVC2010+;
          the accompanying .travis.yml change installs
          libstdc++-5-dev for the clang-3.7 build for that reason.

          NOTE(review): whether the codecvt mode changes the byte
          order of the returned wchar_t code units depends on the
          standard library implementation -- confirm on the target
          toolchain.

\*******************************************************************/

std::wstring utf8_to_utf16_big_endian(const std::string& in)
{
  using codecvt_utf8_utf16t=std::codecvt_utf8_utf16<wchar_t>;

  std::wstring_convert<codecvt_utf8_utf16t> converter;
  return converter.from_bytes(in);
}

/*******************************************************************\

Function: utf8_to_utf16_little_endian

  Inputs: String in UTF-8 format

 Outputs: String in UTF-16LE format

 Purpose: Same conversion as utf8_to_utf16_big_endian, but the
          facet is instantiated with std::little_endian.
          Declared in unicode.h.

          from_bytes() throws std::range_error when the input
          cannot be converted (e.g. malformed UTF-8).

          Note this requires g++-5 libstdc++ / libc++ / MSVC2010+

\*******************************************************************/

std::wstring utf8_to_utf16_little_endian(const std::string& in)
{
  constexpr std::codecvt_mode mode=std::codecvt_mode::little_endian;

  // The mode is the third template parameter, so the second one (Maxcode)
  // has to be spelled out; 0x10ffff is its documented default, i.e. the
  // largest value codecvt_utf8_utf16 reads without error.
  // see: http://en.cppreference.com/w/cpp/locale/codecvt_utf8_utf16
  constexpr unsigned long maxcode=0x10ffff;

  using codecvt_utf8_utf16t=std::codecvt_utf8_utf16<wchar_t, maxcode, mode>;

  std::wstring_convert<codecvt_utf8_utf16t> converter;
  return converter.from_bytes(in);
}
/*******************************************************************\

Function: utf16_little_endian_to_ascii

  Inputs: String in UTF-16LE format

 Outputs: String in US-ASCII format, with \uxxxx escapes for other
          characters

 Purpose: Copies printable US-ASCII code units (0x20..0x7e) through
          unchanged and writes every other code unit as "\u"
          followed by its value in lower-case hexadecimal,
          zero-padded to at least four digits.
          Declared in unicode.h.

          The result does not depend on the global locale: the
          printable range is tested explicitly and the stream is
          imbued with the classic locale, so no byte above 0x7e
          and no digit-grouping separator can reach the output.

          A backslash in the input is copied through as-is.

\*******************************************************************/

std::string utf16_little_endian_to_ascii(const std::wstring& in)
{
  std::ostringstream result;

  // A stream picks up the global locale on construction; pin it to "C" so
  // that numeric grouping can never be inserted into the hex digits.
  result.imbue(std::locale::classic());

  // Both manipulators are sticky; only the field width is per-insertion.
  result << std::hex << std::setfill('0');

  for(const auto c : in)
  {
    // wchar_t may be signed; work on the unsigned value throughout.
    const auto code=static_cast<unsigned int>(c);

    if(code>=0x20 && code<=0x7e)
      result << static_cast<char>(code);
    else
      result << "\\u" << std::setw(4) << code;
  }

  return result.str();
}