Merge pull request #541 from smowton/string-refine-unicode

utf8 to utf16 conversion and utf16 to ascii
This commit is contained in:
Daniel Kroening 2017-03-01 07:18:11 -05:00 committed by GitHub
commit d1e691e7ed
3 changed files with 85 additions and 0 deletions

View File

@ -28,6 +28,7 @@ matrix:
packages:
- libwww-perl
- clang-3.7
- libstdc++-5-dev
- libubsan0
before_install:
- mkdir bin ; ln -s /usr/bin/clang-3.7 bin/gcc

View File

@ -7,6 +7,10 @@ Author: Daniel Kroening, kroening@kroening.com
\*******************************************************************/
#include <cstring>
#include <locale>
#include <codecvt>
#include <iomanip>
#include <sstream>
#include "unicode.h"
@ -258,3 +262,79 @@ const char **narrow_argv(int argc, const wchar_t **argv_wide)
return argv_narrow;
}
/*******************************************************************\

Function: utf8_to_utf16_big_endian

  Inputs: Byte string encoded as UTF-8

 Outputs: Wide string holding the UTF-16 code units produced by
          std::codecvt_utf8_utf16<wchar_t> in its default mode
          (described by the original author as UTF-16BE)

 Purpose: Decode UTF-8 into UTF-16. std::wstring_convert::from_bytes
          reports malformed input by throwing std::range_error.
          Needs <codecvt>, i.e. g++-5 libstdc++ / libc++ / MSVC2010+

\*******************************************************************/

std::wstring utf8_to_utf16_big_endian(const std::string& in)
{
  // Default template arguments: maximum code point 0x10ffff, no mode flags.
  using default_mode_facett=std::codecvt_utf8_utf16<wchar_t>;
  return std::wstring_convert<default_mode_facett>().from_bytes(in);
}
/*******************************************************************\

Function: utf8_to_utf16_little_endian

  Inputs: Byte string encoded as UTF-8

 Outputs: Wide string holding the UTF-16 code units produced by
          std::codecvt_utf8_utf16<wchar_t> with the little_endian
          mode flag set (described by the original author as UTF-16LE)

 Purpose: Decode UTF-8 into UTF-16. std::wstring_convert::from_bytes
          reports malformed input by throwing std::range_error.
          Needs <codecvt>, i.e. g++-5 libstdc++ / libc++ / MSVC2010+

\*******************************************************************/

std::wstring utf8_to_utf16_little_endian(const std::string& in)
{
  // The mode flag is the third template parameter, so the second one has to
  // be spelled out as well. 0x10ffff is the facet's own default, see
  // http://en.cppreference.com/w/cpp/locale/codecvt_utf8_utf16
  using little_endian_facett=
    std::codecvt_utf8_utf16<wchar_t, 0x10fffful, std::little_endian>;

  std::wstring_convert<little_endian_facett> utf8_decoder;
  return utf8_decoder.from_bytes(in);
}
/*******************************************************************\

Function: utf16_little_endian_to_ascii

  Inputs: Wide string of UTF-16 code units

 Outputs: String in US-ASCII format: printable ASCII code units are
          copied through unchanged, every other code unit is written
          as a \uxxxx escape (lower-case hex, zero-padded to at
          least four digits)

 Purpose: Render a UTF-16 string using 7-bit characters only.
          Classification and number formatting are pinned to the
          classic "C" locale, so the result does not depend on the
          global locale and never contains bytes above 0x7f.
          Backslashes in the input are not escaped.

\*******************************************************************/

std::string utf16_little_endian_to_ascii(const std::wstring& in)
{
  const std::locale &c_locale=std::locale::classic();

  std::ostringstream result;
  // A global locale with digit grouping would otherwise insert separators
  // into the hex digits of the escapes.
  result.imbue(c_locale);
  result << std::hex << std::setfill('0');

  for(const auto c : in)
  {
    // wchar_t may be signed; work on the unsigned value throughout.
    const unsigned int code_unit=static_cast<unsigned int>(c);

    // Only 0..127 is ASCII. The range test comes first so the narrowing
    // cast to char below is always value-preserving.
    if(code_unit<=0x7f &&
       std::isprint(static_cast<char>(code_unit), c_locale))
    {
      result << static_cast<char>(code_unit);
    }
    else
    {
      // setw only applies to the next insertion, so it is repeated here.
      result << "\\u" << std::setw(4) << code_unit;
    }
  }

  return result.str();
}

View File

@ -22,6 +22,10 @@ std::wstring widen(const std::string &s);
std::string utf32_to_utf8(const std::basic_string<unsigned int> &s);
std::string utf16_to_utf8(const std::basic_string<unsigned short int> &s);
// Decode a UTF-8 byte string into a wide string of UTF-16 code units
// (codecvt_utf8_utf16 facet in its default mode).
std::wstring utf8_to_utf16_big_endian(const std::string &);
// Same conversion with the facet's little_endian mode flag set.
std::wstring utf8_to_utf16_little_endian(const std::string &);
// Copy printable characters through and write everything else as a
// \uxxxx hex escape.
std::string utf16_little_endian_to_ascii(const std::wstring &in);
const char **narrow_argv(int argc, const wchar_t **argv_wide);
#endif // CPROVER_UTIL_UNICODE_H