Rewrite Utf16Parser (#14417)
This commit replaces `Utf16Parser` with `<til/unicode.h>`, which includes:
* `til::utf16_iterator` as a replacement for `Utf16Parser::Parse`
* `til::utf16_next` as a replacement for `Utf16Parser::ParseNext`

This fixes 2 bugs with `Utf16Parser`:
* It swallowed invalid surrogate pairs instead of turning them into U+FFFD.
* `Utf16Parser::Parse` returned a `std::vector<std::vector<wchar_t>>`, i.e. one heap-allocated vector per codepoint. It's now >12000% faster.

## Validation Steps Performed
* New unit tests pass ✅
* Searching for narrow/wide characters in conhost works ✅
This commit is contained in:
parent
437b5ac595
commit
8f346a7158
|
@ -5,8 +5,9 @@
|
|||
|
||||
#include "OutputCellIterator.hpp"
|
||||
|
||||
#include <til/unicode.h>
|
||||
|
||||
#include "../../types/inc/convert.hpp"
|
||||
#include "../../types/inc/Utf16Parser.hpp"
|
||||
#include "../../types/inc/GlyphWidth.hpp"
|
||||
#include "../../inc/conattrs.hpp"
|
||||
|
||||
|
@ -392,7 +393,7 @@ OutputCellView OutputCellIterator::s_GenerateView(const std::wstring_view view,
|
|||
const TextAttribute attr,
|
||||
const TextAttributeBehavior behavior)
|
||||
{
|
||||
const auto glyph = Utf16Parser::ParseNext(view);
|
||||
const auto glyph = til::utf16_next(view);
|
||||
const auto dbcsAttr = IsGlyphFullWidth(glyph) ? DbcsAttribute::Leading : DbcsAttribute::Single;
|
||||
return OutputCellView(glyph, dbcsAttr, attr, behavior);
|
||||
}
|
||||
|
|
|
@ -5,8 +5,9 @@
|
|||
|
||||
#include "search.h"
|
||||
|
||||
#include <til/unicode.h>
|
||||
|
||||
#include "textBuffer.hpp"
|
||||
#include "../types/inc/Utf16Parser.hpp"
|
||||
#include "../types/inc/GlyphWidth.hpp"
|
||||
|
||||
using namespace Microsoft::Console::Types;
|
||||
|
@ -192,12 +193,11 @@ bool Search::_FindNeedleInHaystackAt(const til::point pos, til::point& start, ti
|
|||
|
||||
auto bufferPos = pos;
|
||||
|
||||
for (const auto& needleCell : _needle)
|
||||
for (const auto& needleChars : _needle)
|
||||
{
|
||||
// Haystack is the buffer. Needle is the string we were given.
|
||||
const auto hayIter = _uiaData.GetTextBuffer().GetTextDataAt(bufferPos);
|
||||
const auto hayChars = *hayIter;
|
||||
const auto needleChars = std::wstring_view(needleCell.data(), needleCell.size());
|
||||
|
||||
// If we didn't match at any point of the needle, return false.
|
||||
if (!_CompareChars(hayChars, needleChars))
|
||||
|
@ -328,13 +328,12 @@ void Search::_UpdateNextPosition()
|
|||
// - wstr - String that will be our search term
|
||||
// Return Value:
|
||||
// - Structured text data for comparison to screen buffer text data.
|
||||
std::vector<std::vector<wchar_t>> Search::s_CreateNeedleFromString(const std::wstring_view wstr)
|
||||
std::vector<std::wstring> Search::s_CreateNeedleFromString(const std::wstring_view wstr)
|
||||
{
|
||||
const auto charData = Utf16Parser::Parse(wstr);
|
||||
std::vector<std::vector<wchar_t>> cells;
|
||||
for (const auto chars : charData)
|
||||
std::vector<std::wstring> cells;
|
||||
for (const auto& chars : til::utf16_iterator{ wstr })
|
||||
{
|
||||
if (IsGlyphFullWidth(std::wstring_view{ chars.data(), chars.size() }))
|
||||
if (IsGlyphFullWidth(chars))
|
||||
{
|
||||
cells.emplace_back(chars);
|
||||
}
|
||||
|
|
|
@ -68,7 +68,7 @@ private:
|
|||
|
||||
static til::point s_GetInitialAnchor(const Microsoft::Console::Types::IUiaData& uiaData, const Direction dir);
|
||||
|
||||
static std::vector<std::vector<wchar_t>> s_CreateNeedleFromString(const std::wstring_view wstr);
|
||||
static std::vector<std::wstring> s_CreateNeedleFromString(const std::wstring_view wstr);
|
||||
|
||||
bool _reachedEnd = false;
|
||||
til::point _coordNext;
|
||||
|
@ -76,7 +76,7 @@ private:
|
|||
til::point _coordSelEnd;
|
||||
|
||||
const til::point _coordAnchor;
|
||||
const std::vector<std::vector<wchar_t>> _needle;
|
||||
const std::vector<std::wstring> _needle;
|
||||
const Direction _direction;
|
||||
const Sensitivity _sensitivity;
|
||||
Microsoft::Console::Types::IUiaData& _uiaData;
|
||||
|
|
|
@ -6,11 +6,11 @@
|
|||
#include "textBuffer.hpp"
|
||||
|
||||
#include <til/hash.h>
|
||||
#include <til/unicode.h>
|
||||
|
||||
#include "../renderer/base/renderer.hpp"
|
||||
#include "../types/inc/utils.hpp"
|
||||
#include "../types/inc/convert.hpp"
|
||||
#include "../../types/inc/Utf16Parser.hpp"
|
||||
#include "../../types/inc/GlyphWidth.hpp"
|
||||
|
||||
namespace
|
||||
|
@ -2810,16 +2810,14 @@ PointTree TextBuffer::GetPatterns(const til::CoordType firstRow, const til::Coor
|
|||
// match and the previous match, so we use the size of the prefix
|
||||
// along with the size of the match to determine the locations
|
||||
til::CoordType prefixSize = 0;
|
||||
for (const auto parsedGlyph : Utf16Parser::Parse(i->prefix().str()))
|
||||
for (const auto str = i->prefix().str(); const auto& glyph : til::utf16_iterator{ str })
|
||||
{
|
||||
const std::wstring_view glyph{ parsedGlyph.data(), parsedGlyph.size() };
|
||||
prefixSize += IsGlyphFullWidth(glyph) ? 2 : 1;
|
||||
}
|
||||
const auto start = lenUpToThis + prefixSize;
|
||||
til::CoordType matchSize = 0;
|
||||
for (const auto parsedGlyph : Utf16Parser::Parse(i->str()))
|
||||
for (const auto str = i->str(); const auto& glyph : til::utf16_iterator{ str })
|
||||
{
|
||||
const std::wstring_view glyph{ parsedGlyph.data(), parsedGlyph.size() };
|
||||
matchSize += IsGlyphFullWidth(glyph) ? 2 : 1;
|
||||
}
|
||||
const auto end = start + matchSize;
|
||||
|
|
|
@ -7,7 +7,6 @@
|
|||
|
||||
#include "../textBuffer.hpp"
|
||||
#include "../../renderer/inc/DummyRenderer.hpp"
|
||||
#include "../../types/inc/Utf16Parser.hpp"
|
||||
#include "../../types/inc/GlyphWidth.hpp"
|
||||
|
||||
#include <IDataSource.h>
|
||||
|
|
|
@ -10,7 +10,6 @@
|
|||
|
||||
#include <DefaultSettings.h>
|
||||
#include <unicode.hpp>
|
||||
#include <Utf16Parser.hpp>
|
||||
#include <WinUser.h>
|
||||
#include <LibraryResources.h>
|
||||
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
#include "ControlInteractivity.h"
|
||||
#include <DefaultSettings.h>
|
||||
#include <unicode.hpp>
|
||||
#include <Utf16Parser.hpp>
|
||||
#include <Utils.h>
|
||||
#include <LibraryResources.h>
|
||||
#include "../../types/inc/GlyphWidth.hpp"
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
#include "TermControl.h"
|
||||
|
||||
#include <unicode.hpp>
|
||||
#include <Utf16Parser.hpp>
|
||||
#include <LibraryResources.h>
|
||||
|
||||
#include "TermControlAutomationPeer.h"
|
||||
|
|
|
@ -12,7 +12,6 @@
|
|||
#include "../interactivity/inc/ServiceLocator.hpp"
|
||||
#include "../types/inc/Viewport.hpp"
|
||||
#include "../types/inc/convert.hpp"
|
||||
#include "../types/inc/Utf16Parser.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
|
|
|
@ -4,14 +4,14 @@
|
|||
#include "precomp.h"
|
||||
|
||||
#include "conimeinfo.h"
|
||||
#include "conareainfo.h"
|
||||
|
||||
#include <til/unicode.h>
|
||||
|
||||
#include "conareainfo.h"
|
||||
#include "_output.h"
|
||||
#include "dbcs.h"
|
||||
|
||||
#include "../interactivity/inc/ServiceLocator.hpp"
|
||||
#include "../types/inc/GlyphWidth.hpp"
|
||||
#include "../types/inc/Utf16Parser.hpp"
|
||||
|
||||
// Attributes flags:
|
||||
#define COMMON_LVB_GRID_SINGLEFLAG 0x2000 // DBCS: Grid attribute: use for ime cursor.
|
||||
|
@ -223,12 +223,9 @@ std::vector<OutputCell> ConsoleImeInfo::s_ConvertToCells(const std::wstring_view
|
|||
{
|
||||
std::vector<OutputCell> cells;
|
||||
|
||||
// - Convert incoming wchar_t stream into UTF-16 units.
|
||||
const auto glyphs = Utf16Parser::Parse(text);
|
||||
|
||||
// - Walk through all of the grouped up text, match up the correct attribute to it, and make a new cell.
|
||||
size_t attributesUsed = 0;
|
||||
for (const auto& parsedGlyph : glyphs)
|
||||
for (const auto& parsedGlyph : til::utf16_iterator{ text })
|
||||
{
|
||||
const std::wstring_view glyph{ parsedGlyph.data(), parsedGlyph.size() };
|
||||
// Collect up attributes that apply to this glyph range.
|
||||
|
|
|
@ -34,7 +34,6 @@
|
|||
<ClCompile Include="TitleTests.cpp" />
|
||||
<ClCompile Include="UtilsTests.cpp" />
|
||||
<ClCompile Include="Utf8ToWideCharParserTests.cpp" />
|
||||
<ClCompile Include="Utf16ParserTests.cpp" />
|
||||
<ClCompile Include="InputBufferTests.cpp" />
|
||||
<ClCompile Include="ReadWaitTests.cpp" />
|
||||
<ClCompile Include="ViewportTests.cpp" />
|
||||
|
|
|
@ -72,9 +72,6 @@
|
|||
<ClCompile Include="AliasTests.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="Utf16ParserTests.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="SearchTests.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
|
|
|
@ -1,211 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include "precomp.h"
|
||||
#include "WexTestClass.h"
|
||||
#include "../../inc/consoletaeftemplates.hpp"
|
||||
|
||||
#include "../../types/inc/Utf16Parser.hpp"
|
||||
|
||||
using namespace WEX::Common;
|
||||
using namespace WEX::Logging;
|
||||
using namespace WEX::TestExecution;
|
||||
|
||||
static const std::vector<wchar_t> CyrillicChar = { 0x0431 }; // lowercase be
|
||||
static const std::vector<wchar_t> LatinChar = { 0x0061 }; // latin small letter a
|
||||
static const std::vector<wchar_t> FullWidthChar = { 0xFF2D }; // fullwidth latin capital letter m
|
||||
static const std::vector<wchar_t> GaelicChar = { 0x1E41 }; // latin small letter m with dot above
|
||||
static const std::vector<wchar_t> HiraganaChar = { 0x3059 }; // hiragana su
|
||||
static const std::vector<wchar_t> SunglassesEmoji = { 0xD83D, 0xDE0E }; // smiling face with sunglasses emoji
|
||||
|
||||
class Utf16ParserTests
|
||||
{
|
||||
TEST_CLASS(Utf16ParserTests);
|
||||
|
||||
TEST_METHOD(CanParseNonSurrogateText)
|
||||
{
|
||||
const std::vector<std::vector<wchar_t>> expected = { CyrillicChar, LatinChar, FullWidthChar, GaelicChar, HiraganaChar };
|
||||
|
||||
std::wstring wstr;
|
||||
for (const auto& charData : expected)
|
||||
{
|
||||
wstr.push_back(charData.at(0));
|
||||
}
|
||||
|
||||
const auto result = Utf16Parser::Parse(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(expected.size(), result.size());
|
||||
for (size_t i = 0; i < result.size(); ++i)
|
||||
{
|
||||
const auto& sequence = result.at(i);
|
||||
VERIFY_ARE_EQUAL(sequence, expected.at(i));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_METHOD(CanParseSurrogatePairs)
|
||||
{
|
||||
const std::wstring wstr{ SunglassesEmoji.begin(), SunglassesEmoji.end() };
|
||||
const auto result = Utf16Parser::Parse(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(result.size(), 1u);
|
||||
VERIFY_ARE_EQUAL(result.at(0).size(), SunglassesEmoji.size());
|
||||
for (size_t i = 0; i < SunglassesEmoji.size(); ++i)
|
||||
{
|
||||
VERIFY_ARE_EQUAL(result.at(0).at(i), SunglassesEmoji.at(i));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_METHOD(WillDropBadSurrogateCombinations)
|
||||
{
|
||||
// test dropping of invalid leading surrogates
|
||||
std::wstring wstr{ SunglassesEmoji.begin(), SunglassesEmoji.end() };
|
||||
wstr += wstr;
|
||||
wstr.at(1) = SunglassesEmoji.at(0); // wstr contains 3 leading, 1 trailing surrogate sequence
|
||||
|
||||
auto result = Utf16Parser::Parse(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(result.size(), 1u);
|
||||
VERIFY_ARE_EQUAL(result.at(0).size(), SunglassesEmoji.size());
|
||||
for (size_t i = 0; i < SunglassesEmoji.size(); ++i)
|
||||
{
|
||||
VERIFY_ARE_EQUAL(result.at(0).at(i), SunglassesEmoji.at(i));
|
||||
}
|
||||
|
||||
// test dropping of invalid trailing surrogates
|
||||
wstr = { SunglassesEmoji.begin(), SunglassesEmoji.end() };
|
||||
wstr += wstr;
|
||||
wstr.at(0) = SunglassesEmoji.at(1); // wstr contains 2 trailing, 1 leading, 1 trailing surrogate sequence
|
||||
|
||||
result = Utf16Parser::Parse(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(result.size(), 1u);
|
||||
VERIFY_ARE_EQUAL(result.at(0).size(), SunglassesEmoji.size());
|
||||
for (size_t i = 0; i < SunglassesEmoji.size(); ++i)
|
||||
{
|
||||
VERIFY_ARE_EQUAL(result.at(0).at(i), SunglassesEmoji.at(i));
|
||||
}
|
||||
}
|
||||
|
||||
const std::wstring_view Replacement{ &UNICODE_REPLACEMENT, 1 };
|
||||
|
||||
TEST_METHOD(ParseNextLeadOnly)
|
||||
{
|
||||
std::wstring wstr{ SunglassesEmoji.at(0) };
|
||||
|
||||
const auto expected = Replacement;
|
||||
const auto actual = Utf16Parser::ParseNext(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(expected, actual);
|
||||
}
|
||||
|
||||
TEST_METHOD(ParseNextTrailOnly)
|
||||
{
|
||||
std::wstring wstr{ SunglassesEmoji.at(1) };
|
||||
|
||||
const auto expected = Replacement;
|
||||
const auto actual = Utf16Parser::ParseNext(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(expected, actual);
|
||||
}
|
||||
|
||||
TEST_METHOD(ParseNextSingleOnly)
|
||||
{
|
||||
std::wstring wstr{ CyrillicChar.at(0) };
|
||||
|
||||
const auto expected = std::wstring_view{ CyrillicChar.data(), CyrillicChar.size() };
|
||||
const auto actual = Utf16Parser::ParseNext(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(expected, actual);
|
||||
}
|
||||
|
||||
TEST_METHOD(ParseNextLeadLead)
|
||||
{
|
||||
std::wstring wstr{ SunglassesEmoji.at(0) };
|
||||
wstr += SunglassesEmoji.at(0);
|
||||
|
||||
const auto expected = Replacement;
|
||||
const auto actual = Utf16Parser::ParseNext(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(expected, actual);
|
||||
}
|
||||
|
||||
TEST_METHOD(ParseNextLeadTrail)
|
||||
{
|
||||
std::wstring wstr{ SunglassesEmoji.at(0) };
|
||||
wstr += SunglassesEmoji.at(1);
|
||||
|
||||
const auto expected = std::wstring_view{ SunglassesEmoji.data(), SunglassesEmoji.size() };
|
||||
const auto actual = Utf16Parser::ParseNext(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(expected, actual);
|
||||
}
|
||||
|
||||
TEST_METHOD(ParseNextTrailTrail)
|
||||
{
|
||||
std::wstring wstr{ SunglassesEmoji.at(1) };
|
||||
wstr += SunglassesEmoji.at(1);
|
||||
|
||||
const auto expected = Replacement;
|
||||
const auto actual = Utf16Parser::ParseNext(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(expected, actual);
|
||||
}
|
||||
|
||||
TEST_METHOD(ParseNextLeadSingle)
|
||||
{
|
||||
std::wstring wstr{ SunglassesEmoji.at(0) };
|
||||
wstr += LatinChar.at(0);
|
||||
|
||||
const auto expected = std::wstring_view{ LatinChar.data(), LatinChar.size() };
|
||||
const auto actual = Utf16Parser::ParseNext(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(expected, actual);
|
||||
}
|
||||
|
||||
TEST_METHOD(ParseNextTrailSingle)
|
||||
{
|
||||
std::wstring wstr{ SunglassesEmoji.at(1) };
|
||||
wstr += LatinChar.at(0);
|
||||
|
||||
const auto expected = std::wstring_view{ LatinChar.data(), LatinChar.size() };
|
||||
const auto actual = Utf16Parser::ParseNext(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(expected, actual);
|
||||
}
|
||||
|
||||
TEST_METHOD(ParseNextLeadLeadTrail)
|
||||
{
|
||||
std::wstring wstr{ SunglassesEmoji.at(0) };
|
||||
wstr += SunglassesEmoji.at(0);
|
||||
wstr += SunglassesEmoji.at(1);
|
||||
|
||||
const auto expected = std::wstring_view{ SunglassesEmoji.data(), SunglassesEmoji.size() };
|
||||
const auto actual = Utf16Parser::ParseNext(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(expected, actual);
|
||||
}
|
||||
|
||||
TEST_METHOD(ParseNextTrailLeadTrail)
|
||||
{
|
||||
std::wstring wstr{ SunglassesEmoji.at(1) };
|
||||
wstr += SunglassesEmoji.at(0);
|
||||
wstr += SunglassesEmoji.at(1);
|
||||
|
||||
const auto expected = std::wstring_view{ SunglassesEmoji.data(), SunglassesEmoji.size() };
|
||||
const auto actual = Utf16Parser::ParseNext(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(expected, actual);
|
||||
}
|
||||
|
||||
TEST_METHOD(ParseNextSingleLeadTrail)
|
||||
{
|
||||
std::wstring wstr{ GaelicChar.at(0) };
|
||||
wstr += SunglassesEmoji.at(0);
|
||||
wstr += SunglassesEmoji.at(1);
|
||||
|
||||
const auto expected = std::wstring_view{ GaelicChar.data(), GaelicChar.size() };
|
||||
const auto actual = Utf16Parser::ParseNext(wstr);
|
||||
|
||||
VERIFY_ARE_EQUAL(expected, actual);
|
||||
}
|
||||
};
|
|
@ -28,7 +28,6 @@ SOURCES = \
|
|||
ClipboardTests.cpp \
|
||||
SelectionTests.cpp \
|
||||
Utf8ToWideCharParserTests.cpp \
|
||||
Utf16ParserTests.cpp \
|
||||
OutputCellIteratorTests.cpp \
|
||||
InitTests.cpp \
|
||||
TitleTests.cpp \
|
||||
|
|
|
@ -0,0 +1,164 @@
|
|||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace til
|
||||
{
|
||||
namespace details
{
    // U+FFFD REPLACEMENT CHARACTER. Views returned in place of malformed
    // UTF-16 (unpaired surrogates, empty input) point at this constant.
    inline constexpr wchar_t UNICODE_REPLACEMENT{ 0xFFFD };
}
|
||||
|
||||
// Returns true if `wch` is any UTF-16 surrogate code unit (0xD800-0xDFFF),
// i.e. its top five bits are 11011. Only the bits selected by 0xF800 are examined.
//
// Deliberately not `static`: this header-defined function is called from other
// inline functions in this header. With internal linkage each translation unit
// would get its own copy and those inline callers would violate the
// one-definition rule. `constexpr` already makes the function implicitly inline.
constexpr bool is_surrogate(const wchar_t wch) noexcept
{
    return (wch & 0xF800) == 0xD800;
}
|
||||
|
||||
// Returns true if `wch` is a UTF-16 leading (high) surrogate (0xD800-0xDBFF),
// i.e. its top six bits are 110110. Only the bits selected by 0xFC00 are examined.
//
// Deliberately not `static`: internal linkage in a header would make the inline
// functions that call this refer to a different entity in every translation
// unit (a one-definition-rule violation). `constexpr` implies `inline`.
constexpr bool is_leading_surrogate(const wchar_t wch) noexcept
{
    return (wch & 0xFC00) == 0xD800;
}
|
||||
|
||||
// Returns true if `wch` is a UTF-16 trailing (low) surrogate (0xDC00-0xDFFF),
// i.e. its top six bits are 110111. Only the bits selected by 0xFC00 are examined.
//
// Deliberately not `static`: internal linkage in a header would make the inline
// functions that call this refer to a different entity in every translation
// unit (a one-definition-rule violation). `constexpr` implies `inline`.
constexpr bool is_trailing_surrogate(const wchar_t wch) noexcept
{
    return (wch & 0xFC00) == 0xDC00;
}
|
||||
|
||||
// Verifies the beginning of the given UTF16 string and returns the first UTF16 sequence
|
||||
// or U+FFFD otherwise. It's not really useful and at the time of writing only a
|
||||
// single caller uses this. It's best to delete this if you read this comment.
|
||||
constexpr std::wstring_view utf16_next(std::wstring_view wstr) noexcept
|
||||
{
|
||||
auto it = wstr.begin();
|
||||
const auto end = wstr.end();
|
||||
auto ptr = &details::UNICODE_REPLACEMENT;
|
||||
size_t len = 1;
|
||||
|
||||
if (it != end)
|
||||
{
|
||||
const auto wch = *it;
|
||||
ptr = &*it;
|
||||
|
||||
if (is_surrogate(wch))
|
||||
{
|
||||
++it;
|
||||
const auto wch2 = it != end ? *it : wchar_t{};
|
||||
if (is_leading_surrogate(wch) && is_trailing_surrogate(wch2))
|
||||
{
|
||||
len = 2;
|
||||
++it;
|
||||
}
|
||||
else
|
||||
{
|
||||
ptr = &details::UNICODE_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return { ptr, len };
|
||||
}
|
||||
|
||||
// Splits a UTF16 string into codepoints, yielding `wstring_view`s of UTF16 text. Use it as:
|
||||
// for (const auto& str : til::utf16_iterator{ input }) { ... }
|
||||
struct utf16_iterator
|
||||
{
|
||||
struct sentinel
|
||||
{
|
||||
};
|
||||
|
||||
struct iterator
|
||||
{
|
||||
using iterator_category = std::forward_iterator_tag;
|
||||
using value_type = std::wstring_view;
|
||||
using reference = value_type&;
|
||||
using pointer = value_type*;
|
||||
using difference_type = std::ptrdiff_t;
|
||||
|
||||
explicit constexpr iterator(utf16_iterator& p) noexcept :
|
||||
_iter{ p }
|
||||
{
|
||||
}
|
||||
|
||||
const value_type& operator*() const noexcept
|
||||
{
|
||||
return _iter.value();
|
||||
}
|
||||
|
||||
iterator& operator++() noexcept
|
||||
{
|
||||
_iter._advance = true;
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool operator!=(const sentinel&) const noexcept
|
||||
{
|
||||
return _iter.valid();
|
||||
}
|
||||
|
||||
private:
|
||||
utf16_iterator& _iter;
|
||||
};
|
||||
|
||||
explicit constexpr utf16_iterator(std::wstring_view wstr) noexcept :
|
||||
_it{ wstr.begin() }, _end{ wstr.end() }, _advance{ _it != _end }
|
||||
{
|
||||
}
|
||||
|
||||
iterator begin() noexcept
|
||||
{
|
||||
return iterator{ *this };
|
||||
}
|
||||
|
||||
sentinel end() noexcept
|
||||
{
|
||||
return sentinel{};
|
||||
}
|
||||
|
||||
private:
|
||||
bool valid() const noexcept
|
||||
{
|
||||
return _it != _end;
|
||||
}
|
||||
|
||||
void advance() noexcept
|
||||
{
|
||||
const auto wch = *_it;
|
||||
auto ptr = &*_it;
|
||||
size_t len = 1;
|
||||
|
||||
++_it;
|
||||
|
||||
if (is_surrogate(wch))
|
||||
{
|
||||
const auto wch2 = _it != _end ? *_it : wchar_t{};
|
||||
if (is_leading_surrogate(wch) && is_trailing_surrogate(wch2))
|
||||
{
|
||||
len = 2;
|
||||
++_it;
|
||||
}
|
||||
else
|
||||
{
|
||||
ptr = &details::UNICODE_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
|
||||
_value = { ptr, len };
|
||||
_advance = false;
|
||||
}
|
||||
|
||||
const std::wstring_view& value() noexcept
|
||||
{
|
||||
if (_advance)
|
||||
{
|
||||
advance();
|
||||
}
|
||||
return _value;
|
||||
}
|
||||
|
||||
std::wstring_view::iterator _it;
|
||||
std::wstring_view::iterator _end;
|
||||
std::wstring_view _value;
|
||||
bool _advance = true;
|
||||
};
|
||||
}
|
|
@ -2,17 +2,16 @@
|
|||
// Licensed under the MIT license.
|
||||
|
||||
#include "precomp.h"
|
||||
#include <windows.h>
|
||||
#include "terminalInput.hpp"
|
||||
|
||||
#include "strsafe.h"
|
||||
#include <til/unicode.h>
|
||||
#include <strsafe.h>
|
||||
|
||||
#define WIL_SUPPORT_BITOPERATION_PASCAL_NAMES
|
||||
#include <wil/Common.h>
|
||||
|
||||
#include "../../interactivity/inc/VtApiRedirection.hpp"
|
||||
#include "../../inc/unicode.hpp"
|
||||
#include "../../types/inc/Utf16Parser.hpp"
|
||||
|
||||
using namespace Microsoft::Console::VirtualTerminal;
|
||||
|
||||
|
@ -739,7 +738,7 @@ bool TerminalInput::HandleFocus(const bool focused) noexcept
|
|||
// - ch: The UTF-16 character to send.
|
||||
void TerminalInput::_SendChar(const wchar_t ch)
|
||||
{
|
||||
if (Utf16Parser::IsLeadingSurrogate(ch))
|
||||
if (til::is_leading_surrogate(ch))
|
||||
{
|
||||
if (_leadingSurrogate.has_value())
|
||||
{
|
||||
|
|
|
@ -0,0 +1,82 @@
|
|||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include "precomp.h"
|
||||
#include "WexTestClass.h"
|
||||
|
||||
#include <til/unicode.h>
|
||||
|
||||
using namespace WEX::Common;
|
||||
using namespace WEX::Logging;
|
||||
using namespace WEX::TestExecution;
|
||||
|
||||
#define REPLACEMENT L"\xFFFD"
|
||||
#define LEADING L"\xD801"
|
||||
#define TRAILING L"\xDC01"
|
||||
#define PAIR L"\xD801\xDC01"
|
||||
|
||||
class UnicodeTests
|
||||
{
|
||||
TEST_CLASS(UnicodeTests);
|
||||
|
||||
TEST_METHOD(utf16_next)
|
||||
{
|
||||
struct Test
|
||||
{
|
||||
std::wstring_view input;
|
||||
std::wstring_view expected;
|
||||
};
|
||||
|
||||
static constexpr std::array tests{
|
||||
Test{ L"", REPLACEMENT },
|
||||
Test{ L"a", L"a" },
|
||||
Test{ L"abc", L"a" },
|
||||
Test{ L"a" PAIR, L"a" },
|
||||
Test{ L"a" LEADING, L"a" },
|
||||
Test{ L"a" TRAILING, L"a" },
|
||||
Test{ PAIR L"a", PAIR },
|
||||
Test{ LEADING L"a", REPLACEMENT },
|
||||
Test{ TRAILING L"a", REPLACEMENT },
|
||||
};
|
||||
|
||||
for (const auto& t : tests)
|
||||
{
|
||||
const auto actual = til::utf16_next(t.input);
|
||||
VERIFY_ARE_EQUAL(t.expected, actual);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_METHOD(utf16_iterator)
|
||||
{
|
||||
struct Test
|
||||
{
|
||||
std::wstring_view input;
|
||||
til::some<std::wstring_view, 5> expected;
|
||||
};
|
||||
|
||||
static constexpr std::array tests{
|
||||
Test{ L"", {} },
|
||||
Test{ L"a", { L"a" } },
|
||||
Test{ L"abc", { L"a", L"b", L"c" } },
|
||||
Test{ PAIR L"a" PAIR L"b" PAIR, { PAIR, L"a", PAIR, L"b", PAIR } },
|
||||
Test{ LEADING L"a" LEADING L"b" LEADING, { REPLACEMENT, L"a", REPLACEMENT, L"b", REPLACEMENT } },
|
||||
Test{ TRAILING L"a" TRAILING L"b" TRAILING, { REPLACEMENT, L"a", REPLACEMENT, L"b", REPLACEMENT } },
|
||||
Test{ L"a" TRAILING LEADING L"b", { L"a", REPLACEMENT, REPLACEMENT, L"b" } },
|
||||
};
|
||||
|
||||
for (const auto& t : tests)
|
||||
{
|
||||
auto it = t.expected.begin();
|
||||
const auto end = t.expected.end();
|
||||
|
||||
for (const auto& v : til::utf16_iterator{ t.input })
|
||||
{
|
||||
VERIFY_ARE_NOT_EQUAL(end, it);
|
||||
VERIFY_ARE_EQUAL(*it, v);
|
||||
++it;
|
||||
}
|
||||
|
||||
VERIFY_ARE_EQUAL(end, it);
|
||||
}
|
||||
}
|
||||
};
|
|
@ -33,6 +33,7 @@ SOURCES = \
|
|||
StaticMapTests.cpp \
|
||||
string.cpp \
|
||||
u8u16convertTests.cpp \
|
||||
UnicodeTests.cpp \
|
||||
DefaultResource.rc \
|
||||
|
||||
# These tests are disabled because of a missing symbol.
|
||||
|
|
|
@ -35,6 +35,7 @@
|
|||
<ClCompile Include="string.cpp" />
|
||||
<ClCompile Include="throttled_func.cpp" />
|
||||
<ClCompile Include="u8u16convertTests.cpp" />
|
||||
<ClCompile Include="UnicodeTests.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\..\inc\til\at.h" />
|
||||
|
@ -64,6 +65,7 @@
|
|||
<ClInclude Include="..\..\inc\til\throttled_func.h" />
|
||||
<ClInclude Include="..\..\inc\til\ticket_lock.h" />
|
||||
<ClInclude Include="..\..\inc\til\u8u16convert.h" />
|
||||
<ClInclude Include="..\..\inc\til\unicode.h" />
|
||||
<ClInclude Include="..\precomp.h" />
|
||||
</ItemGroup>
|
||||
<ItemDefinitionGroup>
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
<ClCompile Include="string.cpp" />
|
||||
<ClCompile Include="throttled_func.cpp" />
|
||||
<ClCompile Include="u8u16convertTests.cpp" />
|
||||
<ClCompile Include="UnicodeTests.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\precomp.h" />
|
||||
|
@ -110,6 +111,9 @@
|
|||
<ClInclude Include="..\..\inc\til\u8u16convert.h">
|
||||
<Filter>inc</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\..\inc\til\unicode.h">
|
||||
<Filter>inc</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Filter Include="inc">
|
||||
|
|
|
@ -1,91 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include "precomp.h"
|
||||
|
||||
#include "inc/Utf16Parser.hpp"
|
||||
#include "unicode.hpp"
|
||||
|
||||
// Routine Description:
|
||||
// - Finds the next single collection for the codepoint out of the given UTF-16 string information.
|
||||
// - In simpler terms, it will group UTF-16 surrogate pairs into a single unit or give you a valid single-item UTF-16 character.
|
||||
// - Does not validate UTF-16 input beyond proper leading/trailing character sequences.
|
||||
// Arguments:
|
||||
// - wstr - The UTF-16 string to parse.
|
||||
// Return Value:
|
||||
// - A view into the string given of just the next codepoint unit.
|
||||
std::wstring_view Utf16Parser::ParseNext(std::wstring_view wstr) noexcept
|
||||
{
|
||||
for (size_t pos = 0; pos < wstr.size(); ++pos)
|
||||
{
|
||||
const auto wch = wstr.at(pos);
|
||||
|
||||
// If it's a lead and followed directly by a trail, then return the pair.
|
||||
// If it's not followed directly by the trail, go around again and seek forward.
|
||||
if (IsLeadingSurrogate(wch))
|
||||
{
|
||||
// Try to find the next item... if it isn't there, we'll go around again.
|
||||
const auto posNext = pos + 1;
|
||||
if (posNext < wstr.size())
|
||||
{
|
||||
// If we found it and it's trailing, return the pair.
|
||||
const auto wchNext = wstr.at(posNext);
|
||||
if (IsTrailingSurrogate(wchNext))
|
||||
{
|
||||
return wstr.substr(pos, 2);
|
||||
}
|
||||
}
|
||||
// If we missed either if in any way, we'll fall through and go around again searching for more.
|
||||
}
|
||||
// If it's just a trail at this point, go around again and seek forward.
|
||||
else if (IsTrailingSurrogate(wch))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
// If it's neither lead nor trail, then it's < U+10000 and it can be returned as a single wchar_t point.
|
||||
else
|
||||
{
|
||||
return wstr.substr(pos, 1);
|
||||
}
|
||||
}
|
||||
|
||||
// If we get all the way through and there's nothing valid, then this is just a replacement character as it was broken/garbage.
|
||||
return std::wstring_view{ &UNICODE_REPLACEMENT, 1 };
|
||||
}
|
||||
|
||||
// Routine Description:
|
||||
// - formats a utf16 encoded wstring and splits the codepoints into individual collections.
|
||||
// - will drop badly formatted leading/trailing char sequences.
|
||||
// - does not validate utf16 input beyond proper leading/trailing char sequences.
|
||||
// Arguments:
|
||||
// - wstr - the string to parse
|
||||
// Return Value:
|
||||
// - a vector of utf16 codepoints. glyphs that require surrogate pairs will be grouped
|
||||
// together in a vector and codepoints that use only one wchar will be in a vector by themselves.
|
||||
std::vector<std::vector<wchar_t>> Utf16Parser::Parse(std::wstring_view wstr)
|
||||
{
|
||||
std::vector<std::vector<wchar_t>> result;
|
||||
std::vector<wchar_t> sequence;
|
||||
for (const auto wch : wstr)
|
||||
{
|
||||
if (IsLeadingSurrogate(wch))
|
||||
{
|
||||
sequence.clear();
|
||||
sequence.push_back(wch);
|
||||
}
|
||||
else if (IsTrailingSurrogate(wch))
|
||||
{
|
||||
if (!sequence.empty())
|
||||
{
|
||||
sequence.push_back(wch);
|
||||
result.push_back(sequence);
|
||||
sequence.clear();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
result.push_back({ wch });
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
|
@ -1,46 +0,0 @@
|
|||
/*++
|
||||
Copyright (c) Microsoft Corporation
|
||||
|
||||
Module Name:
|
||||
- Utf16Parser.hpp
|
||||
|
||||
Abstract:
|
||||
- Parser for grouping together utf16 codepoints from a string of utf16 encoded text
|
||||
|
||||
Author(s):
|
||||
- Austin Diviness (AustDi) 25-Apr-2018
|
||||
|
||||
--*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
// Static-only helper for grouping the UTF-16 code units of a string into codepoints.
class Utf16Parser final
{
public:
    // Splits `wstr` into one collection of code units per codepoint.
    static std::vector<std::vector<wchar_t>> Parse(std::wstring_view wstr);

    // Returns a view of the next codepoint unit found in `wstr`.
    static std::wstring_view ParseNext(std::wstring_view wstr) noexcept;

    // Routine Description:
    // - tells whether a wchar lies in the utf16 leading surrogate range (0xD800-0xDBFF)
    // Arguments:
    // - wch - the wchar to check
    // Return Value:
    // - true if wch is a leading surrogate, false otherwise
    static constexpr bool IsLeadingSurrogate(const wchar_t wch) noexcept
    {
        return !(wch < 0xD800 || wch > 0xDBFF);
    }

    // Routine Description:
    // - tells whether a wchar lies in the utf16 trailing surrogate range (0xDC00-0xDFFF)
    // Arguments:
    // - wch - the wchar to check
    // Return Value:
    // - true if wch is a trailing surrogate, false otherwise
    static constexpr bool IsTrailingSurrogate(const wchar_t wch) noexcept
    {
        return !(wch < 0xDC00 || wch > 0xDFFF);
    }
};
|
|
@ -30,7 +30,6 @@
|
|||
<ClCompile Include="..\UiaTracing.cpp" />
|
||||
<ClCompile Include="..\TermControlUiaTextRange.cpp" />
|
||||
<ClCompile Include="..\TermControlUiaProvider.cpp" />
|
||||
<ClCompile Include="..\Utf16Parser.cpp" />
|
||||
<ClCompile Include="..\Viewport.cpp" />
|
||||
<ClCompile Include="..\WindowBufferSizeEvent.cpp" />
|
||||
<ClCompile Include="..\precomp.cpp">
|
||||
|
@ -52,7 +51,6 @@
|
|||
<ClInclude Include="..\inc\ThemeUtils.h" />
|
||||
<ClInclude Include="..\inc\utils.hpp" />
|
||||
<ClInclude Include="..\inc\Viewport.hpp" />
|
||||
<ClInclude Include="..\inc\Utf16Parser.hpp" />
|
||||
<ClInclude Include="..\IUiaData.h" />
|
||||
<ClInclude Include="..\IUiaEventDispatcher.h" />
|
||||
<ClInclude Include="..\IUiaTraceable.h" />
|
||||
|
|
|
@ -57,9 +57,6 @@
|
|||
<ClCompile Include="..\GlyphWidth.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Utf16Parser.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\utils.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
|
@ -110,9 +107,6 @@
|
|||
<ClInclude Include="..\inc\CodepointWidthDetector.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\inc\Utf16Parser.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\inc\GlyphWidth.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
|
@ -137,9 +131,6 @@
|
|||
<ClInclude Include="..\inc\Viewport.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\inc\Utf16Parser.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\precomp.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
|
|
|
@ -41,7 +41,6 @@ SOURCES= \
|
|||
..\WindowBufferSizeEvent.cpp \
|
||||
..\convert.cpp \
|
||||
..\colorTable.cpp \
|
||||
..\Utf16Parser.cpp \
|
||||
..\utils.cpp \
|
||||
..\ThemeUtils.cpp \
|
||||
..\ScreenInfoUiaProviderBase.cpp \
|
||||
|
|
Loading…
Reference in New Issue