Rewrite Utf16Parser (#14417)

This commit replaces `Utf16Parser` with `<til/unicode.h>` which includes:
* `til::utf16_iterator` as a replacement for `Utf16Parser::Parse`
* `til::utf16_next` as a replacement for `Utf16Parser::ParseNext`

This fixes 2 bugs with `Utf16Parser`:
* Swallowing invalid surrogate pairs instead of turning them into U+FFFD.
* Returning `std::vector<std::vector<wchar_t>>`, i.e. one separately allocated vector per code point. It's now >12000% faster.

## Validation Steps Performed
* New unit tests pass 
* Searching for narrow/wide characters in conhost works 
This commit is contained in:
Leonard Hecker 2022-11-23 22:13:36 +01:00 committed by GitHub
parent 437b5ac595
commit 8f346a7158
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
25 changed files with 275 additions and 398 deletions

View File

@ -5,8 +5,9 @@
#include "OutputCellIterator.hpp"
#include <til/unicode.h>
#include "../../types/inc/convert.hpp"
#include "../../types/inc/Utf16Parser.hpp"
#include "../../types/inc/GlyphWidth.hpp"
#include "../../inc/conattrs.hpp"
@ -392,7 +393,7 @@ OutputCellView OutputCellIterator::s_GenerateView(const std::wstring_view view,
const TextAttribute attr,
const TextAttributeBehavior behavior)
{
const auto glyph = Utf16Parser::ParseNext(view);
const auto glyph = til::utf16_next(view);
const auto dbcsAttr = IsGlyphFullWidth(glyph) ? DbcsAttribute::Leading : DbcsAttribute::Single;
return OutputCellView(glyph, dbcsAttr, attr, behavior);
}

View File

@ -5,8 +5,9 @@
#include "search.h"
#include <til/unicode.h>
#include "textBuffer.hpp"
#include "../types/inc/Utf16Parser.hpp"
#include "../types/inc/GlyphWidth.hpp"
using namespace Microsoft::Console::Types;
@ -192,12 +193,11 @@ bool Search::_FindNeedleInHaystackAt(const til::point pos, til::point& start, ti
auto bufferPos = pos;
for (const auto& needleCell : _needle)
for (const auto& needleChars : _needle)
{
// Haystack is the buffer. Needle is the string we were given.
const auto hayIter = _uiaData.GetTextBuffer().GetTextDataAt(bufferPos);
const auto hayChars = *hayIter;
const auto needleChars = std::wstring_view(needleCell.data(), needleCell.size());
// If we didn't match at any point of the needle, return false.
if (!_CompareChars(hayChars, needleChars))
@ -328,13 +328,12 @@ void Search::_UpdateNextPosition()
// - wstr - String that will be our search term
// Return Value:
// - Structured text data for comparison to screen buffer text data.
std::vector<std::vector<wchar_t>> Search::s_CreateNeedleFromString(const std::wstring_view wstr)
std::vector<std::wstring> Search::s_CreateNeedleFromString(const std::wstring_view wstr)
{
const auto charData = Utf16Parser::Parse(wstr);
std::vector<std::vector<wchar_t>> cells;
for (const auto chars : charData)
std::vector<std::wstring> cells;
for (const auto& chars : til::utf16_iterator{ wstr })
{
if (IsGlyphFullWidth(std::wstring_view{ chars.data(), chars.size() }))
if (IsGlyphFullWidth(chars))
{
cells.emplace_back(chars);
}

View File

@ -68,7 +68,7 @@ private:
static til::point s_GetInitialAnchor(const Microsoft::Console::Types::IUiaData& uiaData, const Direction dir);
static std::vector<std::vector<wchar_t>> s_CreateNeedleFromString(const std::wstring_view wstr);
static std::vector<std::wstring> s_CreateNeedleFromString(const std::wstring_view wstr);
bool _reachedEnd = false;
til::point _coordNext;
@ -76,7 +76,7 @@ private:
til::point _coordSelEnd;
const til::point _coordAnchor;
const std::vector<std::vector<wchar_t>> _needle;
const std::vector<std::wstring> _needle;
const Direction _direction;
const Sensitivity _sensitivity;
Microsoft::Console::Types::IUiaData& _uiaData;

View File

@ -6,11 +6,11 @@
#include "textBuffer.hpp"
#include <til/hash.h>
#include <til/unicode.h>
#include "../renderer/base/renderer.hpp"
#include "../types/inc/utils.hpp"
#include "../types/inc/convert.hpp"
#include "../../types/inc/Utf16Parser.hpp"
#include "../../types/inc/GlyphWidth.hpp"
namespace
@ -2810,16 +2810,14 @@ PointTree TextBuffer::GetPatterns(const til::CoordType firstRow, const til::Coor
// match and the previous match, so we use the size of the prefix
// along with the size of the match to determine the locations
til::CoordType prefixSize = 0;
for (const auto parsedGlyph : Utf16Parser::Parse(i->prefix().str()))
for (const auto str = i->prefix().str(); const auto& glyph : til::utf16_iterator{ str })
{
const std::wstring_view glyph{ parsedGlyph.data(), parsedGlyph.size() };
prefixSize += IsGlyphFullWidth(glyph) ? 2 : 1;
}
const auto start = lenUpToThis + prefixSize;
til::CoordType matchSize = 0;
for (const auto parsedGlyph : Utf16Parser::Parse(i->str()))
for (const auto str = i->str(); const auto& glyph : til::utf16_iterator{ str })
{
const std::wstring_view glyph{ parsedGlyph.data(), parsedGlyph.size() };
matchSize += IsGlyphFullWidth(glyph) ? 2 : 1;
}
const auto end = start + matchSize;

View File

@ -7,7 +7,6 @@
#include "../textBuffer.hpp"
#include "../../renderer/inc/DummyRenderer.hpp"
#include "../../types/inc/Utf16Parser.hpp"
#include "../../types/inc/GlyphWidth.hpp"
#include <IDataSource.h>

View File

@ -10,7 +10,6 @@
#include <DefaultSettings.h>
#include <unicode.hpp>
#include <Utf16Parser.hpp>
#include <WinUser.h>
#include <LibraryResources.h>

View File

@ -5,7 +5,6 @@
#include "ControlInteractivity.h"
#include <DefaultSettings.h>
#include <unicode.hpp>
#include <Utf16Parser.hpp>
#include <Utils.h>
#include <LibraryResources.h>
#include "../../types/inc/GlyphWidth.hpp"

View File

@ -5,7 +5,6 @@
#include "TermControl.h"
#include <unicode.hpp>
#include <Utf16Parser.hpp>
#include <LibraryResources.h>
#include "TermControlAutomationPeer.h"

View File

@ -12,7 +12,6 @@
#include "../interactivity/inc/ServiceLocator.hpp"
#include "../types/inc/Viewport.hpp"
#include "../types/inc/convert.hpp"
#include "../types/inc/Utf16Parser.hpp"
#include <algorithm>
#include <iterator>

View File

@ -4,14 +4,14 @@
#include "precomp.h"
#include "conimeinfo.h"
#include "conareainfo.h"
#include <til/unicode.h>
#include "conareainfo.h"
#include "_output.h"
#include "dbcs.h"
#include "../interactivity/inc/ServiceLocator.hpp"
#include "../types/inc/GlyphWidth.hpp"
#include "../types/inc/Utf16Parser.hpp"
// Attributes flags:
#define COMMON_LVB_GRID_SINGLEFLAG 0x2000 // DBCS: Grid attribute: use for ime cursor.
@ -223,12 +223,9 @@ std::vector<OutputCell> ConsoleImeInfo::s_ConvertToCells(const std::wstring_view
{
std::vector<OutputCell> cells;
// - Convert incoming wchar_t stream into UTF-16 units.
const auto glyphs = Utf16Parser::Parse(text);
// - Walk through all of the grouped up text, match up the correct attribute to it, and make a new cell.
size_t attributesUsed = 0;
for (const auto& parsedGlyph : glyphs)
for (const auto& parsedGlyph : til::utf16_iterator{ text })
{
const std::wstring_view glyph{ parsedGlyph.data(), parsedGlyph.size() };
// Collect up attributes that apply to this glyph range.

View File

@ -34,7 +34,6 @@
<ClCompile Include="TitleTests.cpp" />
<ClCompile Include="UtilsTests.cpp" />
<ClCompile Include="Utf8ToWideCharParserTests.cpp" />
<ClCompile Include="Utf16ParserTests.cpp" />
<ClCompile Include="InputBufferTests.cpp" />
<ClCompile Include="ReadWaitTests.cpp" />
<ClCompile Include="ViewportTests.cpp" />

View File

@ -72,9 +72,6 @@
<ClCompile Include="AliasTests.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="Utf16ParserTests.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="SearchTests.cpp">
<Filter>Source Files</Filter>
</ClCompile>

View File

@ -1,211 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include "precomp.h"
#include "WexTestClass.h"
#include "../../inc/consoletaeftemplates.hpp"
#include "../../types/inc/Utf16Parser.hpp"
using namespace WEX::Common;
using namespace WEX::Logging;
using namespace WEX::TestExecution;
// Sample code points used by the tests below, stored as UTF-16 code unit sequences.
static const std::vector<wchar_t> CyrillicChar = { 0x0431 }; // cyrillic small letter be
static const std::vector<wchar_t> LatinChar = { 0x0061 }; // latin small letter a
static const std::vector<wchar_t> FullWidthChar = { 0xFF2D }; // fullwidth latin capital letter m
static const std::vector<wchar_t> GaelicChar = { 0x1E41 }; // latin small letter m with dot above
static const std::vector<wchar_t> HiraganaChar = { 0x3059 }; // hiragana letter su
static const std::vector<wchar_t> SunglassesEmoji = { 0xD83D, 0xDE0E }; // smiling face with sunglasses emoji (surrogate pair)
// Unit tests for Utf16Parser::Parse (splits a whole string into code point groups)
// and Utf16Parser::ParseNext (returns only the first usable code point of a string).
class Utf16ParserTests
{
TEST_CLASS(Utf16ParserTests);
// A string made only of single-unit characters must come back as one group per character.
TEST_METHOD(CanParseNonSurrogateText)
{
const std::vector<std::vector<wchar_t>> expected = { CyrillicChar, LatinChar, FullWidthChar, GaelicChar, HiraganaChar };
std::wstring wstr;
for (const auto& charData : expected)
{
wstr.push_back(charData.at(0));
}
const auto result = Utf16Parser::Parse(wstr);
VERIFY_ARE_EQUAL(expected.size(), result.size());
for (size_t i = 0; i < result.size(); ++i)
{
const auto& sequence = result.at(i);
VERIFY_ARE_EQUAL(sequence, expected.at(i));
}
}
// A well-formed surrogate pair must come back as a single group holding both units.
TEST_METHOD(CanParseSurrogatePairs)
{
const std::wstring wstr{ SunglassesEmoji.begin(), SunglassesEmoji.end() };
const auto result = Utf16Parser::Parse(wstr);
VERIFY_ARE_EQUAL(result.size(), 1u);
VERIFY_ARE_EQUAL(result.at(0).size(), SunglassesEmoji.size());
for (size_t i = 0; i < SunglassesEmoji.size(); ++i)
{
VERIFY_ARE_EQUAL(result.at(0).at(i), SunglassesEmoji.at(i));
}
}
// Unpaired surrogates are expected to vanish from the output entirely;
// only the one well-formed pair in each input may remain.
TEST_METHOD(WillDropBadSurrogateCombinations)
{
// test dropping of invalid leading surrogates
std::wstring wstr{ SunglassesEmoji.begin(), SunglassesEmoji.end() };
wstr += wstr;
wstr.at(1) = SunglassesEmoji.at(0); // wstr contains 3 leading, 1 trailing surrogate sequence
auto result = Utf16Parser::Parse(wstr);
VERIFY_ARE_EQUAL(result.size(), 1u);
VERIFY_ARE_EQUAL(result.at(0).size(), SunglassesEmoji.size());
for (size_t i = 0; i < SunglassesEmoji.size(); ++i)
{
VERIFY_ARE_EQUAL(result.at(0).at(i), SunglassesEmoji.at(i));
}
// test dropping of invalid trailing surrogates
wstr = { SunglassesEmoji.begin(), SunglassesEmoji.end() };
wstr += wstr;
wstr.at(0) = SunglassesEmoji.at(1); // wstr contains 2 trailing, 1 leading, 1 trailing surrogate sequence
result = Utf16Parser::Parse(wstr);
VERIFY_ARE_EQUAL(result.size(), 1u);
VERIFY_ARE_EQUAL(result.at(0).size(), SunglassesEmoji.size());
for (size_t i = 0; i < SunglassesEmoji.size(); ++i)
{
VERIFY_ARE_EQUAL(result.at(0).at(i), SunglassesEmoji.at(i));
}
}
// One-character view over UNICODE_REPLACEMENT; the expected ParseNext result
// for inputs that contain nothing but unpaired surrogates.
const std::wstring_view Replacement{ &UNICODE_REPLACEMENT, 1 };
// The ParseNext* tests are named after the shape of the input:
// Lead = leading surrogate, Trail = trailing surrogate, Single = non-surrogate character.
// Input: a lone leading surrogate. Expected: the replacement character.
TEST_METHOD(ParseNextLeadOnly)
{
std::wstring wstr{ SunglassesEmoji.at(0) };
const auto expected = Replacement;
const auto actual = Utf16Parser::ParseNext(wstr);
VERIFY_ARE_EQUAL(expected, actual);
}
// Input: a lone trailing surrogate. Expected: the replacement character.
TEST_METHOD(ParseNextTrailOnly)
{
std::wstring wstr{ SunglassesEmoji.at(1) };
const auto expected = Replacement;
const auto actual = Utf16Parser::ParseNext(wstr);
VERIFY_ARE_EQUAL(expected, actual);
}
// Input: one non-surrogate character. Expected: that character.
TEST_METHOD(ParseNextSingleOnly)
{
std::wstring wstr{ CyrillicChar.at(0) };
const auto expected = std::wstring_view{ CyrillicChar.data(), CyrillicChar.size() };
const auto actual = Utf16Parser::ParseNext(wstr);
VERIFY_ARE_EQUAL(expected, actual);
}
// Input: two leading surrogates. Expected: the replacement character.
TEST_METHOD(ParseNextLeadLead)
{
std::wstring wstr{ SunglassesEmoji.at(0) };
wstr += SunglassesEmoji.at(0);
const auto expected = Replacement;
const auto actual = Utf16Parser::ParseNext(wstr);
VERIFY_ARE_EQUAL(expected, actual);
}
// Input: a well-formed pair. Expected: the whole pair.
TEST_METHOD(ParseNextLeadTrail)
{
std::wstring wstr{ SunglassesEmoji.at(0) };
wstr += SunglassesEmoji.at(1);
const auto expected = std::wstring_view{ SunglassesEmoji.data(), SunglassesEmoji.size() };
const auto actual = Utf16Parser::ParseNext(wstr);
VERIFY_ARE_EQUAL(expected, actual);
}
// Input: two trailing surrogates. Expected: the replacement character.
TEST_METHOD(ParseNextTrailTrail)
{
std::wstring wstr{ SunglassesEmoji.at(1) };
wstr += SunglassesEmoji.at(1);
const auto expected = Replacement;
const auto actual = Utf16Parser::ParseNext(wstr);
VERIFY_ARE_EQUAL(expected, actual);
}
// Input: unpaired leading surrogate, then a regular character.
// Expected: the regular character (the unpaired surrogate is skipped, not replaced).
TEST_METHOD(ParseNextLeadSingle)
{
std::wstring wstr{ SunglassesEmoji.at(0) };
wstr += LatinChar.at(0);
const auto expected = std::wstring_view{ LatinChar.data(), LatinChar.size() };
const auto actual = Utf16Parser::ParseNext(wstr);
VERIFY_ARE_EQUAL(expected, actual);
}
// Input: unpaired trailing surrogate, then a regular character.
// Expected: the regular character (the unpaired surrogate is skipped, not replaced).
TEST_METHOD(ParseNextTrailSingle)
{
std::wstring wstr{ SunglassesEmoji.at(1) };
wstr += LatinChar.at(0);
const auto expected = std::wstring_view{ LatinChar.data(), LatinChar.size() };
const auto actual = Utf16Parser::ParseNext(wstr);
VERIFY_ARE_EQUAL(expected, actual);
}
// Input: unpaired leading surrogate, then a well-formed pair. Expected: the pair.
TEST_METHOD(ParseNextLeadLeadTrail)
{
std::wstring wstr{ SunglassesEmoji.at(0) };
wstr += SunglassesEmoji.at(0);
wstr += SunglassesEmoji.at(1);
const auto expected = std::wstring_view{ SunglassesEmoji.data(), SunglassesEmoji.size() };
const auto actual = Utf16Parser::ParseNext(wstr);
VERIFY_ARE_EQUAL(expected, actual);
}
// Input: unpaired trailing surrogate, then a well-formed pair. Expected: the pair.
TEST_METHOD(ParseNextTrailLeadTrail)
{
std::wstring wstr{ SunglassesEmoji.at(1) };
wstr += SunglassesEmoji.at(0);
wstr += SunglassesEmoji.at(1);
const auto expected = std::wstring_view{ SunglassesEmoji.data(), SunglassesEmoji.size() };
const auto actual = Utf16Parser::ParseNext(wstr);
VERIFY_ARE_EQUAL(expected, actual);
}
// Input: a regular character, then a well-formed pair. Expected: only the regular character.
TEST_METHOD(ParseNextSingleLeadTrail)
{
std::wstring wstr{ GaelicChar.at(0) };
wstr += SunglassesEmoji.at(0);
wstr += SunglassesEmoji.at(1);
const auto expected = std::wstring_view{ GaelicChar.data(), GaelicChar.size() };
const auto actual = Utf16Parser::ParseNext(wstr);
VERIFY_ARE_EQUAL(expected, actual);
}
};

View File

@ -28,7 +28,6 @@ SOURCES = \
ClipboardTests.cpp \
SelectionTests.cpp \
Utf8ToWideCharParserTests.cpp \
Utf16ParserTests.cpp \
OutputCellIteratorTests.cpp \
InitTests.cpp \
TitleTests.cpp \

164
src/inc/til/unicode.h Normal file
View File

@ -0,0 +1,164 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#pragma once
namespace til
{
    namespace details
    {
        // U+FFFD REPLACEMENT CHARACTER. Views returned for malformed input point
        // at this object instead of into the caller's string.
        inline constexpr wchar_t UNICODE_REPLACEMENT = 0xFFFD;
    }

    // Returns true if wch is a UTF-16 surrogate code unit (0xD800 - 0xDFFF).
    // A range check is used instead of a 16-bit mask so that the answer is also
    // correct where wchar_t is wider than 16 bits (e.g. 0x1D800 is not a surrogate).
    static constexpr bool is_surrogate(const wchar_t wch) noexcept
    {
        return wch >= 0xD800 && wch <= 0xDFFF;
    }

    // Returns true if wch is a leading (high) surrogate (0xD800 - 0xDBFF).
    static constexpr bool is_leading_surrogate(const wchar_t wch) noexcept
    {
        return wch >= 0xD800 && wch <= 0xDBFF;
    }

    // Returns true if wch is a trailing (low) surrogate (0xDC00 - 0xDFFF).
    static constexpr bool is_trailing_surrogate(const wchar_t wch) noexcept
    {
        return wch >= 0xDC00 && wch <= 0xDFFF;
    }

    // Returns the first UTF-16 sequence of wstr:
    // * a 1-unit view into wstr if it starts with a non-surrogate code unit,
    // * a 2-unit view into wstr if it starts with a well-formed surrogate pair,
    // * a 1-unit view of U+FFFD if wstr is empty or starts with an unpaired surrogate.
    // Unlike utf16_iterator this never looks past the first sequence.
    // At the time of writing only a single caller uses this; prefer utf16_iterator.
    constexpr std::wstring_view utf16_next(std::wstring_view wstr) noexcept
    {
        if (wstr.empty())
        {
            return { &details::UNICODE_REPLACEMENT, 1 };
        }

        const auto wch = wstr.front();
        if (!is_surrogate(wch))
        {
            return { wstr.data(), 1 };
        }

        // A pair needs a leading surrogate that is directly followed by a trailing one.
        if (wstr.size() >= 2 && is_leading_surrogate(wch) && is_trailing_surrogate(wstr[1]))
        {
            return { wstr.data(), 2 };
        }

        return { &details::UNICODE_REPLACEMENT, 1 };
    }

    // Splits a UTF16 string into codepoints, yielding `wstring_view`s of UTF16 text. Use it as:
    //   for (const auto& str : til::utf16_iterator{ input }) { ... }
    // Every unpaired surrogate is yielded as a 1-unit view of U+FFFD.
    //
    // The decoding state lives in the utf16_iterator object itself and the iterators
    // returned by begin() merely reference it. The object must therefore outlive its
    // iterators and it can only be traversed once.
    struct utf16_iterator
    {
        struct sentinel
        {
        };

        struct iterator
        {
            using iterator_category = std::forward_iterator_tag;
            using value_type = std::wstring_view;
            using reference = value_type&;
            using pointer = value_type*;
            using difference_type = std::ptrdiff_t;

            explicit constexpr iterator(utf16_iterator& p) noexcept :
                _iter{ p }
            {
            }

            // Decodes the current sequence on first access and returns it.
            // Must not be called once the iterator compares equal to the sentinel.
            const value_type& operator*() const noexcept
            {
                return _iter.value();
            }

            iterator& operator++() noexcept
            {
                // Decoding is deferred until the value is read. If the current
                // sequence was never read it still has to be consumed here,
                // otherwise incrementing without dereferencing would never
                // make progress and a counting loop would not terminate.
                if (_iter._advance && _iter.valid())
                {
                    _iter.advance();
                }
                _iter._advance = true;
                return *this;
            }

            // True while there is undecoded input left. Intended to be evaluated
            // at the top of a loop, before the current element is dereferenced.
            bool operator!=(const sentinel&) const noexcept
            {
                return _iter.valid();
            }

        private:
            utf16_iterator& _iter;
        };

        // For empty input _advance starts out false so that value() never reads past the end.
        explicit constexpr utf16_iterator(std::wstring_view wstr) noexcept :
            _it{ wstr.begin() }, _end{ wstr.end() }, _advance{ _it != _end }
        {
        }

        iterator begin() noexcept
        {
            return iterator{ *this };
        }

        sentinel end() noexcept
        {
            return sentinel{};
        }

    private:
        bool valid() const noexcept
        {
            return _it != _end;
        }

        // Decodes the sequence starting at _it into _value and moves _it past it.
        // Precondition: _it != _end.
        void advance() noexcept
        {
            const auto wch = *_it;
            auto ptr = &*_it;
            size_t len = 1;

            ++_it;

            if (is_surrogate(wch))
            {
                // At the end of the input a null character stands in for the
                // missing second unit; it is not a trailing surrogate.
                const auto wch2 = _it != _end ? *_it : wchar_t{};
                if (is_leading_surrogate(wch) && is_trailing_surrogate(wch2))
                {
                    len = 2;
                    ++_it;
                }
                else
                {
                    ptr = &details::UNICODE_REPLACEMENT;
                }
            }

            _value = { ptr, len };
            _advance = false;
        }

        const std::wstring_view& value() noexcept
        {
            if (_advance)
            {
                advance();
            }
            return _value;
        }

        std::wstring_view::iterator _it;
        std::wstring_view::iterator _end;
        std::wstring_view _value;
        bool _advance = true;
    };
}

View File

@ -2,17 +2,16 @@
// Licensed under the MIT license.
#include "precomp.h"
#include <windows.h>
#include "terminalInput.hpp"
#include "strsafe.h"
#include <til/unicode.h>
#include <strsafe.h>
#define WIL_SUPPORT_BITOPERATION_PASCAL_NAMES
#include <wil/Common.h>
#include "../../interactivity/inc/VtApiRedirection.hpp"
#include "../../inc/unicode.hpp"
#include "../../types/inc/Utf16Parser.hpp"
using namespace Microsoft::Console::VirtualTerminal;
@ -739,7 +738,7 @@ bool TerminalInput::HandleFocus(const bool focused) noexcept
// - ch: The UTF-16 character to send.
void TerminalInput::_SendChar(const wchar_t ch)
{
if (Utf16Parser::IsLeadingSurrogate(ch))
if (til::is_leading_surrogate(ch))
{
if (_leadingSurrogate.has_value())
{

View File

@ -0,0 +1,82 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include "precomp.h"
#include "WexTestClass.h"
#include <til/unicode.h>
using namespace WEX::Common;
using namespace WEX::Logging;
using namespace WEX::TestExecution;
#define REPLACEMENT L"\xFFFD"
#define LEADING L"\xD801"
#define TRAILING L"\xDC01"
#define PAIR L"\xD801\xDC01"
// Tests for <til/unicode.h>: til::utf16_next and til::utf16_iterator.
class UnicodeTests
{
    TEST_CLASS(UnicodeTests);

    // utf16_next must return the first UTF-16 sequence of the input,
    // or U+FFFD if the input is empty or starts with an unpaired surrogate.
    TEST_METHOD(utf16_next)
    {
        struct Test
        {
            std::wstring_view input;
            std::wstring_view expected;
        };

        static constexpr std::array tests{
            Test{ L"", REPLACEMENT },
            Test{ L"a", L"a" },
            Test{ L"abc", L"a" },
            Test{ L"a" PAIR, L"a" },
            Test{ L"a" LEADING, L"a" },
            Test{ L"a" TRAILING, L"a" },
            Test{ PAIR L"a", PAIR },
            Test{ LEADING L"a", REPLACEMENT },
            Test{ TRAILING L"a", REPLACEMENT },
            // A surrogate as the very last code unit exercises the end-of-input check.
            Test{ PAIR, PAIR },
            Test{ LEADING, REPLACEMENT },
            Test{ TRAILING, REPLACEMENT },
        };

        for (const auto& t : tests)
        {
            const auto actual = til::utf16_next(t.input);
            VERIFY_ARE_EQUAL(t.expected, actual);
        }
    }

    // utf16_iterator must yield every sequence of the input in order,
    // with each unpaired surrogate turned into its own U+FFFD.
    TEST_METHOD(utf16_iterator)
    {
        struct Test
        {
            std::wstring_view input;
            til::some<std::wstring_view, 5> expected;
        };

        static constexpr std::array tests{
            Test{ L"", {} },
            Test{ L"a", { L"a" } },
            Test{ L"abc", { L"a", L"b", L"c" } },
            Test{ PAIR L"a" PAIR L"b" PAIR, { PAIR, L"a", PAIR, L"b", PAIR } },
            Test{ LEADING L"a" LEADING L"b" LEADING, { REPLACEMENT, L"a", REPLACEMENT, L"b", REPLACEMENT } },
            Test{ TRAILING L"a" TRAILING L"b" TRAILING, { REPLACEMENT, L"a", REPLACEMENT, L"b", REPLACEMENT } },
            Test{ L"a" TRAILING LEADING L"b", { L"a", REPLACEMENT, REPLACEMENT, L"b" } },
        };

        for (const auto& t : tests)
        {
            auto it = t.expected.begin();
            const auto end = t.expected.end();

            for (const auto& v : til::utf16_iterator{ t.input })
            {
                // Fails if the iterator yields more items than expected.
                VERIFY_ARE_NOT_EQUAL(end, it);
                VERIFY_ARE_EQUAL(*it, v);
                ++it;
            }

            // Fails if the iterator yields fewer items than expected.
            VERIFY_ARE_EQUAL(end, it);
        }
    }
};

View File

@ -33,6 +33,7 @@ SOURCES = \
StaticMapTests.cpp \
string.cpp \
u8u16convertTests.cpp \
UnicodeTests.cpp \
DefaultResource.rc \
# These tests are disabled because of a missing symbol.

View File

@ -35,6 +35,7 @@
<ClCompile Include="string.cpp" />
<ClCompile Include="throttled_func.cpp" />
<ClCompile Include="u8u16convertTests.cpp" />
<ClCompile Include="UnicodeTests.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\inc\til\at.h" />
@ -64,6 +65,7 @@
<ClInclude Include="..\..\inc\til\throttled_func.h" />
<ClInclude Include="..\..\inc\til\ticket_lock.h" />
<ClInclude Include="..\..\inc\til\u8u16convert.h" />
<ClInclude Include="..\..\inc\til\unicode.h" />
<ClInclude Include="..\precomp.h" />
</ItemGroup>
<ItemDefinitionGroup>

View File

@ -26,6 +26,7 @@
<ClCompile Include="string.cpp" />
<ClCompile Include="throttled_func.cpp" />
<ClCompile Include="u8u16convertTests.cpp" />
<ClCompile Include="UnicodeTests.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\precomp.h" />
@ -110,6 +111,9 @@
<ClInclude Include="..\..\inc\til\u8u16convert.h">
<Filter>inc</Filter>
</ClInclude>
<ClInclude Include="..\..\inc\til\unicode.h">
<Filter>inc</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="inc">

View File

@ -1,91 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include "precomp.h"
#include "inc/Utf16Parser.hpp"
#include "unicode.hpp"
// Routine Description:
// - Scans the given UTF-16 string from the front and returns the first usable codepoint.
// - A leading surrogate directly followed by a trailing surrogate is returned as a pair,
//   any non-surrogate character is returned by itself.
// - Unpaired surrogates are skipped over; they are not reported to the caller.
// - Does not validate UTF-16 input beyond proper leading/trailing character sequences.
// Arguments:
// - wstr - The UTF-16 string to parse.
// Return Value:
// - A view into the given string covering just that codepoint, or a view of the
//   replacement character if the string contains nothing usable.
std::wstring_view Utf16Parser::ParseNext(std::wstring_view wstr) noexcept
{
    const auto count = wstr.size();
    size_t index = 0;

    while (index < count)
    {
        const auto current = wstr.at(index);
        const auto isLead = IsLeadingSurrogate(current);

        // Anything below U+10000 that isn't a surrogate stands on its own.
        if (!isLead && !IsTrailingSurrogate(current))
        {
            return wstr.substr(index, 1);
        }

        // A lead only counts if its trail follows immediately.
        if (isLead && index + 1 < count && IsTrailingSurrogate(wstr.at(index + 1)))
        {
            return wstr.substr(index, 2);
        }

        // Unpaired lead or stray trail: keep seeking forward.
        ++index;
    }

    // Nothing valid in the entire string, so hand back the replacement character.
    return std::wstring_view{ &UNICODE_REPLACEMENT, 1 };
}
// Routine Description:
// - formats a utf16 encoded wstring and splits the codepoints into individual collections.
// - will drop badly formatted leading/trailing char sequences. A leading surrogate only
//   forms a pair with the trailing surrogate that directly follows it.
// - does not validate utf16 input beyond proper leading/trailing char sequences.
// Arguments:
// - wstr - the string to parse
// Return Value:
// - a vector of utf16 codepoints. glyphs that require surrogate pairs will be grouped
//   together in a vector and codepoints that use only one wchar will be in a vector by themselves.
std::vector<std::vector<wchar_t>> Utf16Parser::Parse(std::wstring_view wstr)
{
    std::vector<std::vector<wchar_t>> result;
    // The leading surrogate seen in the previous code unit, if any.
    // 0 is not a leading surrogate and thus doubles as "none".
    wchar_t pendingLead = 0;

    for (const auto wch : wstr)
    {
        if (IsLeadingSurrogate(wch))
        {
            // An earlier lead without a trail is dropped by overwriting it.
            pendingLead = wch;
        }
        else if (IsTrailingSurrogate(wch))
        {
            if (pendingLead != 0)
            {
                result.push_back({ pendingLead, wch });
            }
            pendingLead = 0;
        }
        else
        {
            // A regular character separates a lead from any later trail. Forget the lead,
            // otherwise it would be combined with a trail it was never adjacent to.
            pendingLead = 0;
            result.push_back({ wch });
        }
    }

    return result;
}

View File

@ -1,46 +0,0 @@
/*++
Copyright (c) Microsoft Corporation
Module Name:
- Utf16Parser.hpp
Abstract:
- Parser for grouping together utf16 codepoints from a string of utf16 encoded text
Author(s):
- Austin Diviness (AustDi) 25-Apr-2018
--*/
#pragma once
#include <vector>
// Static helpers for grouping the UTF-16 code units of a string into codepoints.
class Utf16Parser final
{
public:
    static std::vector<std::vector<wchar_t>> Parse(std::wstring_view wstr);
    static std::wstring_view ParseNext(std::wstring_view wstr) noexcept;

    // Routine Description:
    // - tells whether a wchar lies in the utf16 leading surrogate range (0xD800 - 0xDBFF)
    // Arguments:
    // - wch - the wchar to check
    // Return Value:
    // - true if wch is a leading surrogate, false otherwise
    static constexpr bool IsLeadingSurrogate(const wchar_t wch) noexcept
    {
        return !(wch < 0xD800 || wch > 0xDBFF);
    }

    // Routine Description:
    // - tells whether a wchar lies in the utf16 trailing surrogate range (0xDC00 - 0xDFFF)
    // Arguments:
    // - wch - the wchar to check
    // Return Value:
    // - true if wch is a trailing surrogate, false otherwise
    static constexpr bool IsTrailingSurrogate(const wchar_t wch) noexcept
    {
        return !(wch < 0xDC00 || wch > 0xDFFF);
    }
};

View File

@ -30,7 +30,6 @@
<ClCompile Include="..\UiaTracing.cpp" />
<ClCompile Include="..\TermControlUiaTextRange.cpp" />
<ClCompile Include="..\TermControlUiaProvider.cpp" />
<ClCompile Include="..\Utf16Parser.cpp" />
<ClCompile Include="..\Viewport.cpp" />
<ClCompile Include="..\WindowBufferSizeEvent.cpp" />
<ClCompile Include="..\precomp.cpp">
@ -52,7 +51,6 @@
<ClInclude Include="..\inc\ThemeUtils.h" />
<ClInclude Include="..\inc\utils.hpp" />
<ClInclude Include="..\inc\Viewport.hpp" />
<ClInclude Include="..\inc\Utf16Parser.hpp" />
<ClInclude Include="..\IUiaData.h" />
<ClInclude Include="..\IUiaEventDispatcher.h" />
<ClInclude Include="..\IUiaTraceable.h" />

View File

@ -57,9 +57,6 @@
<ClCompile Include="..\GlyphWidth.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Utf16Parser.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\utils.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@ -110,9 +107,6 @@
<ClInclude Include="..\inc\CodepointWidthDetector.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\inc\Utf16Parser.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\inc\GlyphWidth.hpp">
<Filter>Header Files</Filter>
</ClInclude>
@ -137,9 +131,6 @@
<ClInclude Include="..\inc\Viewport.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\inc\Utf16Parser.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\precomp.h">
<Filter>Header Files</Filter>
</ClInclude>

View File

@ -41,7 +41,6 @@ SOURCES= \
..\WindowBufferSizeEvent.cpp \
..\convert.cpp \
..\colorTable.cpp \
..\Utf16Parser.cpp \
..\utils.cpp \
..\ThemeUtils.cpp \
..\ScreenInfoUiaProviderBase.cpp \