[Support] Move LLD's parallel algorithm wrappers to support

Essentially takes the lld/Common/Threads.h wrappers and moves them to
the llvm/Support/Parallel.h algorithm header.

The changes are:
- Remove policy parameter, since all clients use `par`.
- Rename the methods to `parallelSort` etc to match LLVM style, since
  they are no longer C++17 pstl compatible.
- Move algorithms from llvm::parallel:: to llvm::, since they have
  "parallel" in the name and are no longer overloads of the regular
  algorithms.
- Add range overloads
- Use the sequential algorithm directly when 1 thread is requested
  (skips task grouping)
- Fix the index type of parallelForEachN to size_t. Nobody in LLVM was
  using any other parameter, and it made overload resolution hard for
  for_each_n(par, 0, foo.size(), ...) because 0 is int, not size_t.

Remove Threads.h and update LLD for that.

This is a prerequisite for parallel public symbol processing in the PDB
library, which is in LLVM.

Reviewed By: MaskRay, aganea

Differential Revision: https://reviews.llvm.org/D79390
This commit is contained in:
Reid Kleckner 2020-05-04 20:03:19 -07:00
parent 855e02e799
commit 932f0276ea
25 changed files with 67 additions and 175 deletions

View File

@ -21,7 +21,6 @@
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Filesystem.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Threads.h"
#include "lld/Common/Timer.h"
#include "lld/Common/Version.h"
#include "llvm/ADT/Optional.h"
@ -39,6 +38,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/TarWriter.h"

View File

@ -21,7 +21,6 @@
#include "Chunks.h"
#include "Symbols.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Threads.h"
#include "lld/Common/Timer.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/Support/Debug.h"

View File

@ -23,7 +23,7 @@
#include "Symbols.h"
#include "Writer.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Threads.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

View File

@ -32,8 +32,8 @@
#include "Symbols.h"
#include "Writer.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Threads.h"
#include "lld/Common/Timer.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"

View File

@ -16,7 +16,6 @@
#include "TypeMerger.h"
#include "Writer.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Threads.h"
#include "lld/Common/Timer.h"
#include "llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h"
#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
@ -57,6 +56,7 @@
#include "llvm/Support/Errc.h"
#include "llvm/Support/FormatAdapters.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/ScopedPrinter.h"
#include <memory>

View File

@ -17,7 +17,6 @@
#include "Symbols.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Threads.h"
#include "lld/Common/Timer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"

View File

@ -8,7 +8,7 @@
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Threads.h"
#include "llvm/Support/Parallel.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/DiagnosticInfo.h"

View File

@ -11,10 +11,10 @@
//===----------------------------------------------------------------------===//
#include "lld/Common/Filesystem.h"
#include "lld/Common/Threads.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Parallel.h"
#if LLVM_ON_UNIX
#include <unistd.h>
#endif

View File

@ -43,7 +43,6 @@
#include "lld/Common/Memory.h"
#include "lld/Common/Strings.h"
#include "lld/Common/TargetOptionsCommandFlags.h"
#include "lld/Common/Threads.h"
#include "lld/Common/Version.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/StringExtras.h"
@ -53,6 +52,7 @@
#include "llvm/Support/Compression.h"
#include "llvm/Support/GlobPattern.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/TarWriter.h"
#include "llvm/Support/TargetSelect.h"

View File

@ -80,10 +80,10 @@
#include "Symbols.h"
#include "SyntheticSections.h"
#include "Writer.h"
#include "lld/Common/Threads.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Object/ELF.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/xxhash.h"
#include <algorithm>
@ -467,9 +467,8 @@ template <class ELFT> void ICF<ELFT>::run() {
}
// Initially, we use hash values to partition sections.
parallelForEach(sections, [&](InputSection *s) {
s->eqClass[0] = xxHash64(s->data());
});
parallelForEach(
sections, [&](InputSection *s) { s->eqClass[0] = xxHash64(s->data()); });
for (unsigned cnt = 0; cnt != 2; ++cnt) {
parallelForEach(sections, [&](InputSection *s) {

View File

@ -21,7 +21,6 @@
#include "Writer.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Strings.h"
#include "lld/Common/Threads.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/ELF.h"
@ -29,6 +28,7 @@
#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Path.h"
#include <algorithm>
#include <cassert>

View File

@ -26,9 +26,9 @@
#include "Symbols.h"
#include "SyntheticSections.h"
#include "lld/Common/Strings.h"
#include "lld/Common/Threads.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

View File

@ -14,11 +14,11 @@
#include "Target.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Strings.h"
#include "lld/Common/Threads.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/Support/Compression.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/SHA1.h"
#include <regex>

View File

@ -26,7 +26,6 @@
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Strings.h"
#include "lld/Common/Threads.h"
#include "lld/Common/Version.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/StringExtras.h"
@ -37,6 +36,7 @@
#include "llvm/Support/Endian.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/TimeProfiler.h"
#include <cstdlib>
#include <thread>

View File

@ -22,9 +22,9 @@
#include "lld/Common/Filesystem.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Strings.h"
#include "lld/Common/Threads.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/RandomNumberGenerator.h"
#include "llvm/Support/SHA1.h"
#include "llvm/Support/TimeProfiler.h"

View File

@ -1,90 +0,0 @@
//===- Threads.h ------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// LLD supports threads to distribute workloads to multiple cores. Using
// multiple cores is most effective when more than one core is idle. At the
// last step of a build, it is often the case that a linker is the only
// active process on a computer. So, we are naturally interested in using
// threads wisely to reduce latency to deliver results to users.
//
// That said, we don't want to do "too clever" things using threads.
// Complex multi-threaded algorithms are sometimes extremely hard to
// reason about and can easily mess up the entire design.
//
// Fortunately, when a linker links large programs (when the link time is
// most critical), it spends most of its time working on a massive number of
// small pieces of data of the same kind, and there are opportunities for
// large parallelism there. Here are examples:
//
// - We have hundreds of thousands of input sections that need to be
// copied to a result file at the last step of link. Once we fix a file
// layout, each section can be copied to its destination and its
// relocations can be applied independently.
//
// - We have tens of millions of small strings when constructing a
// mergeable string section.
//
// For the cases such as the former, we can just use parallelForEach
// instead of std::for_each (or a plain for loop). Because tasks are
// completely independent from each other, we can run them in parallel
// without any coordination between them. That's very easy to understand
// and reason about.
//
// For the cases such as the latter, we can use parallel algorithms to
// deal with massive data. We have to write code for a tailored algorithm
// for each problem, but the complexity of multi-threading is isolated in
// a single pass and doesn't affect the linker's overall design.
//
// The above approach seems to be working fairly well. As an example, when
// linking Chromium (output size 1.6 GB), using 4 cores reduces latency to
// 75% compared to single core (from 12.66 seconds to 9.55 seconds) on my
// Ivy Bridge Xeon 2.8 GHz machine. Using 40 cores reduces it to 63% (from
// 12.66 seconds to 7.95 seconds). Because of Amdahl's law, the
// speedup is not linear, but as you add more cores, it gets faster.
//
// On a final note, if you are trying to optimize, keep the axiom "don't
// guess, measure!" in mind. Some important passes of the linker are not
// that slow. For example, resolving all symbols is not a very heavy pass,
// although it would be very hard to parallelize it. You want to first
// identify a slow pass and then optimize it.
//
//===----------------------------------------------------------------------===//
#ifndef LLD_COMMON_THREADS_H
#define LLD_COMMON_THREADS_H
#include "llvm/Support/Parallel.h"
#include <functional>
namespace lld {
// Applies fn to every element of range. The sequential policy is used when
// exactly one thread has been requested through llvm::parallel::strategy;
// any other setting goes through the parallel policy.
template <typename R, class FuncTy> void parallelForEach(R &&range, FuncTy fn) {
  const bool singleThreaded = llvm::parallel::strategy.ThreadsRequested == 1;
  if (singleThreaded) {
    for_each(llvm::parallel::seq, std::begin(range), std::end(range), fn);
    return;
  }
  for_each(llvm::parallel::par, std::begin(range), std::end(range), fn);
}
// Calls fn with indices taken from begin up to (but not including) end. The
// sequential policy is selected when exactly one thread has been requested
// through llvm::parallel::strategy, the parallel policy otherwise.
inline void parallelForEachN(size_t begin, size_t end,
                             llvm::function_ref<void(size_t)> fn) {
  if (llvm::parallel::strategy.ThreadsRequested == 1) {
    for_each_n(llvm::parallel::seq, begin, end, fn);
    return;
  }
  for_each_n(llvm::parallel::par, begin, end, fn);
}
// Sorts range using fn as the comparison function. The sequential policy is
// selected when exactly one thread has been requested through
// llvm::parallel::strategy, the parallel policy otherwise.
template <typename R, class FuncTy> void parallelSort(R &&range, FuncTy fn) {
  auto first = std::begin(range);
  auto last = std::end(range);
  if (llvm::parallel::strategy.ThreadsRequested == 1)
    sort(llvm::parallel::seq, first, last, fn);
  else
    sort(llvm::parallel::par, first, last, fn);
}
} // namespace lld
#endif

View File

@ -461,10 +461,11 @@ llvm::Error LayoutPass::perform(SimpleFile &mergedFile) {
});
std::vector<LayoutPass::SortKey> vec = decorate(atomRange);
sort(llvm::parallel::par, vec.begin(), vec.end(),
[&](const LayoutPass::SortKey &l, const LayoutPass::SortKey &r) -> bool {
return compareAtoms(l, r, _customSorter);
});
llvm::parallelSort(
vec,
[&](const LayoutPass::SortKey &l, const LayoutPass::SortKey &r) -> bool {
return compareAtoms(l, r, _customSorter);
});
LLVM_DEBUG(checkTransitivity(vec, _customSorter));
undecorate(atomRange, vec);

View File

@ -19,7 +19,6 @@
#include "lld/Common/Memory.h"
#include "lld/Common/Reproduce.h"
#include "lld/Common/Strings.h"
#include "lld/Common/Threads.h"
#include "lld/Common/Version.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Object/Wasm.h"
@ -27,6 +26,7 @@
#include "llvm/Option/ArgList.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/TarWriter.h"

View File

@ -12,9 +12,9 @@
#include "OutputSegment.h"
#include "WriterUtils.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Threads.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/Parallel.h"
#define DEBUG_TYPE "lld"

View File

@ -20,7 +20,6 @@
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Strings.h"
#include "lld/Common/Threads.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
@ -31,6 +30,7 @@
#include "llvm/Support/Format.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/Parallel.h"
#include <cstdarg>
#include <map>

View File

@ -22,17 +22,6 @@
namespace llvm {
namespace parallel {
// Empty tag type for the sequential execution policy; `seq` below is the
// instance callers pass.
struct sequential_execution_policy {};
// Empty tag type for the parallel execution policy; `par` below is the
// instance callers pass.
struct parallel_execution_policy {};
// Compile-time trait used by the policy-taking algorithms' static_asserts
// ("Invalid execution policy!"). Its ::value comes from llvm::is_one_of over
// the two tag types above — presumably true only when T is exactly one of
// them; confirm against llvm::is_one_of.
template <typename T>
struct is_execution_policy
: public std::integral_constant<
bool, llvm::is_one_of<T, sequential_execution_policy,
parallel_execution_policy>::value> {};
// Ready-made policy objects to pass as an algorithm's first argument.
constexpr sequential_execution_policy seq{};
constexpr parallel_execution_policy par{};
// Strategy for the default executor used by the parallel routines provided by
// this file. It defaults to using all hardware threads and should be
@ -169,61 +158,58 @@ void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) {
#endif
// Default comparator for an iterator type: std::less over the iterator's
// value_type as reported by std::iterator_traits.
template <typename Iter>
using DefComparator =
std::less<typename std::iterator_traits<Iter>::value_type>;
} // namespace detail
} // namespace parallel
// sequential algorithm implementations.
template <class Policy, class RandomAccessIterator,
class Comparator = detail::DefComparator<RandomAccessIterator>>
void sort(Policy policy, RandomAccessIterator Start, RandomAccessIterator End,
const Comparator &Comp = Comparator()) {
static_assert(is_execution_policy<Policy>::value,
"Invalid execution policy!");
// Sorts [Start, End) with Comp, which defaults to std::less over the
// iterator's value_type. When threads are compiled in and the requested
// thread count is not 1, the work goes to parallel::detail::parallel_sort;
// in every other case llvm::sort is called directly.
template <class RandomAccessIterator,
          class Comparator = std::less<
              typename std::iterator_traits<RandomAccessIterator>::value_type>>
void parallelSort(RandomAccessIterator Start, RandomAccessIterator End,
                  const Comparator &Comp = Comparator()) {
#if LLVM_ENABLE_THREADS
  if (parallel::strategy.ThreadsRequested == 1)
    llvm::sort(Start, End, Comp);
  else
    parallel::detail::parallel_sort(Start, End, Comp);
#else
  llvm::sort(Start, End, Comp);
#endif
}
template <class Policy, class IterTy, class FuncTy>
void for_each(Policy policy, IterTy Begin, IterTy End, FuncTy Fn) {
static_assert(is_execution_policy<Policy>::value,
"Invalid execution policy!");
// Applies Fn to each element of [Begin, End). When threads are compiled in
// and the requested thread count is not 1, the work goes to
// parallel::detail::parallel_for_each; in every other case std::for_each is
// called directly.
template <class IterTy, class FuncTy>
void parallelForEach(IterTy Begin, IterTy End, FuncTy Fn) {
#if LLVM_ENABLE_THREADS
  if (parallel::strategy.ThreadsRequested == 1)
    std::for_each(Begin, End, Fn);
  else
    parallel::detail::parallel_for_each(Begin, End, Fn);
#else
  std::for_each(Begin, End, Fn);
#endif
}
template <class Policy, class IndexTy, class FuncTy>
void for_each_n(Policy policy, IndexTy Begin, IndexTy End, FuncTy Fn) {
static_assert(is_execution_policy<Policy>::value,
"Invalid execution policy!");
for (IndexTy I = Begin; I != End; ++I)
// Invokes Fn(I) for every index I in [Begin, End). When threads are compiled
// in and the requested thread count is not 1, the work goes to
// parallel::detail::parallel_for_each_n; otherwise the indices are visited
// by a plain ascending loop.
template <class FuncTy>
void parallelForEachN(size_t Begin, size_t End, FuncTy Fn) {
#if LLVM_ENABLE_THREADS
  const bool RunSequentially = parallel::strategy.ThreadsRequested == 1;
  if (!RunSequentially) {
    parallel::detail::parallel_for_each_n(Begin, End, Fn);
    return;
  }
#endif
  size_t Index = Begin;
  while (Index != End) {
    Fn(Index);
    ++Index;
  }
}
// Parallel algorithm implementations, only available when LLVM_ENABLE_THREADS
// is true.
#if LLVM_ENABLE_THREADS
template <class RandomAccessIterator,
class Comparator = detail::DefComparator<RandomAccessIterator>>
void sort(parallel_execution_policy policy, RandomAccessIterator Start,
RandomAccessIterator End, const Comparator &Comp = Comparator()) {
detail::parallel_sort(Start, End, Comp);
// Range wrappers.
// Sorts the whole range R by forwarding its begin/end iterators and Comp to
// the iterator-pair parallelSort overload.
//
// Comp defaults to std::less over the range's element type, the same default
// the iterator-pair overload uses. The element type is derived from
// std::declval<RangeTy &>() instead of RangeTy(): for an lvalue argument
// RangeTy deduces to a reference type, which cannot be value-initialized, so
// a default built on RangeTy() fails substitution and leaves no viable
// overload when the comparator is omitted.
template <class RangeTy,
          class Comparator = std::less<typename std::iterator_traits<
              decltype(std::begin(std::declval<RangeTy &>()))>::value_type>>
void parallelSort(RangeTy &&R, const Comparator &Comp = Comparator()) {
  parallelSort(std::begin(R), std::end(R), Comp);
}
template <class IterTy, class FuncTy>
void for_each(parallel_execution_policy policy, IterTy Begin, IterTy End,
FuncTy Fn) {
detail::parallel_for_each(Begin, End, Fn);
// Range convenience wrapper: takes R's begin and end iterators and hands
// them, together with Fn, to the iterator-pair parallelForEach overload.
template <class RangeTy, class FuncTy>
void parallelForEach(RangeTy &&R, FuncTy Fn) {
  auto First = std::begin(R);
  auto Last = std::end(R);
  parallelForEach(First, Last, Fn);
}
template <class IndexTy, class FuncTy>
void for_each_n(parallel_execution_policy policy, IndexTy Begin, IndexTy End,
FuncTy Fn) {
detail::parallel_for_each_n(Begin, End, Fn);
}
#endif
} // namespace parallel
} // namespace llvm
#endif // LLVM_SUPPORT_PARALLEL_H

View File

@ -30,7 +30,7 @@ TEST(Parallel, sort) {
for (auto &i : array)
i = dist(randEngine);
sort(parallel::par, std::begin(array), std::end(array));
parallelSort(std::begin(array), std::end(array));
ASSERT_TRUE(llvm::is_sorted(array));
}
@ -40,7 +40,7 @@ TEST(Parallel, parallel_for) {
// writing.
uint32_t range[2050];
std::fill(range, range + 2050, 1);
for_each_n(parallel::par, 0, 2049, [&range](size_t I) { ++range[I]; });
parallelForEachN(0, 2049, [&range](size_t I) { ++range[I]; });
uint32_t expected[2049];
std::fill(expected, expected + 2049, 2);

View File

@ -390,8 +390,7 @@ ParallelDiagnosticHandler handler(context);
// Process a list of operations in parallel.
std::vector<Operation *> opsToProcess = ...;
llvm::for_each_n(llvm::parallel::par, 0, opsToProcess.size(),
[&](size_t i) {
llvm::parallelForEachN(0, opsToProcess.size(), [&](size_t i) {
// Notify the handler that we are processing the i'th operation.
handler.setOrderIDForThread(i);
auto *op = opsToProcess[i];

View File

@ -493,8 +493,8 @@ void OpToOpPassAdaptor::runOnOperationAsyncImpl() {
// An atomic failure variable for the async executors.
std::atomic<bool> passFailed(false);
llvm::parallel::for_each(
llvm::parallel::par, asyncExecutors.begin(),
llvm::parallelForEach(
asyncExecutors.begin(),
std::next(asyncExecutors.begin(),
std::min(asyncExecutors.size(), opAMPairs.size())),
[&](MutableArrayRef<OpPassManager> pms) {

View File

@ -496,9 +496,8 @@ static void canonicalizeSCC(CallGraph &cg, CGUseList &useList,
// be reworked.
if (context->isMultithreadingEnabled()) {
ParallelDiagnosticHandler canonicalizationHandler(context);
llvm::parallel::for_each_n(
llvm::parallel::par, /*Begin=*/size_t(0),
/*End=*/nodesToCanonicalize.size(), [&](size_t index) {
llvm::parallelForEachN(
/*Begin=*/0, /*End=*/nodesToCanonicalize.size(), [&](size_t index) {
// Set the order for this thread so that diagnostics will be properly
// ordered.
canonicalizationHandler.setOrderIDForThread(index);