From c2e08aba1afd5a69dbe74b03ce6f463d45102222 Mon Sep 17 00:00:00 2001 From: wlei Date: Sun, 28 Nov 2021 18:42:09 -0800 Subject: [PATCH] [llvm-profgen] Compute and show profile density AutoFDO performance is sensitive to profile density, i.e., the number of samples in the profile relative to the program size, because profiles with insufficient samples could be inaccurate due to statistical noise and thus hurt AutoFDO performance. A previous investigation showed that AutoFDO performed better on MySQL with an increased number of samples. Therefore, we implement a profile-density computation feature to give hints about profile density to users and the compiler. We define the density of a profile Prof as follows: - For each function A in the profile, density(A) = total_samples(A) / sizeof(A). - density(Prof) = min(density(A)) for all functions A that are warm (defined below). A function is considered warm if its total sample count is within the top N percent of the profile. For the implementation, we reuse `ProfileSummaryBuilder::getHotCountThreshold(..)` as the threshold, which can be set by percent (`--profile-summary-cutoff-hot`) or by value (`--profile-summary-hot-count`). We also introduce `--hot-function-density-threshold` to set the hot-function density threshold; if the profile density is below it, we emit a suggestion to collect more samples. This also applies to CS profiles, whose per-context profiles are merged into context-less base profiles before the density is computed. 
Reviewed By: hoy, wenlei Differential Revision: https://reviews.llvm.org/D113781 --- .../Inputs/profile-density-cs.raw.prof | 154 ++++++++++++++++++ .../Inputs/profile-density.raw.prof | 29 ++++ .../tools/llvm-profgen/profile-density.test | 64 ++++++++ llvm/tools/llvm-profgen/ProfileGenerator.cpp | 79 ++++++++- llvm/tools/llvm-profgen/ProfileGenerator.h | 22 ++- llvm/tools/llvm-profgen/ProfiledBinary.h | 15 ++ 6 files changed, 356 insertions(+), 7 deletions(-) create mode 100644 llvm/test/tools/llvm-profgen/Inputs/profile-density-cs.raw.prof create mode 100644 llvm/test/tools/llvm-profgen/Inputs/profile-density.raw.prof create mode 100644 llvm/test/tools/llvm-profgen/profile-density.test diff --git a/llvm/test/tools/llvm-profgen/Inputs/profile-density-cs.raw.prof b/llvm/test/tools/llvm-profgen/Inputs/profile-density-cs.raw.prof new file mode 100644 index 000000000000..7c001baadea5 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/Inputs/profile-density-cs.raw.prof @@ -0,0 +1,154 @@ +[main] + 8 + 810-82f:15 + 834-85c:15 + 870-870:1544 + 875-8a1:11 + 875-8bf:1223 + 875-8c3:185 + 893-8bf:176 + 8a7-8c3:13 + 5 + 82f->790:15 + 870->540:1546 + 8a1->810:15 + 8bf->870:2022 + 8c3->893:276 +[partition_pivot_first] + 10 + 710-72d:238 + 740-753:1 + 740-75b:739 + 740-75f:267 + 740-761:1164 + 743-753:12 + 743-75b:2414 + 743-761:793 + 755-75b:103 + 755-75f:115 + 3 + 753->770:13 + 75b->743:3327 + 75f->740:385 +[partition_pivot_first:4.2 @ swap] + 1 + 764-76e:2904 + 1 + 76e->740:2999 +[partition_pivot_first:5 @ swap] + 2 + 770-770:619 + 77a-783:619 + 0 +[partition_pivot_last] + 15 + 650-66d:206 + 650-675:182 + 682-689:164 + 686-689:193 + 6b0-6b7:18 + 6b0-6bf:2082 + 6b0-6c8:1180 + 6b0-6ca:683 + 6b9-6bf:170 + 6b9-6c8:92 + 6b9-6ca:62 + 6d0-6d3:2230 + 6e3-6ea:712 + 6e3-6ef:1518 + 6ec-6ef:667 + 8 + 66d->686:206 + 675->682:79 + 689->6b9:359 + 6b7->68b:18 + 6bf->6d0:2307 + 6c8->6b0:1300 + 6ca->6ec:755 + 6ea->6b0:724 +[partition_pivot_last:5 @ swap] + 3 + 677-67d:292 + 6d6-6df:3621 + 
6f2-700:3528 + 1 + 700->6b0:3619 +[partition_pivot_last:6 @ swap] + 2 + 68b-68b:1124 + 695-69e:1124 + 0 +[quick_sort] + 4 + 790-79c:1273 + 7a6-7a6:1273 + 7a8-7b8:941 + 7bd-7ca:791 + 4 + 7a6->650:817 + 7a6->710:489 + 7b8->790:961 + 7ca->790:805 +[quick_sort:2 @ partition_pivot_first] + 12 + 710-72d:408 + 740-753:208 + 740-75b:463 + 740-75f:262 + 740-761:496 + 743-753:386 + 743-75b:1300 + 743-761:451 + 755-75b:283 + 755-75f:144 + 774-777:619 + 787-788:619 + 4 + 753->770:619 + 75b->743:2137 + 75f->740:427 + 788->7a8:646 +[quick_sort:2 @ partition_pivot_last] + 17 + 650-66d:295 + 650-675:517 + 682-689:528 + 686-689:307 + 68f-692:1124 + 6a2-6a2:1124 + 6b0-6b7:806 + 6b0-6bf:1093 + 6b0-6c8:935 + 6b0-6ca:351 + 6b9-6bf:226 + 6b9-6c8:273 + 6b9-6ca:81 + 6d0-6d3:1391 + 6e3-6ea:500 + 6e3-6ef:891 + 6ec-6ef:452 + 9 + 66d->686:307 + 675->682:340 + 689->6b9:580 + 6a2->7a8:1167 + 6b7->68b:834 + 6bf->6d0:1391 + 6c8->6b0:1263 + 6ca->6ec:452 + 6ea->6b0:518 +[quick_sort:4 @ quick_sort] + 6 + 790-792:831 + 790-79c:331 + 7a6-7a6:331 + 7a8-7b8:441 + 7bd-7ca:632 + 7d7-7d7:2029 + 6 + 792->7d7:853 + 7a6->650:248 + 7a6->710:103 + 7b8->790:462 + 7ca->790:661 + 7d7->7cf:2097 diff --git a/llvm/test/tools/llvm-profgen/Inputs/profile-density.raw.prof b/llvm/test/tools/llvm-profgen/Inputs/profile-density.raw.prof new file mode 100644 index 000000000000..fcaa5da2f9a6 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/Inputs/profile-density.raw.prof @@ -0,0 +1,29 @@ +27 +400540-400540:10 +400650-40066d:31 +400686-400689:3 +40068b-4006a2:3 +4006b0-4006b7:3 +4006b0-4006bf:6 +4006b0-4006c8:6 +4006d0-4006ea:51 +4006d0-400700:4 +4006ec-400700:30 +400710-40072f:5 +400740-400753:3 +400740-40075b:9 +400740-40076e:14 +400743-400753:3 +400743-40075b:43 +400743-40076e:11 +400755-40075b:4 +400770-400788:6 +400790-400792:12 +400790-4007a6:12 +4007a8-4007b8:11 +4007bd-4007ca:12 +4007cf-4007d7:12 +4007d7-4007d7:12 +400870-400870:12 +400875-4008bf:10 +0 diff --git a/llvm/test/tools/llvm-profgen/profile-density.test 
b/llvm/test/tools/llvm-profgen/profile-density.test new file mode 100644 index 000000000000..4ab1f79344fd --- /dev/null +++ b/llvm/test/tools/llvm-profgen/profile-density.test @@ -0,0 +1,64 @@ +; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -hot-function-density-threshold=1 &> %t2 +; RUN: FileCheck %s --input-file %t2 --check-prefix=CHECK-DENSITY + +; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -hot-function-density-threshold=1 &> %t4 +; RUN: FileCheck %s --input-file %t4 --check-prefix=CHECK-DENSITY-CS + +;CHECK-DENSITY: AutoFDO is estimated to optimize better with 4.9x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples. +;CHECK-DENSITY: Minimum profile density for hot functions with top 99.00% total samples: 0.2 + +;CHECK-DENSITY-CS: Minimum profile density for hot functions with top 99.00% total samples: 31.4 + +; original code: +; clang -O3 -g -fno-optimize-sibling-calls -fdebug-info-for-profiling qsort.c -o a.out +#include +#include + +void swap(int *a, int *b) { + int t = *a; + *a = *b; + *b = t; +} + +int partition_pivot_last(int* array, int low, int high) { + int pivot = array[high]; + int i = low - 1; + for (int j = low; j < high; j++) + if (array[j] < pivot) + swap(&array[++i], &array[j]); + swap(&array[i + 1], &array[high]); + return (i + 1); +} + +int partition_pivot_first(int* array, int low, int high) { + int pivot = array[low]; + int i = low + 1; + for (int j = low + 1; j <= high; j++) + if (array[j] < pivot) { if (j != i) swap(&array[i], &array[j]); i++;} + swap(&array[i - 1], &array[low]); + return i - 1; +} + +void quick_sort(int* array, int low, int high, int (*partition_func)(int *, int, int)) { + if (low < high) { + int pi = 
(*partition_func)(array, low, high); + quick_sort(array, low, pi - 1, partition_func); + quick_sort(array, pi + 1, high, partition_func); + } +} + +int main() { + const int size = 200; + int sum = 0; + int *array = malloc(size * sizeof(int)); + for(int i = 0; i < 100 * 1000; i++) { + for(int j = 0; j < size; j++) + array[j] = j % 10 ? rand() % size: j; + int (*fptr)(int *, int, int) = i % 3 ? partition_pivot_last : partition_pivot_first; + quick_sort(array, 0, size - 1, fptr); + sum += array[i % size]; + } + printf("sum=%d\n", sum); + + return 0; +} diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp index d0c074355629..82c36955cf65 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -9,6 +9,7 @@ #include "ProfileGenerator.h" #include "ProfiledBinary.h" #include "llvm/ProfileData/ProfileCommon.h" +#include #include cl::opt OutputFilename("output", cl::value_desc("output"), @@ -70,7 +71,16 @@ static cl::opt CSProfMaxContextDepth( "depth limit."), cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth)); -extern cl::opt ProfileSummaryCutoffCold; +static cl::opt HotFunctionDensityThreshold( + "hot-function-density-threshold", llvm::cl::init(1000), + llvm::cl::desc( + "specify density threshold for hot functions (default: 1000)"), + llvm::cl::Optional); +static cl::opt ShowDensity("show-density", llvm::cl::init(false), + llvm::cl::desc("show profile density details"), + llvm::cl::Optional); + +extern cl::opt ProfileSummaryCutoffHot; using namespace llvm; using namespace sampleprof; @@ -127,6 +137,51 @@ void ProfileGeneratorBase::write() { write(std::move(WriterOrErr.get()), ProfileMap); } +void ProfileGeneratorBase::showDensitySuggestion(double Density) { + if (Density == 0.0) + WithColor::warning() << "The --profile-summary-cutoff-hot option may be " + "set too low. 
Please check your command.\n"; + else if (Density < HotFunctionDensityThreshold) + WithColor::warning() + << "AutoFDO is estimated to optimize better with " + << format("%.1f", HotFunctionDensityThreshold / Density) + << "x more samples. Please consider increasing sampling rate or " + "profiling for longer duration to get more samples.\n"; + + if (ShowDensity) + outs() << "Minimum profile density for hot functions with top " + << format("%.2f", + static_cast(ProfileSummaryCutoffHot.getValue()) / + 10000) + << "% total samples: " << format("%.1f", Density) << "\n"; +} + +double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles, + uint64_t HotCntThreshold) { + double Density = DBL_MAX; + std::vector HotFuncs; + for (auto &I : Profiles) { + auto &FuncSamples = I.second; + if (FuncSamples.getTotalSamples() < HotCntThreshold) + continue; + HotFuncs.emplace_back(&FuncSamples); + } + + for (auto *FuncSamples : HotFuncs) { + auto *Func = Binary->getBinaryFunction(FuncSamples->getName()); + if (!Func) + continue; + uint64_t FuncSize = Func->getFuncSize(); + if (FuncSize == 0) + continue; + Density = + std::min(Density, static_cast(FuncSamples->getTotalSamples()) / + FuncSize); + } + + return Density == DBL_MAX ? 
0.0 : Density; +} + void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges, const RangeSample &Ranges) { @@ -311,6 +366,12 @@ void ProfileGenerator::generateProfile() { } else { generateLineNumBasedProfile(); } + postProcessProfiles(); +} + +void ProfileGenerator::postProcessProfiles() { + computeSummaryAndThreshold(); + calculateAndShowDensity(ProfileMap); } void ProfileGenerator::generateLineNumBasedProfile() { @@ -440,6 +501,12 @@ void ProfileGenerator::populateBoundarySamplesForAllFunctions( } } +void ProfileGeneratorBase::calculateAndShowDensity( + const SampleProfileMap &Profiles) { + double Density = calculateDensity(Profiles, HotCountThreshold); + showDensitySuggestion(Density); +} + FunctionSamples &CSProfileGenerator::getFunctionProfileForContext( const SampleContextFrameVector &Context, bool WasLeafInlined) { auto I = ProfileMap.find(SampleContext(Context)); @@ -664,9 +731,17 @@ void CSProfileGenerator::postProcessProfiles() { HotCountThreshold, CSProfTrimColdContext, CSProfMergeColdContext, CSProfMaxColdContextDepth, EnableCSPreInliner); } + + // Merge function samples of CS profile to calculate profile density. 
+ sampleprof::SampleProfileMap ContextLessProfiles; + for (const auto &I : ProfileMap) { + ContextLessProfiles[I.second.getName()].merge(I.second); + } + + calculateAndShowDensity(ContextLessProfiles); } -void CSProfileGenerator::computeSummaryAndThreshold() { +void ProfileGeneratorBase::computeSummaryAndThreshold() { SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); auto Summary = Builder.computeSummaryForProfiles(ProfileMap); HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold( diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h index a0609b0a33af..979c2091714c 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.h +++ b/llvm/tools/llvm-profgen/ProfileGenerator.h @@ -75,7 +75,23 @@ protected: const SampleContextFrame &LeafLoc, uint64_t Count); void updateTotalSamples(); + StringRef getCalleeNameForOffset(uint64_t TargetOffset); + + void computeSummaryAndThreshold(); + + void calculateAndShowDensity(const SampleProfileMap &Profiles); + + double calculateDensity(const SampleProfileMap &Profiles, + uint64_t HotCntThreshold); + + void showDensitySuggestion(double Density); + + // Thresholds from profile summary to answer isHotCount/isColdCount queries. + uint64_t HotCountThreshold; + + uint64_t ColdCountThreshold; + // Used by SampleProfileWriter SampleProfileMap ProfileMap; @@ -104,6 +120,7 @@ private: void populateBodySamplesForAllFunctions(const RangeSample &RangeCounter); void populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters); + void postProcessProfiles(); }; using ProbeCounterMap = @@ -245,8 +262,6 @@ private: // and trimming cold profiles, running preinliner on profiles. 
void postProcessProfiles(); - void computeSummaryAndThreshold(); - void populateBodySamplesForFunction(FunctionSamples &FunctionProfile, const RangeSample &RangeCounters); void populateBoundarySamplesForFunction(SampleContextFrames ContextId, @@ -269,9 +284,6 @@ private: FunctionSamples & getFunctionProfileForLeafProbe(SampleContextFrames ContextStack, const MCDecodedPseudoProbe *LeafProbe); - // Thresholds from profile summary to answer isHotCount/isColdCount queries. - uint64_t HotCountThreshold; - uint64_t ColdCountThreshold; // Underlying context table serves for sample profile writer. std::unordered_set Contexts; diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h index 4249b78a9159..003477b89005 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -76,6 +76,14 @@ struct BinaryFunction { StringRef FuncName; // End of range is an exclusive bound. RangesTy Ranges; + + uint64_t getFuncSize() { + uint64_t Sum = 0; + for (auto &R : Ranges) { + Sum += R.second - R.first; + } + return Sum; + } }; // Info about function range. A function can be split into multiple @@ -406,6 +414,13 @@ public: return BinaryFunctions; } + BinaryFunction *getBinaryFunction(StringRef FName) { + auto I = BinaryFunctions.find(FName.str()); + if (I == BinaryFunctions.end()) + return nullptr; + return &I->second; + } + uint32_t getFuncSizeForContext(SampleContext &Context) { return FuncSizeTracker.getFuncSizeForContext(Context); }