Avoid doing binary search.

MergeInputSection::getOffset is the busiest function in LLD if string
merging is enabled and input files have lots of mergeable sections.
This is usually the case when creating an executable with debug info,
so it is pretty common.

It is slow because it has to do fairly complex
computations. For non-mergeable sections, section contents are
contiguous in output, so in order to compute an output offset,
we only have to add the output section's base address to an input
offset. But for mergeable strings, section contents are split for
merging, so they are not contiguous. We've got to do some lookups.

We used to do binary search on the list of section pieces.
It is slow because I think it's hostile to branch prediction.

This patch replaces it with hash table lookup. Seems it's working
pretty well. Below is "perf stat -r10" output when linking clang
with debug info. In this case this patch speeds up about 4%.

Before:

       6584.153205 task-clock (msec)         #    1.001 CPUs utilized            ( +-  0.09% )
               238 context-switches          #    0.036 K/sec                    ( +-  6.59% )
                 0 cpu-migrations            #    0.000 K/sec                    ( +- 50.92% )
         1,067,675 page-faults               #    0.162 M/sec                    ( +-  0.15% )
    18,369,931,470 cycles                    #    2.790 GHz                      ( +-  0.09% )
     9,640,680,143 stalled-cycles-frontend   #   52.48% frontend cycles idle     ( +-  0.18% )
   <not supported> stalled-cycles-backend
    21,206,747,787 instructions              #    1.15  insns per cycle
                                             #    0.45  stalled cycles per insn  ( +-  0.04% )
     3,817,398,032 branches                  #  579.786 M/sec                    ( +-  0.04% )
       132,787,249 branch-misses             #    3.48% of all branches          ( +-  0.02% )

       6.579106511 seconds time elapsed                                          ( +-  0.09% )

After:

       6312.317533 task-clock (msec)         #    1.001 CPUs utilized            ( +-  0.19% )
               221 context-switches          #    0.035 K/sec                    ( +-  4.11% )
                 1 cpu-migrations            #    0.000 K/sec                    ( +- 45.21% )
         1,280,775 page-faults               #    0.203 M/sec                    ( +-  0.37% )
    17,611,539,150 cycles                    #    2.790 GHz                      ( +-  0.19% )
    10,285,148,569 stalled-cycles-frontend   #   58.40% frontend cycles idle     ( +-  0.30% )
   <not supported> stalled-cycles-backend
    18,794,779,900 instructions              #    1.07  insns per cycle
                                             #    0.55  stalled cycles per insn  ( +-  0.03% )
     3,287,450,865 branches                  #  520.799 M/sec                    ( +-  0.03% )
        72,259,605 branch-misses             #    2.20% of all branches          ( +-  0.01% )

       6.307411828 seconds time elapsed                                          ( +-  0.19% )

Differential Revision: http://reviews.llvm.org/D20645

llvm-svn: 270999
This commit is contained in:
Rui Ueyama 2016-05-27 14:39:13 +00:00
parent 07c8654284
commit 406b469de4
5 changed files with 46 additions and 11 deletions

View File

@ -513,6 +513,7 @@ bool MergeInputSection<ELFT>::classof(const InputSectionBase<ELFT> *S) {
return S->SectionKind == InputSectionBase<ELFT>::Merge;
}
// Do binary search to get a section piece at a given input offset.
template <class ELFT>
SectionPiece *SplitInputSection<ELFT>::getSectionPiece(uintX_t Offset) {
ArrayRef<uint8_t> D = this->getSectionData();
@ -529,23 +530,40 @@ SectionPiece *SplitInputSection<ELFT>::getSectionPiece(uintX_t Offset) {
return &*I;
}
// Returns the offset in an output section for a given input offset.
// Because the contents of a mergeable section are not contiguous in output,
// it is not just an addition to a base output offset.
template <class ELFT>
typename ELFT::uint MergeInputSection<ELFT>::getOffset(uintX_t Offset) {
auto It = OffsetMap.find(Offset);
if (It != OffsetMap.end())
return It->second;
// If Offset is not at beginning of a section piece, it is not in the map.
// In that case we need to search from the original section piece vector.
SectionPiece &Piece = *this->getSectionPiece(Offset);
assert(Piece.Live);
// Compute the Addend and if the Base is cached, return.
uintX_t Addend = Offset - Piece.InputOff;
if (Piece.OutputOff != size_t(-1))
return Piece.OutputOff + Addend;
uintX_t Ret = Piece.OutputOff + Addend;
return Ret;
}
// Map the base to the offset in the output section and cache it.
ArrayRef<uint8_t> D = this->getSectionData();
StringRef Data((const char *)D.data(), D.size());
StringRef Entry = Data.substr(Piece.InputOff, Piece.size());
auto *MOS = static_cast<MergeOutputSection<ELFT> *>(this->OutSec);
Piece.OutputOff = MOS->getOffset(Entry);
return Piece.OutputOff + Addend;
// Builds OffsetMap, which maps the input offset of every live section piece
// to its offset in the output section. It is called after finalize().
template <class ELFT> void MergeInputSection<ELFT>::finalizePieces() {
  // Size the table up front from the number of pieces in this section.
  OffsetMap.grow(this->Pieces.size());

  for (SectionPiece &P : this->Pieces) {
    // Dead pieces get no entry in the map.
    if (P.Live) {
      // Offsets of tail-merged strings are computed lazily, so a piece may
      // still carry the "unset" marker here; resolve it through the output
      // section before recording it.
      if (P.OutputOff == size_t(-1)) {
        ArrayRef<uint8_t> Bytes = P.data();
        StringRef Str((const char *)Bytes.data(), Bytes.size());
        auto *MOS = static_cast<MergeOutputSection<ELFT> *>(this->OutSec);
        P.OutputOff = MOS->getOffset(Str);
      }
      OffsetMap[P.InputOff] = P.OutputOff;
    }
  }
}
template <class ELFT>

View File

@ -145,7 +145,10 @@ public:
// in the output section.
uintX_t getOffset(uintX_t Offset);
void finalizePieces();
private:
llvm::DenseMap<uintX_t, uintX_t> OffsetMap;
llvm::DenseSet<uintX_t> LiveOffsets;
};

View File

@ -1164,6 +1164,7 @@ void MergeOutputSection<ELFT>::addSection(InputSectionBase<ELFT> *C) {
Sec->OutSec = this;
this->updateAlign(Sec->Align);
this->Header.sh_entsize = Sec->getSectionHdr()->sh_entsize;
Sections.push_back(Sec);
bool IsString = this->Header.sh_flags & SHF_STRINGS;
@ -1191,6 +1192,11 @@ template <class ELFT> void MergeOutputSection<ELFT>::finalize() {
this->Header.sh_size = Builder.getSize();
}
// Asks every input section registered through addSection() to finalize its
// pieces.
template <class ELFT> void MergeOutputSection<ELFT>::finalizePieces() {
  for (size_t I = 0, E = Sections.size(); I != E; ++I)
    Sections[I]->finalizePieces();
}
template <class ELFT>
StringTableSection<ELFT>::StringTableSection(StringRef Name, bool Dynamic)
: OutputSectionBase<ELFT>(Name, SHT_STRTAB,

View File

@ -93,6 +93,7 @@ public:
bool PageAlign = false;
virtual void finalize() {}
virtual void finalizePieces() {}
virtual void
forEachInputSection(std::function<void(InputSectionBase<ELFT> *)> F) {}
virtual void writeTo(uint8_t *Buf) {}
@ -320,10 +321,12 @@ public:
void writeTo(uint8_t *Buf) override;
unsigned getOffset(StringRef Val);
void finalize() override;
void finalizePieces() override;
bool shouldTailMerge() const;
private:
llvm::StringTableBuilder Builder;
std::vector<MergeInputSection<ELFT> *> Sections;
};
struct CieRecord {

View File

@ -873,6 +873,11 @@ template <class ELFT> void Writer<ELFT>::createSections() {
if (isOutputDynamic())
Out<ELFT>::Dynamic->finalize();
// Now that all output offsets are fixed, finalize mergeable sections
// to fix their maps from input offsets to output offsets.
for (OutputSectionBase<ELFT> *Sec : OutputSections)
Sec->finalizePieces();
}
template <class ELFT> bool Writer<ELFT>::needsGot() {