COFF: Implement string tail merging.

In COFF, duplicate string literals are merged by placing them in a
comdat whose leader symbol name contains a specific prefix followed
by the hash and partial contents of the string literal. This gives
us an easy way to identify sections containing string literals in
the linker: check for leader symbol names with the given prefix.

Any sections that are identified in this way as containing string
literals may be tail merged. We do so using the StringTableBuilder
class, which is also used to tail merge string literals in the ELF
linker. Tail merging is enabled only if ICF is enabled, as this
provides a signal as to whether the user cares about binary size.

Differential Revision: https://reviews.llvm.org/D44504

llvm-svn: 327668
This commit is contained in:
Peter Collingbourne 2018-03-15 21:14:02 +00:00
parent 435b099115
commit f1a11f87a0
7 changed files with 184 additions and 5 deletions

View File

@ -571,5 +571,47 @@ uint8_t Baserel::getDefaultType() {
}
}
// Registry of every MergeChunk created so far, keyed by section alignment.
// Entries are created on demand by MergeChunk::addSection().
std::map<uint32_t, MergeChunk *> MergeChunk::Instances;
// Constructs a merge chunk for the given alignment. The same alignment value
// is passed to the StringTableBuilder (created in RAW mode) and recorded as
// this chunk's own Alignment.
MergeChunk::MergeChunk(uint32_t Alignment)
: Builder(StringTableBuilder::RAW, Alignment) {
// The parameter shadows the member, hence the explicit this->.
this->Alignment = Alignment;
}
// Registers C with the MergeChunk that handles C's alignment, creating that
// MergeChunk the first time the alignment is seen.
void MergeChunk::addSection(SectionChunk *C) {
  // operator[] default-inserts a null pointer for an unseen alignment, so a
  // null slot means "not created yet".
  MergeChunk *&Slot = Instances[C->Alignment];
  if (Slot == nullptr)
    Slot = make<MergeChunk>(C->Alignment);
  Slot->Sections.push_back(C);
}
void MergeChunk::finalizeContents() {
for (SectionChunk *C : Sections)
if (C->isLive())
Builder.add(toStringRef(C->getContents()));
Builder.finalize();
for (SectionChunk *C : Sections) {
if (!C->isLive())
continue;
size_t Off = Builder.getOffset(toStringRef(C->getContents()));
C->setOutputSection(Out);
C->setRVA(RVA + Off);
C->OutputSectionOff = OutputSectionOff + Off;
}
}
// Section characteristics for the merged strings: initialized data that is
// readable (no write or execute bits are requested).
uint32_t MergeChunk::getPermissions() const {
  return IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ;
}
// Returns the size reported by the string table builder, i.e. the size of the
// merged string data this chunk contributes.
size_t MergeChunk::getSize() const {
return Builder.getSize();
}
// Emits the merged string table into the output buffer, starting at this
// chunk's offset (OutputSectionOff) from Buf.
void MergeChunk::writeTo(uint8_t *Buf) const {
  uint8_t *Dest = Buf + OutputSectionOff;
  Builder.write(Dest);
}
} // namespace coff
} // namespace lld

View File

@ -16,6 +16,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Object/COFF.h"
#include <utility>
#include <vector>
@ -60,6 +61,10 @@ public:
// before calling this function.
virtual void writeTo(uint8_t *Buf) const {}
// Called by the writer after an RVA is assigned, but before calling
// getSize().
virtual void finalizeContents() {}
// The writer sets and uses the addresses.
uint64_t getRVA() const { return RVA; }
void setRVA(uint64_t V) { RVA = V; }
@ -222,6 +227,33 @@ private:
uint32_t Class[2] = {0, 0};
};
// This class is used to implement an lld-specific feature (not implemented in
// MSVC) that minimizes the output size by finding string literals sharing tail
// parts and merging them.
//
// If string tail merging is enabled and a section is identified as containing a
// string literal, it is added to a MergeChunk with an appropriate alignment.
// The MergeChunk then tail merges the strings using the StringTableBuilder
// class and assigns RVAs and section offsets to each of the member chunks based
// on the offsets assigned by the StringTableBuilder.
class MergeChunk : public Chunk {
public:
  // Creates a merge chunk for sections with the given alignment. Explicit so
  // that a bare integer is never silently converted into a MergeChunk.
  explicit MergeChunk(uint32_t Alignment);

  // Adds C to the MergeChunk responsible for C's alignment, creating that
  // MergeChunk if it does not exist yet.
  static void addSection(SectionChunk *C);

  // Runs the tail merge and assigns an output section, RVA and section offset
  // to each live member section.
  void finalizeContents() override;

  uint32_t getPermissions() const override;
  StringRef getSectionName() const override { return ".rdata"; }
  size_t getSize() const override;
  void writeTo(uint8_t *Buf) const override;

  // All MergeChunks created so far, keyed by alignment.
  static std::map<uint32_t, MergeChunk *> Instances;

  // Member sections, in the order they were added.
  std::vector<SectionChunk *> Sections;

private:
  // Performs the tail merge and holds the resulting string table.
  llvm::StringTableBuilder Builder;
};
// A chunk for common symbols. Common chunks don't have actual data.
class CommonChunk : public Chunk {
public:

View File

@ -224,6 +224,12 @@ void ICF::run(ArrayRef<Chunk *> Vec) {
}
}
// Make sure that ICF doesn't merge sections that are being handled by string
// tail merging.
for (auto &P : MergeChunk::Instances)
for (SectionChunk *SC : P.second->Sections)
SC->Class[0] = NextId++;
// Initially, we use hash values to partition sections.
for_each(parallel::par, Chunks.begin(), Chunks.end(), [&](SectionChunk *SC) {
// Set MSB to 1 to avoid collisions with non-hash classes.

View File

@ -138,12 +138,13 @@ void ObjFile::initializeChunks() {
if (Sec->Characteristics & IMAGE_SCN_LNK_COMDAT)
SparseChunks[I] = PendingComdat;
else
SparseChunks[I] = readSection(I, nullptr);
SparseChunks[I] = readSection(I, nullptr, "");
}
}
SectionChunk *ObjFile::readSection(uint32_t SectionNumber,
const coff_aux_section_definition *Def) {
const coff_aux_section_definition *Def,
StringRef LeaderName) {
const coff_section *Sec;
StringRef Name;
if (auto EC = COFFObj->getSection(SectionNumber, Sec))
@ -189,6 +190,12 @@ SectionChunk *ObjFile::readSection(uint32_t SectionNumber,
GuardLJmpChunks.push_back(C);
else if (Name == ".sxdata")
SXDataChunks.push_back(C);
else if (Config->DoICF && Sec->NumberOfRelocations == 0 && Name == ".rdata" &&
LeaderName.startswith("??_C@"))
// COFF sections that look like string literal sections (i.e. no
// relocations, in .rdata, leader symbol name matches the MSVC name mangling
// for string literals) are subject to string tail merging.
MergeChunk::addSection(C);
else
Chunks.push_back(C);
@ -209,7 +216,7 @@ void ObjFile::readAssociativeDefinition(
// the section; otherwise mark it as discarded.
int32_t SectionNumber = Sym.getSectionNumber();
if (Parent) {
SparseChunks[SectionNumber] = readSection(SectionNumber, Def);
SparseChunks[SectionNumber] = readSection(SectionNumber, Def, "");
if (SparseChunks[SectionNumber])
Parent->addAssociative(SparseChunks[SectionNumber]);
} else {
@ -343,7 +350,7 @@ Optional<Symbol *> ObjFile::createDefined(
Prevailing = true;
}
if (Prevailing) {
SectionChunk *C = readSection(SectionNumber, Def);
SectionChunk *C = readSection(SectionNumber, Def, Name);
SparseChunks[SectionNumber] = C;
C->Sym = cast<DefinedRegular>(Leader);
cast<DefinedRegular>(Leader)->Data = &C->Repl;

View File

@ -150,7 +150,8 @@ private:
SectionChunk *
readSection(uint32_t SectionNumber,
const llvm::object::coff_aux_section_definition *Def);
const llvm::object::coff_aux_section_definition *Def,
StringRef LeaderName);
void readAssociativeDefinition(
COFFSymbolRef COFFSym,

View File

@ -426,6 +426,9 @@ void Writer::createSections() {
void Writer::createMiscChunks() {
OutputSection *RData = createSection(".rdata");
for (auto &P : MergeChunk::Instances)
RData->addChunk(P.second);
// Create thunks for locally-dllimported symbols.
if (!Symtab->LocalImportChunks.empty()) {
for (Chunk *C : Symtab->LocalImportChunks)
@ -665,6 +668,7 @@ void Writer::assignAddresses() {
VirtualSize = alignTo(VirtualSize, C->Alignment);
C->setRVA(RVA + VirtualSize);
C->OutputSectionOff = VirtualSize;
C->finalizeContents();
VirtualSize += C->getSize();
if (C->hasData())
RawSize = alignTo(VirtualSize, SectorSize);

View File

@ -0,0 +1,87 @@
# REQUIRES: x86
# RUN: llvm-mc -triple=x86_64-windows-msvc -filetype=obj -o %t.obj %s
# RUN: lld-link %t.obj /out:%t.exe /entry:main /subsystem:console
# RUN: llvm-objdump -s %t.exe | FileCheck %s
# Exercises string tail merging. The object is assembled and linked, and the
# raw contents of .rdata and .text are compared with the expected bytes. The
# .text section consists of 8-byte pointers to each string symbol, so the
# expected pointer values show where every literal ended up in the image.
# CHECK: Contents of section .rdata:
# CHECK-NEXT: 140002000 68656c6c 6f20776f 726c6400 6fa26ca4 hello world.o.l.
# CHECK-NEXT: 140002010 0068656c 6c6f2077 6f726c64 00006865 .hello world..he
# CHECK-NEXT: 140002020 6c6c6f20 776f726c 64006800 65006c00 llo world.h.e.l.
# CHECK-NEXT: 140002030 6c006f00 20007700 6f007200 6c006400 l.o. .w.o.r.l.d.
# CHECK-NEXT: 140002040 0000 ..
# CHECK: Contents of section .text:
.globl main
main:
# Narrow literals: "world" is expected at 0x140002017, which lies inside the
# copy of "hello world" expected at 0x140002011.
# CHECK-NEXT: 140003000 11200040 01000000 17200040 01000000
.8byte "??_C@_0M@LACCCNMM@hello?5world?$AA@"
.8byte "??_C@_05MCBCHHEJ@world?$AA@"
# Wide literals: the shorter one is expected 12 bytes into the longer one.
# CHECK-NEXT: 140003010 2a200040 01000000 36200040 01000000
.8byte "??_C@_1BI@HHJHKLLN@?$AAh?$AAe?$AAl?$AAl?$AAo?$AA?5?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@"
.8byte "??_C@_1M@NBBDDHIO@?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@"
# Sections that must not be treated as mergeable string literals keep their
# own storage at the start of .rdata.
# CHECK-NEXT: 140003020 00200040 01000000 0c200040 01000000
.8byte "??_D@not_a_string_literal"
.8byte "??_C@string_literal_with_relocs"
# The .data literal is expected outside .rdata (0x140001000); the 2-byte
# aligned narrow literal is expected at its own address, 0x14000201e.
# CHECK-NEXT: 140003030 00100040 01000000 1e200040 01000000
.8byte "??_C@string_literal_in_wrong_section"
.8byte "??_C@overaligned_string_literal"
# Mergeable: .rdata comdat, no relocations, leader name starts with ??_C@.
.section .rdata,"dr",discard,"??_C@_0M@LACCCNMM@hello?5world?$AA@"
.globl "??_C@_0M@LACCCNMM@hello?5world?$AA@"
"??_C@_0M@LACCCNMM@hello?5world?$AA@":
.asciz "hello world"
# Mergeable: its bytes are a suffix of the "hello world" literal above.
.section .rdata,"dr",discard,"??_C@_05MCBCHHEJ@world?$AA@"
.globl "??_C@_05MCBCHHEJ@world?$AA@"
"??_C@_05MCBCHHEJ@world?$AA@":
.asciz "world"
# Mergeable, 2-byte aligned: 16-bit characters spelling "hello world" plus a
# 16-bit terminator.
.section .rdata,"dr",discard,"??_C@_1BI@HHJHKLLN@?$AAh?$AAe?$AAl?$AAl?$AAo?$AA?5?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@"
.globl "??_C@_1BI@HHJHKLLN@?$AAh?$AAe?$AAl?$AAl?$AAo?$AA?5?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@"
.p2align 1
"??_C@_1BI@HHJHKLLN@?$AAh?$AAe?$AAl?$AAl?$AAo?$AA?5?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@":
.short 104
.short 101
.short 108
.short 108
.short 111
.short 32
.short 119
.short 111
.short 114
.short 108
.short 100
.short 0
# Mergeable, 2-byte aligned: 16-bit characters spelling "world", a suffix of
# the wide literal above.
.section .rdata,"dr",discard,"??_C@_1M@NBBDDHIO@?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@"
.globl "??_C@_1M@NBBDDHIO@?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@"
.p2align 1
"??_C@_1M@NBBDDHIO@?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@":
.short 119
.short 111
.short 114
.short 108
.short 100
.short 0
# Not mergeable: the leader name matches, but the section is .data rather than
# .rdata.
.section .data,"drw",discard,"??_C@string_literal_in_wrong_section"
.globl "??_C@string_literal_in_wrong_section"
"??_C@string_literal_in_wrong_section":
.asciz "hello world"
# Not mergeable: the leader name starts with ??_D@ instead of ??_C@.
.section .rdata,"dr",discard,"??_D@not_a_string_literal"
.globl "??_D@not_a_string_literal"
"??_D@not_a_string_literal":
.asciz "hello world"
# Not mergeable: the section carries a relocation against main.
.section .rdata,"dr",discard,"??_C@string_literal_with_relocs"
.globl "??_C@string_literal_with_relocs"
"??_C@string_literal_with_relocs":
.4byte main + 111 + (114 << 8) + (108 << 16) + (100 << 24) # main + "orld"
.byte 0
# Narrow literal with 2-byte alignment; the expected output keeps a copy of it
# separate from the byte-aligned "hello world" above.
.section .rdata,"dr",discard,"??_C@overaligned_string_literal"
.globl "??_C@overaligned_string_literal"
.p2align 1
"??_C@overaligned_string_literal":
.asciz "hello world"