[Samples] add array-partition into ablation study; [ArrayPartition] support AffineStoreOp; bug fixes in EmitHLSCpp and ConvertToHLSCpp

This commit is contained in:
Hanchen Ye 2020-12-08 12:35:44 -06:00
parent 6a504e576a
commit cbadc8f831
6 changed files with 228 additions and 141 deletions

View File

@ -46,12 +46,12 @@ $ scalehls-opt -qor-estimation test/Analysis/QoREstimation/test_for.mlir
```
### 4. Ablation study
If Vivado HLS (2019.1 tested) is installed on your machine, running the following script will report the HLS results for some benchmarks (around 2 hours on AMD Ryzen7 3800X for all 16 tests).
If Vivado HLS (2019.1 tested) is installed on your machine, running the following script will report the HLS results for some benchmarks (around 8 hours on AMD Ryzen7 3800X for all 33 tests).
For the `ablation_test_run.sh` script, `-n` determines the number of tests to be processed, the maximum supported value of which is 16; `-c` determines whether to run Vivado HLS C synthesis; `-r` determines whether to run report generation. The generated C++ source code will be written to `sample/cpp_src`; the Vivado HLS project will be established in `sample/hls_proj`; the generated report will be written to `sample/test_results`.
For the `ablation_test_run.sh` script, `-n` determines the number of tests to be processed, the maximum supported value of which is 33; `-c` determines the index of the first test for which the C++ synthesis and report collection are rerun. The generated C++ source code will be written to `sample/cpp_src`; the Vivado HLS project will be established in `sample/hls_proj`; the collected report will be written to `sample/test_results`; the test summary will be written to `sample`.
```sh
$ cd $SCALEHLS_DIR/sample
$ ./ablation_test_run.sh -n 16 -c true -r true
$ ./ablation_test_run.sh -n 33 -c 0
```
## References

View File

@ -80,9 +80,14 @@ void ConvertToHLSCpp::runOnOperation() {
// Set array pragma attributes; the default array instance is a ram_1p
// bram. Other attributes are not set here since they require more
// analysis to be determined.
arrayOp.setAttr("interface", builder.getBoolAttr(false));
arrayOp.setAttr("storage", builder.getBoolAttr(false));
arrayOp.setAttr("partition", builder.getBoolAttr(false));
if (!arrayOp.getAttr("interface"))
arrayOp.setAttr("interface", builder.getBoolAttr(false));
if (!arrayOp.getAttr("storage"))
arrayOp.setAttr("storage", builder.getBoolAttr(false));
if (!arrayOp.getAttr("partition"))
arrayOp.setAttr("partition", builder.getBoolAttr(false));
}
}
}
@ -92,9 +97,14 @@ void ConvertToHLSCpp::runOnOperation() {
forOp.emitError("has zero or more than one basic blocks");
// Set loop pragma attributes.
forOp.setAttr("pipeline", builder.getBoolAttr(false));
forOp.setAttr("unroll", builder.getBoolAttr(false));
forOp.setAttr("flatten", builder.getBoolAttr(false));
if (!forOp.getAttr("pipeline"))
forOp.setAttr("pipeline", builder.getBoolAttr(false));
if (!forOp.getAttr("unroll"))
forOp.setAttr("unroll", builder.getBoolAttr(false));
if (!forOp.getAttr("flatten"))
forOp.setAttr("flatten", builder.getBoolAttr(false));
}
});
}

View File

@ -284,7 +284,7 @@ bool HLSKernelVisitor::visitOp(MaxPoolOp op) {
auto dataType = O.getType().cast<MemRefType>().getElementType();
auto zeroConst = builder.create<mlir::ConstantOp>(
op.getLoc(), builder.getZeroAttr(dataType));
createStore(zeroConst, O, {h, c, h, w});
createStore(zeroConst, O, {n, c, h, w});
// Create kernel height, and kernel width loop.
auto r = createLoop(0, kernelShape[0]);
@ -308,7 +308,7 @@ bool HLSKernelVisitor::visitOp(MaxPoolOp op) {
// Carry out selection and store the greater value.
auto newGreatest = builder.create<mlir::SelectOp>(op.getLoc(), greaterThanTmp,
fmap, tmpGreatest);
createStore(newGreatest, O, {h, c, h, w});
createStore(newGreatest, O, {n, c, h, w});
return true;
}

View File

@ -1195,7 +1195,7 @@ void ModuleEmitter::emitAssign(AssignOp *op) {
}
void ModuleEmitter::emitArray(ArrayOp *op) {
// addAlias(op->getOperand(), op->getResult());
addAlias(op->getOperand(), op->getResult());
if (op->interface()) {

View File

@ -20,13 +20,107 @@ struct ArrayPartition : public ArrayPartitionBase<ArrayPartition> {
};
} // namespace
/// Returns the last loop visited while walking `root` whose boolean
/// "pipeline" attribute is present and true.
///
/// This is the same loop the previous implementation obtained by collecting
/// every match and taking the final one. NOTE(review): MLIR's default walk is
/// post-order, so for a single loop nest this should be the outermost
/// pipelined loop (the caller names the result `outermost`) - confirm.
///
/// If no loop under `root` is marked for pipelining, a null AffineForOp is
/// returned instead of reading the back of an empty vector, which was
/// undefined behaviour. Callers should test the result before using it.
static mlir::AffineForOp getPipelineLoop(mlir::AffineForOp root) {
  // Starts out null and stays null when nothing is marked for pipelining.
  mlir::AffineForOp lastPipelined;
  root.walk([&](mlir::AffineForOp loop) {
    auto attr = loop.getAttrOfType<BoolAttr>("pipeline");
    if (attr && attr.getValue())
      lastPipelined = loop;
  });
  return lastPipelined;
}
/// Annotates every array recorded in `accessDict` with partition attributes
/// derived from its accesses of type `OpType`.
///
/// `OpType` must provide `getAffineMap()`; each entry of `accessDict` maps an
/// ArrayOp to the list of access operations that touch it. For each dimension
/// of the array, the unique index expressions of all accesses are gathered and
/// the largest constant distance between any two of them is computed. The
/// dimension is then tagged as:
///   - "none"   (factor 1)               when no two indices differ by a
///                                       non-zero constant;
///   - "cyclic" (factor maxDistance + 1) when the number of unique indices is
///                                       at least the covered span;
///   - "block"  (factor #unique indices) otherwise.
/// The results are stored in the "partition", "partition_type" and
/// "partition_factor" attributes of the ArrayOp, replacing existing values.
///
/// NOTE(review): index expressions of different access operations are compared
/// directly, which presumably assumes their affine maps share the same
/// operands - confirm against the callers.
template <typename OpType>
static void applyArrayPartition(MemAccessDict &accessDict, OpBuilder &builder) {
  // Bind by reference so the per-array access list is not copied.
  for (auto &entry : accessDict) {
    auto arrayOp = cast<ArrayOp>(entry.first);
    auto arrayType = arrayOp.getType().cast<MemRefType>();
    auto &arrayAccesses = entry.second;

    // Walk through each dimension of the targeted array.
    SmallVector<Attribute, 4> partitionFactor;
    SmallVector<StringRef, 4> partitionType;
    for (size_t dim = 0, e = arrayType.getShape().size(); dim < e; ++dim) {
      // Collect the unique access indices of the current dimension.
      SmallVector<AffineExpr, 4> indices;
      for (auto accessOp : arrayAccesses) {
        auto concreteOp = cast<OpType>(accessOp);
        auto index = concreteOp.getAffineMap().getResult(dim);
        if (std::find(indices.begin(), indices.end(), index) == indices.end())
          indices.push_back(index);
      }
      auto accessNum = indices.size();

      // Find the max constant distance between any pair of unique indices.
      unsigned maxDistance = 0;
      for (unsigned i = 0; i < accessNum; ++i) {
        for (unsigned j = i + 1; j < accessNum; ++j) {
          // TODO: this expression can't always be simplified. Pairs whose
          // distance is not a constant are currently ignored rather than
          // treated as a partition failure.
          auto expr = indices[j] - indices[i];
          if (auto constDistance = expr.dyn_cast<AffineConstantExpr>()) {
            // Take the magnitude in 64 bits to avoid binding to abs(int).
            int64_t delta = constDistance.getValue();
            unsigned distance =
                static_cast<unsigned>(delta < 0 ? -delta : delta);
            maxDistance = std::max(maxDistance, distance);
          }
        }
      }

      // Determine the array partition strategy; the span covered by the
      // accesses is the max distance plus one.
      maxDistance += 1;
      if (maxDistance == 1) {
        // No two accesses are a non-zero constant distance apart, so this
        // dimension should not be partitioned.
        partitionType.push_back("none");
        partitionFactor.push_back(builder.getUI32IntegerAttr(1));
      } else if (accessNum >= maxDistance) {
        // Successive elements are accessed, each at least once. In most
        // cases "cyclic" partition should be the best solution.
        partitionType.push_back("cyclic");
        partitionFactor.push_back(builder.getUI32IntegerAttr(maxDistance));
      } else {
        // Discrete elements are accessed. Typically "block" partition is
        // the most beneficial in this case.
        partitionType.push_back("block");
        partitionFactor.push_back(builder.getUI32IntegerAttr(accessNum));
      }
    }

    arrayOp.setAttr("partition", builder.getBoolAttr(true));
    arrayOp.setAttr("partition_type", builder.getStrArrayAttr(partitionType));
    arrayOp.setAttr("partition_factor", builder.getArrayAttr(partitionFactor));
  }
}
void ArrayPartition::runOnOperation() {
auto module = getOperation();
auto builder = OpBuilder(module);
// Extract all static parameters and current pragma configurations.
HLSCppAnalyzer analyzer(builder);
analyzer.analyzeModule(getOperation());
// If the current loop is annotated as pipeline, all inner loops are
// automatically unrolled.
for (auto func : module.getOps<FuncOp>()) {
for (auto forOp : func.getOps<mlir::AffineForOp>()) {
auto outermost = getPipelineLoop(forOp);
outermost.walk([&](mlir::AffineForOp loop) {
if (loop != outermost)
loopUnrollFull(loop);
});
}
}
// Canonicalize the analyzed IR.
OwningRewritePatternList patterns;
@ -38,109 +132,27 @@ void ArrayPartition::runOnOperation() {
Operation *op = getOperation();
applyPatternsAndFoldGreedily(op->getRegions(), std::move(patterns));
// Estimate performance and resource utilization.
// Apply array partition.
for (auto func : module.getOps<FuncOp>()) {
for (auto forOp : func.getOps<mlir::AffineForOp>()) {
// TODO: support imperfect loop nests.
SmallVector<mlir::AffineForOp, 4> nestedLoops;
getPerfectlyNestedLoops(nestedLoops, forOp);
auto innermost = nestedLoops.back();
auto outermost = getPipelineLoop(forOp);
// Collect memory access information.
MemAccessDict loadDict;
innermost.walk([&](mlir::AffineLoadOp loadOp) {
outermost.walk([&](mlir::AffineLoadOp loadOp) {
auto arrayOp = cast<ArrayOp>(loadOp.getMemRef().getDefiningOp());
loadDict[arrayOp].push_back(loadOp);
});
MemAccessDict storeDict;
innermost.walk([&](mlir::AffineStoreOp storeOp) {
outermost.walk([&](mlir::AffineStoreOp storeOp) {
auto arrayOp = cast<ArrayOp>(storeOp.getMemRef().getDefiningOp());
storeDict[arrayOp].push_back(storeOp);
});
// Apply array partition pragma.
for (auto pair : loadDict) {
auto arrayOp = cast<ArrayOp>(pair.first);
auto arrayType = arrayOp.getType().cast<MemRefType>();
auto arrayAccesses = pair.second;
// Walk through each dimension of the targeted array.
SmallVector<Attribute, 4> partitionFactor;
SmallVector<StringRef, 4> partitionType;
for (size_t dim = 0, e = arrayType.getShape().size(); dim < e; ++dim) {
unsigned dimSize = arrayType.getShape()[dim];
// Collect all array access indices of the current dimension.
SmallVector<AffineExpr, 4> indices;
for (auto accessOp : arrayAccesses) {
auto concreteOp = cast<mlir::AffineLoadOp>(accessOp);
auto index = concreteOp.getAffineMap().getResult(dim);
// Only add unique index.
if (std::find(indices.begin(), indices.end(), index) ==
indices.end())
indices.push_back(index);
}
auto accessNum = indices.size();
// Find the max array access distance in the current block.
unsigned maxDistance = 0;
bool failFlag = false;
for (unsigned i = 0; i < accessNum; ++i) {
for (unsigned j = i + 1; j < accessNum; ++j) {
// TODO: this expression can't be simplified.
auto expr = indices[j] - indices[i];
if (auto constDistance = expr.dyn_cast<AffineConstantExpr>()) {
unsigned distance = abs(constDistance.getValue());
maxDistance = max(maxDistance, distance);
} else {
// The array partition mechanism will fail if the distance is
// not a constant number.
// failFlag = true;
// break;
}
}
// if (failFlag)
// break;
}
// Determine array partition strategy.
maxDistance += 1;
if (failFlag || maxDistance == 1) {
// This means all accesses have the same index, and this dimension
// should not be partitioned.
partitionType.push_back("none");
partitionFactor.push_back(builder.getUI32IntegerAttr(1));
} else if (accessNum == dimSize) {
// Apply complete array partition.
partitionType.push_back("complete");
partitionFactor.push_back(builder.getUI32IntegerAttr(1));
} else if (accessNum >= maxDistance) {
// This means some elements are accessed more than once or exactly
// once, and successive elements are accessed. In most cases, apply
// "cyclic" partition should be the best solution.
partitionType.push_back("cyclic");
partitionFactor.push_back(builder.getUI32IntegerAttr(maxDistance));
} else {
// This means discrete elements are accessed. Typically, "block"
// partition will be most benefit for this occasion.
partitionType.push_back("block");
partitionFactor.push_back(builder.getUI32IntegerAttr(accessNum));
}
}
arrayOp.setAttr("partition", builder.getBoolAttr(true));
arrayOp.setAttr("partition_type",
builder.getStrArrayAttr(partitionType));
arrayOp.setAttr("partition_factor",
builder.getArrayAttr(partitionFactor));
}
applyArrayPartition<mlir::AffineLoadOp>(loadDict, builder);
applyArrayPartition<mlir::AffineStoreOp>(storeDict, builder);
}
}
}

View File

@ -1,12 +1,11 @@
#!/bin/bash
# Script options.
while getopts 'n:c:r:' opt
while getopts 'n:c:' opt
do
case $opt in
n) ablation_number=$OPTARG ;;
c) rerun_csynth=$OPTARG ;;
r) rerun_report=$OPTARG ;;
c) rerun_csynth_from=$OPTARG ;;
esac
done
@ -28,10 +27,13 @@ fi
# Candidate passes.
hta=-hlskernel-to-affine
pft=-affine-loop-perfection
rvb=-remove-var-loop-bound
cth=-convert-to-hlscpp
can=-canonicalize
alp=-affine-loop-perfection
rvb=-remove-var-loop-bound
par=-array-partition
p0=-insert-pipeline-pragma="insert-level=0"
p1=-insert-pipeline-pragma="insert-level=1"
p2=-insert-pipeline-pragma="insert-level=2"
@ -43,10 +45,15 @@ u3=-affine-loop-unroll="unroll-full unroll-num-reps=3"
t1s2=-partial-affine-loop-tile="tile-level=1 tile-size=2"
t1s4=-partial-affine-loop-tile="tile-level=1 tile-size=4"
t1s8=-partial-affine-loop-tile="tile-level=1 tile-size=8"
t2s2=-partial-affine-loop-tile="tile-level=2 tile-size=2"
t2s4=-partial-affine-loop-tile="tile-level=2 tile-size=4"
t2s8=-partial-affine-loop-tile="tile-level=2 tile-size=8"
t3s2=-partial-affine-loop-tile="tile-level=3 tile-size=2"
t3s4=-partial-affine-loop-tile="tile-level=3 tile-size=4"
t3s8=-partial-affine-loop-tile="tile-level=3 tile-size=8"
emit=-emit-hlscpp
@ -59,34 +66,63 @@ do
do
output="cpp_src/${file##*Affine/}.cpp"
case $n in
0) scalehls-opt $hta $can $file | scalehls-translate $emit -o $output ;;
0) scalehls-opt $hta $cth $can $file | scalehls-translate $emit -o $output ;;
# Apply pipeline.
1) scalehls-opt $hta "$p0" $can $file | scalehls-translate $emit -o $output ;;
2) scalehls-opt $hta "$p1" $can $file | scalehls-translate $emit -o $output ;;
3) scalehls-opt $hta "$p2" $can $file | scalehls-translate $emit -o $output ;;
4) scalehls-opt $hta "$p3" $can $file | scalehls-translate $emit -o $output ;;
1) scalehls-opt $hta $cth "$p0" $can $file | scalehls-translate $emit -o $output ;;
2) scalehls-opt $hta $cth "$p1" $can $file | scalehls-translate $emit -o $output ;;
3) scalehls-opt $hta $cth "$p2" $can $file | scalehls-translate $emit -o $output ;;
4) scalehls-opt $hta $cth "$p3" $can $file | scalehls-translate $emit -o $output ;;
# Apply loop perfection + pipeline.
5) scalehls-opt $hta $pft "$p0" $can $file | scalehls-translate $emit -o $output ;;
6) scalehls-opt $hta $pft "$p1" $can $file | scalehls-translate $emit -o $output ;;
7) scalehls-opt $hta $pft "$p2" $can $file | scalehls-translate $emit -o $output ;;
8) scalehls-opt $hta $pft "$p3" $can $file | scalehls-translate $emit -o $output ;;
# Apply pipeline + array partition.
5) scalehls-opt $hta $cth "$p0" $par $can $file | scalehls-translate $emit -o $output ;;
6) scalehls-opt $hta $cth "$p1" $par $can $file | scalehls-translate $emit -o $output ;;
7) scalehls-opt $hta $cth "$p2" $par $can $file | scalehls-translate $emit -o $output ;;
8) scalehls-opt $hta $cth "$p3" $par $can $file | scalehls-translate $emit -o $output ;;
# Apply loop perfection + remove variable bound + pipeline.
9) scalehls-opt $hta $pft $rvb "$p0" $can $file | scalehls-translate $emit -o $output ;;
10) scalehls-opt $hta $pft $rvb "$p1" $can $file | scalehls-translate $emit -o $output ;;
11) scalehls-opt $hta $pft $rvb "$p2" $can $file | scalehls-translate $emit -o $output ;;
12) scalehls-opt $hta $pft $rvb "$p3" $can $file | scalehls-translate $emit -o $output ;;
9) scalehls-opt $hta $alp $rvb $cth "$p0" $can $file | scalehls-translate $emit -o $output ;;
10) scalehls-opt $hta $alp $rvb $cth "$p1" $can $file | scalehls-translate $emit -o $output ;;
11) scalehls-opt $hta $alp $rvb $cth "$p2" $can $file | scalehls-translate $emit -o $output ;;
12) scalehls-opt $hta $alp $rvb $cth "$p3" $can $file | scalehls-translate $emit -o $output ;;
# Apply loop perfection + remove variable bound + loop tiling + pipeline.
13) scalehls-opt $hta $pft $rvb "$t1s4" "$p1" "$u1" $can $file | scalehls-translate $emit -o $output ;;
14) scalehls-opt $hta $pft $rvb "$t2s4" "$p2" "$u2" $can $file | scalehls-translate $emit -o $output ;;
15) scalehls-opt $hta $pft $rvb "$t3s4" "$p3" "$u3" $can $file | scalehls-translate $emit -o $output ;;
# Apply loop perfection + remove variable bound + pipeline + array partition.
13) scalehls-opt $hta $alp $rvb $cth "$p0" $par $can $file | scalehls-translate $emit -o $output ;;
14) scalehls-opt $hta $alp $rvb $cth "$p1" $par $can $file | scalehls-translate $emit -o $output ;;
15) scalehls-opt $hta $alp $rvb $cth "$p2" $par $can $file | scalehls-translate $emit -o $output ;;
16) scalehls-opt $hta $alp $rvb $cth "$p3" $par $can $file | scalehls-translate $emit -o $output ;;
# Apply ... + 1st-level loop tiling + pipeline.
17) scalehls-opt $hta $alp $rvb "$t1s2" $cth "$p1" "$u1" $can $file | scalehls-translate $emit -o $output ;;
18) scalehls-opt $hta $alp $rvb "$t1s4" $cth "$p1" "$u1" $can $file | scalehls-translate $emit -o $output ;;
19) scalehls-opt $hta $alp $rvb "$t1s8" $cth "$p1" "$u1" $can $file | scalehls-translate $emit -o $output ;;
# Apply ... + 1st-level loop tiling + pipeline + array partition.
20) scalehls-opt $hta $alp $rvb "$t1s2" $cth "$p1" "$u1" $par $can $file | scalehls-translate $emit -o $output ;;
21) scalehls-opt $hta $alp $rvb "$t1s4" $cth "$p1" "$u1" $par $can $file | scalehls-translate $emit -o $output ;;
22) scalehls-opt $hta $alp $rvb "$t1s8" $cth "$p1" "$u1" $par $can $file | scalehls-translate $emit -o $output ;;
# Apply ... + 2nd-level loop tiling + pipeline.
23) scalehls-opt $hta $alp $rvb "$t2s2" $cth "$p2" "$u2" $can $file | scalehls-translate $emit -o $output ;;
24) scalehls-opt $hta $alp $rvb "$t2s4" $cth "$p2" "$u2" $can $file | scalehls-translate $emit -o $output ;;
25) scalehls-opt $hta $alp $rvb "$t2s8" $cth "$p2" "$u2" $can $file | scalehls-translate $emit -o $output ;;
# Apply ... + 2nd-level loop tiling + pipeline + array partition.
26) scalehls-opt $hta $alp $rvb "$t2s2" $cth "$p2" "$u2" $par $can $file | scalehls-translate $emit -o $output ;;
27) scalehls-opt $hta $alp $rvb "$t2s4" $cth "$p2" "$u2" $par $can $file | scalehls-translate $emit -o $output ;;
28) scalehls-opt $hta $alp $rvb "$t2s8" $cth "$p2" "$u2" $par $can $file | scalehls-translate $emit -o $output ;;
# Apply ... + 3rd-level loop tiling + pipeline.
29) scalehls-opt $hta $alp $rvb "$t3s2" $cth "$p3" "$u3" $can $file | scalehls-translate $emit -o $output ;;
30) scalehls-opt $hta $alp $rvb "$t3s4" $cth "$p3" "$u3" $can $file | scalehls-translate $emit -o $output ;;
# Apply ... + 3rd-level loop tiling + pipeline + array partition.
31) scalehls-opt $hta $cth $alp $rvb "$t3s2" "$p3" "$u3" $par $can $file | scalehls-translate $emit -o $output ;;
32) scalehls-opt $hta $cth $alp $rvb "$t3s4" "$p3" "$u3" $par $can $file | scalehls-translate $emit -o $output ;;
esac
done
if [ $rerun_csynth == "true" ]
if [ $n -ge $rerun_csynth_from ]
then
# Run HLS synthesis.
cd hls_proj
@ -94,7 +130,7 @@ do
cd ..
fi
if [ $rerun_report == "true" ]
if [ $n -ge $rerun_csynth_from ]
then
# Generate latency report.
echo -e "benchmark\tdsp\tlut\tcycles" > test_results/test_result$n.log
@ -138,22 +174,51 @@ do
3) echo -e "p2\t\c" >> test_summary.log ;;
4) echo -e "p3\t\c" >> test_summary.log ;;
# Apply loop perfection + pipeline.
5) echo -e "pft+p0\t\c" >> test_summary.log ;;
6) echo -e "pft+p1\t\c" >> test_summary.log ;;
7) echo -e "pft+p2\t\c" >> test_summary.log ;;
8) echo -e "pft+p3\t\c" >> test_summary.log ;;
# Apply pipeline + array partition.
5) echo -e "p0+par\t\c" >> test_summary.log ;;
6) echo -e "p1+par\t\c" >> test_summary.log ;;
7) echo -e "p2+par\t\c" >> test_summary.log ;;
8) echo -e "p3+par\t\c" >> test_summary.log ;;
# Apply loop perfection + remove variable bound + pipeline.
9) echo -e "pft+rvb+p0\t\c" >> test_summary.log ;;
10) echo -e "pft+rvb+p1\t\c" >> test_summary.log ;;
11) echo -e "pft+rvb+p2\t\c" >> test_summary.log ;;
12) echo -e "pft+rvb+p3\t\c" >> test_summary.log ;;
9) echo -e "ar+p0\t\c" >> test_summary.log ;;
10) echo -e "ar+p1\t\c" >> test_summary.log ;;
11) echo -e "ar+p2\t\c" >> test_summary.log ;;
12) echo -e "ar+p3\t\c" >> test_summary.log ;;
# Apply loop perfection + remove variable bound + loop tiling + pipeline.
13) echo -e "pft+rvb+t1+p1\t\c" >> test_summary.log ;;
14) echo -e "pft+rvb+t2+p2\t\c" >> test_summary.log ;;
15) echo -e "pft+rvb+t3+p3\t\c" >> test_summary.log ;;
# Apply loop perfection + remove variable bound + pipeline + array partition.
13) echo -e "ar+p0+par\t\c" >> test_summary.log ;;
14) echo -e "ar+p1+par\t\c" >> test_summary.log ;;
15) echo -e "ar+p2+par\t\c" >> test_summary.log ;;
16) echo -e "ar+p3+par\t\c" >> test_summary.log ;;
# Apply ... + 1st-level loop tiling + pipeline.
17) echo -e "ar+t1s2+p1\t\c" >> test_summary.log ;;
18) echo -e "ar+t1s4+p1\t\c" >> test_summary.log ;;
19) echo -e "ar+t1s8+p1\t\c" >> test_summary.log ;;
# Apply ... + 1st-level loop tiling + pipeline + array partition.
20) echo -e "ar+t1s2+p1+par\t\c" >> test_summary.log ;;
21) echo -e "ar+t1s4+p1+par\t\c" >> test_summary.log ;;
22) echo -e "ar+t1s8+p1+par\t\c" >> test_summary.log ;;
# Apply ... + 2nd-level loop tiling + pipeline.
23) echo -e "ar+t2s2+p2\t\c" >> test_summary.log ;;
24) echo -e "ar+t2s4+p2\t\c" >> test_summary.log ;;
25) echo -e "ar+t2s8+p2\t\c" >> test_summary.log ;;
# Apply ... + 2nd-level loop tiling + pipeline + array partition.
26) echo -e "ar+t2s2+p2+par\t\c" >> test_summary.log ;;
27) echo -e "ar+t2s4+p2+par\t\c" >> test_summary.log ;;
28) echo -e "ar+t2s8+p2+par\t\c" >> test_summary.log ;;
# Apply ... + 3rd-level loop tiling + pipeline.
29) echo -e "ar+t3s2+p3\t\c" >> test_summary.log ;;
30) echo -e "ar+t3s4+p3\t\c" >> test_summary.log ;;
# Apply ... + 3rd-level loop tiling + pipeline + array partition.
31) echo -e "ar+t3s2+p3+par\t\c" >> test_summary.log ;;
32) echo -e "ar+t3s4+p3+par\t\c" >> test_summary.log ;;
esac
cat $result | awk "NR==$idx{OFS=\"\t\";print \$2,\$3,\$4}" >> test_summary.log