From 9ae0d9222e86cdc1ee7191a897a4301e7d8aba3c Mon Sep 17 00:00:00 2001 From: Peter Doak Date: Sat, 16 Jun 2018 17:25:27 -0400 Subject: [PATCH] adding profiling example --- CMakeLists.txt | 8 +++- examples/profiling/README.md | 40 +++++++++++++++++++ .../build_llvm_nvcc_xray_instrumented.sh | 34 ++++++++++++++++ 3 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 examples/profiling/README.md create mode 100644 examples/profiling/build_llvm_nvcc_xray_instrumented.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 489f04529..8fff636a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -685,13 +685,17 @@ ELSE(QMC_CUDA) ENDIF(QMC_CUDA) SET(XRAY_PROFILE FALSE CACHE BOOL "Use llvm xray profiling") -SET(XRAY_INSTRUCTION_THRESHOLD 20 CACHE INT "Instruction threshold for xray instrumentation") +SET(XRAY_INSTRUCTION_THRESHOLD 200 CACHE INT "Instruction threshold for xray instrumentation") +SET(XRAY_GPU_MOST FALSE CACHE BOOL "Manually instrument almost all of the CUDA implementations calls on the CPU") + IF(XRAY_PROFILE) set(XRAY_FLAGS "-fxray-instrument -fxray-instruction-threshold=${XRAY_INSTRUCTION_THRESHOLD}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${XRAY_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${XRAY_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${XRAY_FLAGS}") - set_property(DIRECTORY ${CMAKE_SOURCE_DIR} APPEND PROPERTY COMPILE_DEFINITIONS GPU_XRAY_TRACE_ON) + if(XRAY_GPU_MOST) + set_property(DIRECTORY ${CMAKE_SOURCE_DIR} APPEND PROPERTY COMPILE_DEFINITIONS GPU_XRAY_TRACE_ON) + endif(XRAY_GPU_MOST) ENDIF(XRAY_PROFILE) diff --git a/examples/profiling/README.md b/examples/profiling/README.md new file mode 100644 index 000000000..bd532c615 --- /dev/null +++ b/examples/profiling/README.md @@ -0,0 +1,40 @@ +# Profiling QMCPACK + +## LLVM-XRAY based +With CUDA: +### LLVM 4.0.1 and NVCC 9.1 -- allows simultaneous nvprof and xray instrumentation of CPU code +* Build: see [build_script](./build_llvm_nvcc_xray_instrumented.sh) +* 
Running: With the performance tests set up. +``` shell +[epd@oxygen ]$ export QMCPACK_APP=/path/to/your/instrumented/bin/qmcpack +[epd@oxygen ]$ cd /performance/NiO/sample/dmc-a4-e48-gpu +[epd@oxygen ]$ OMP_NUM_THREADS=12 XRAY_OPTIONS="patch_premain=true xray_mode=xray-basic" nvprof -o simul.nvprof $QMCPACK_APP NiO-fcc-S1-dmc.xml +[epd@oxygen ]$ llvm-xray account xray-log.qmcpack.yrxHG1 -instr_map=$QMCPACK_APP -sort=med -top=10 -sortorder=dsc + +Functions with latencies: 514 + funcid count [ min, med, 90p, 99p, max] sum function + 1 1 [12.220955, 12.220955, 12.220955, 12.220955, 12.220955] 12.220955 :0:0: main + 55 1 [ 5.188287, 5.188287, 5.188287, 5.188287, 5.188287] 5.188287 :0:0: qmcplusplus::QMCMain::execute() + 59 1 [ 2.152482, 2.152482, 2.152482, 2.152482, 2.152482] 2.152482 :0:0: qmcplusplus::QMCMain::validateXML() + 64 1 [ 1.795763, 1.795763, 1.795763, 1.795763, 1.795763] 1.795763 :0:0: qmcplusplus::WaveFunctionPool::put(_xmlNode*) + 1266 1 [ 1.795672, 1.795672, 1.795672, 1.795672, 1.795672] 1.795672 :0:0: qmcplusplus::WaveFunctionFactory::build(_xmlNode*, bool) + 1267 1 [ 1.488744, 1.488744, 1.488744, 1.488744, 1.488744] 1.488744 :0:0: qmcplusplus::WaveFunctionFactory::addFermionTerm(_xmlNode*) + 1767 1 [ 1.488722, 1.488722, 1.488722, 1.488722, 1.488722] 1.488722 :0:0: qmcplusplus::SlaterDetBuilder::put(_xmlNode*) + 335 1 [ 1.198301, 1.198301, 1.198301, 1.198301, 1.198301] 1.198301 :0:0: qmcplusplus::VMCcuda::runWithDrift() + 56 3 [ 0.793548, 1.041833, 1.198912, 1.198912, 1.198912] 3.034292 :0:0: qmcplusplus::QMCMain::executeQMCSection(_xmlNode*, bool) + 58 3 [ 0.793514, 1.041820, 1.198901, 1.198901, 1.198901] 3.034235 :0:0: qmcplusplus::QMCMain::runQMC(_xmlNode*) +``` + +If you also have a new version of llvm installed you can use the newer xray tools with the old trace. The best of these allows you to convert to an event-trace format that can then be massaged further to view in a graphical tool. 
+ +``` shell +/home/epd/opt/llvm-7/bin/llvm-xray convert -output-format=trace_event -instr_map=$QMCPACK_APP -symbolize -sort -output=dmc-a4-e48-gpu.trace xray-log.qmcpack.yrxHG1 +``` + +Then convert into chrome viewable html doc using the [catapult](https://github.com/catapult-project/catapult) tool. + +``` +~/codes/catapult/tracing/bin/trace2html dmc-a4-e48-gpu.trace -o dmc-a4-e48-gpu.html +``` + +[example -- only works with chrome](http://cdash-minimal.ornl.gov/profiling/dmc-a4-e48-gpu.html) diff --git a/examples/profiling/build_llvm_nvcc_xray_instrumented.sh b/examples/profiling/build_llvm_nvcc_xray_instrumented.sh new file mode 100644 index 000000000..2cc780495 --- /dev/null +++ b/examples/profiling/build_llvm_nvcc_xray_instrumented.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Xray instrumentation is a feature of relatively modern llvm +# It is only supported on linux as of clang 4.0.1 +# While nvcc seems incompatible with llvm > 5.0.1 even with +# a rewritten llvm version string +# Xray leaves entrypoints in the instrumented code that at runtime +# can be patched to call profiling routines in the compiler_rt library. +# It appears to be much less of a runtime drag than many profilers. + +# Requirements: +# It should be sufficient to install spack packages loaded +# below with the same specs. +# Additionally if you build llvm with your system gcc you need libatomic installed. 
+# this should be done with the same package manager that installed your system gcc + +# DMKL_ROOT should be defined to your mkl path +# DCUDA_TOOLKIT_ROOT_DIR should point at the root of your CUDA install +# DCUDA_NVCC_FLAGS should have -arch appropriate to your CUDA version and hardware +# +# nvcc and libc++ don't get along hence -stdlib=libstdc++ +# It is both a compiling and linking flag + +spack load mpich%clang@4.0.1 +spack load hdf5%clang@4.0.1 +spack load ninja +spack load llvm@4.0.1 ^ncurses+termlib +spack load zlib%clang@4.0.1 + +CXXFLAGS="-stdlib=libstdc++" LDFLAGS="-stdlib=libstdc++" cmake -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} -DCMAKE_CXX_COMPILER=mpic++ -DCMAKE_C_COMPILER=mpicc -DENABLE_MKL=1 -DMKL_ROOT="/opt/intel2018/mkl" -GNinja -DQMC_MPI=1 -DQMC_CUDA=1 -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-9.1 -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DCUDA_NVCC_FLAGS="-arch=sm_60;-Drestrict=__restrict__;-DNO_CUDA_MAIN;-O3" -DHDF5_ROOT=/data/epd/spack/opt/spack/linux-rhel7-x86_64/gcc-4.8.5/hdf5-1.10.2-qqmot24bg6uetn3xhpxjlwafvxr4p5pp/ -DXRAY_PROFILE=1 -DXRAY_INSTRUCTION_THRESHOLD=50 -DXRAY_GPU_MOST=1 .. + +ninja + +#you will see numerous warnings about loop vectorizations.