adding profiling example

This commit is contained in:
Peter Doak 2018-06-16 17:25:27 -04:00
parent 1ada2a1a92
commit 9ae0d9222e
3 changed files with 80 additions and 2 deletions

View File

@ -685,13 +685,17 @@ ELSE(QMC_CUDA)
ENDIF(QMC_CUDA)
# --- LLVM XRay profiling support -------------------------------------------
# XRAY_PROFILE: master switch for building with -fxray-instrument.
SET(XRAY_PROFILE FALSE CACHE BOOL "Use llvm xray profiling")
# NOTE(review): the two SETs below define the same cache variable twice
# (looks like a diff/merge artifact: 20 vs 200). Because SET(... CACHE ...)
# does not overwrite an existing cache entry, the first value encountered
# (20) wins on a fresh configure — confirm which threshold is intended and
# delete the other line. Also, "INT" is not a valid cache type; CMake cache
# types are BOOL/FILEPATH/PATH/STRING/INTERNAL — STRING would be correct here.
SET(XRAY_INSTRUCTION_THRESHOLD 20 CACHE INT "Instruction threshold for xray instrumentation")
SET(XRAY_INSTRUCTION_THRESHOLD 200 CACHE INT "Instruction threshold for xray instrumentation")
# When ON, adds the GPU_XRAY_TRACE_ON compile definition so the CPU-side
# wrappers of the CUDA calls are manually instrumented as well.
SET(XRAY_GPU_MOST FALSE CACHE BOOL "Manually instrument almost all of the CUDA implementations calls on the CPU")
IF(XRAY_PROFILE)
# Compile AND link with the XRay flags: the runtime entry stubs are patched
# in by compiler-rt at program start (XRAY_OPTIONS=patch_premain=true).
set(XRAY_FLAGS "-fxray-instrument -fxray-instruction-threshold=${XRAY_INSTRUCTION_THRESHOLD}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${XRAY_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${XRAY_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${XRAY_FLAGS}")
# NOTE(review): GPU_XRAY_TRACE_ON is appended both unconditionally here and
# again inside if(XRAY_GPU_MOST) below — the unconditional line appears to be
# leftover from before the option was added; the duplicate definition is
# harmless but the unconditional one defeats the XRAY_GPU_MOST switch.
set_property(DIRECTORY ${CMAKE_SOURCE_DIR} APPEND PROPERTY COMPILE_DEFINITIONS GPU_XRAY_TRACE_ON)
if(XRAY_GPU_MOST)
set_property(DIRECTORY ${CMAKE_SOURCE_DIR} APPEND PROPERTY COMPILE_DEFINITIONS GPU_XRAY_TRACE_ON)
endif(XRAY_GPU_MOST)
ENDIF(XRAY_PROFILE)

View File

@ -0,0 +1,40 @@
# Profiling QMCPACK
## LLVM-XRAY based
With CUDA:
### LLVM 4.0.1 and NVCC 9.1 -- allows simultaneous nvprof and xray instrumentation of CPU code
* Build: see [build_script](./build_llvm_nvcc_xray_instrumented.sh)
* Running: With the performance tests set up.
``` shell
[epd@oxygen ]$ export QMCPACK_APP=/path/to/your/instrumented/bin/qmcpack
[epd@oxygen ]$ cd /performance/NiO/sample/dmc-a4-e48-gpu
[epd@oxygen ]$ OMP_NUM_THREADS=12 XRAY_OPTIONS="patch_premain=true xray_mode=xray-basic" nvprof -o simul.nvprof $QMCPACK_APP NiO-fcc-S1-dmc.xml
[epd@oxygen ]$ llvm-xray account xray-log.qmcpack.yrxHG1 -instr_map=$QMCPACK_APP -sort=med -top=10 -sortorder=dsc
Functions with latencies: 514
funcid count [ min, med, 90p, 99p, max] sum function
1 1 [12.220955, 12.220955, 12.220955, 12.220955, 12.220955] 12.220955 <invalid>:0:0: main
55 1 [ 5.188287, 5.188287, 5.188287, 5.188287, 5.188287] 5.188287 <invalid>:0:0: qmcplusplus::QMCMain::execute()
59 1 [ 2.152482, 2.152482, 2.152482, 2.152482, 2.152482] 2.152482 <invalid>:0:0: qmcplusplus::QMCMain::validateXML()
64 1 [ 1.795763, 1.795763, 1.795763, 1.795763, 1.795763] 1.795763 <invalid>:0:0: qmcplusplus::WaveFunctionPool::put(_xmlNode*)
1266 1 [ 1.795672, 1.795672, 1.795672, 1.795672, 1.795672] 1.795672 <invalid>:0:0: qmcplusplus::WaveFunctionFactory::build(_xmlNode*, bool)
1267 1 [ 1.488744, 1.488744, 1.488744, 1.488744, 1.488744] 1.488744 <invalid>:0:0: qmcplusplus::WaveFunctionFactory::addFermionTerm(_xmlNode*)
1767 1 [ 1.488722, 1.488722, 1.488722, 1.488722, 1.488722] 1.488722 <invalid>:0:0: qmcplusplus::SlaterDetBuilder::put(_xmlNode*)
335 1 [ 1.198301, 1.198301, 1.198301, 1.198301, 1.198301] 1.198301 <invalid>:0:0: qmcplusplus::VMCcuda::runWithDrift()
56 3 [ 0.793548, 1.041833, 1.198912, 1.198912, 1.198912] 3.034292 <invalid>:0:0: qmcplusplus::QMCMain::executeQMCSection(_xmlNode*, bool)
58 3 [ 0.793514, 1.041820, 1.198901, 1.198901, 1.198901] 3.034235 <invalid>:0:0: qmcplusplus::QMCMain::runQMC(_xmlNode*)
```
If you also have a newer version of llvm installed you can use the newer xray tools with the old trace. The best of these allows you to convert to an event-trace format that can then be massaged further for viewing in a graphical tool.
``` shell
/home/epd/opt/llvm-7/bin/llvm-xray convert -output-format=trace_event -instr_map=$QMCPACK_APP -symbolize -sort -output=dmc-a4-e48-gpu.trace xray-log.qmcpack.yrxHG1
```
Then convert into chrome viewable html doc using the [catapult](https://github.com/catapult-project/catapult) tool.
```
~/codes/catapult/tracing/bin/trace2html dmc-a4-e48-gpu.trace -o dmc-a4-e48-gpu.html
```
[example -- only works with chrome](http://cdash-minimal.ornl.gov/profiling/dmc-a4-e48-gpu.html)

View File

@ -0,0 +1,34 @@
#!/bin/bash
# Build script: QMCPACK with CUDA + LLVM XRay instrumentation enabled.
# Xray instrumentation is a feature of relatively modern llvm
# It is only supported on linux as of clang 4.0.1
# While nvcc seems incompatible with llvm > 5.0.1 even with
# a rewritten llvm version string
# Xray leaves entrypoints in the instrumented code that at runtime
# can be patched to call profiling routines in the compiler_rt library.
# It appears to be much less of a runtime drag than many profilers.
# Requirements:
# It should be sufficient to install the spack packages loaded
# below with the same specs.
# Additionally if you build llvm with your system gcc you need libatomic installed.
# This should be done with the same package manager that installed your system gcc.
# -DMKL_ROOT should be defined to your mkl path
# -DCUDA_TOOLKIT_ROOT_DIR should point at the root of your CUDA install
# -DCUDA_NVCC_FLAGS should have -arch appropriate to your CUDA version and hardware
#
# nvcc and libc++ don't get along hence -stdlib=libstdc++
# It is both a compiling and linking flag
# Bring the clang-4.0.1-built toolchain and libraries into the environment.
spack load mpich%clang@4.0.1
spack load hdf5%clang@4.0.1
spack load ninja
spack load llvm@4.0.1 ^ncurses+termlib
spack load zlib%clang@4.0.1
# Configure with XRay profiling on (-DXRAY_PROFILE=1), a 50-instruction
# instrumentation threshold, and manual CUDA-wrapper tracing (-DXRAY_GPU_MOST=1).
# NOTE(review): the MKL, CUDA, and HDF5 paths below are site-specific — adjust
# for your machine.
CXXFLAGS="-stdlib=libstdc++" LDFLAGS="-stdlib=libstdc++" cmake -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} -DCMAKE_CXX_COMPILER=mpic++ -DCMAKE_C_COMPILER=mpicc -DENABLE_MKL=1 -DMKL_ROOT="/opt/intel2018/mkl" -GNinja -DQMC_MPI=1 -DQMC_CUDA=1 -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-9.1 -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DCUDA_NVCC_FLAGS="-arch=sm_60;-Drestrict=__restrict__;-DNO_CUDA_MAIN;-O3" -DHDF5_ROOT=/data/epd/spack/opt/spack/linux-rhel7-x86_64/gcc-4.8.5/hdf5-1.10.2-qqmot24bg6uetn3xhpxjlwafvxr4p5pp/ -DXRAY_PROFILE=1 -DXRAY_INSTRUCTION_THRESHOLD=50 -DXRAY_GPU_MOST=1 ..
ninja
#you will see numerous warnings about loop vectorizations.