diff --git a/releasenotes/notes/fix_omp_nested_performance-a3d55f3e85366a5b.yaml b/releasenotes/notes/fix_omp_nested_performance-a3d55f3e85366a5b.yaml new file mode 100644 index 000000000..50a19f6be --- /dev/null +++ b/releasenotes/notes/fix_omp_nested_performance-a3d55f3e85366a5b.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + OpenMP nested parallel simulation for parallel experiments + parallel state + update was very slow because gate fusion uses the unitary simulator internally, + which opened its own OpenMP parallel region. This fix removes the parallel region in + gate fusion and improves the performance of nested parallel simulations. diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index f42ae64ef..e6005b9a6 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -541,7 +541,7 @@ Result Controller::execute(std::vector> &circuits, // nested should be set to zero if num_threads clause will be used #if _OPENMP >= 200805 - omp_set_max_active_levels(2); + omp_set_max_active_levels(1); #else omp_set_nested(1); #endif diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp index 3cc84d8a7..a68689935 100755 --- a/src/simulators/statevector/qubitvector.hpp +++ b/src/simulators/statevector/qubitvector.hpp @@ -890,11 +890,10 @@ template void QubitVector::zero() { const int_t END = data_size_; // end for k loop -#pragma omp parallel for if (num_qubits_ > omp_threshold_ && omp_threads_ > 1) \ - num_threads(omp_threads_) - for (int_t k = 0; k < END; ++k) { - data_[k] = 0.0; - } + auto zero_proc = [this](int_t i) { data_[i] = 0.0; }; + Utils::apply_omp_parallel_for( + (num_qubits_ > omp_threshold_ && omp_threads_ > 1), 0, END, zero_proc, + omp_threads_); } template diff --git a/src/simulators/unitary/unitarymatrix.hpp b/src/simulators/unitary/unitarymatrix.hpp index 494d57e84..f40609166 100644 --- a/src/simulators/unitary/unitarymatrix.hpp +++ b/src/simulators/unitary/unitarymatrix.hpp @@ 
-238,13 +238,13 @@ void UnitaryMatrix::initialize() { BaseVector::zero(); // Set to be identity matrix const int_t nrows = rows_; // end for k loop -#pragma omp parallel if (BaseVector::num_qubits_ > \ - BaseVector::omp_threshold_ && \ - BaseVector::omp_threads_ > 1) \ - num_threads(BaseVector::omp_threads_) - for (int_t k = 0; k < nrows; ++k) { - BaseVector::data_[k * (nrows + 1)] = 1.0; - } + auto initialize_proc = [this](int_t i) { + BaseVector::data_[i * (rows_ + 1)] = 1.0; + }; + Utils::apply_omp_parallel_for( + (BaseVector::num_qubits_ > BaseVector::omp_threshold_ && + BaseVector::omp_threads_ > 1), + 0, rows_, initialize_proc, BaseVector::omp_threads_); } template @@ -260,15 +260,15 @@ void UnitaryMatrix::initialize_from_matrix( std::to_string(mat.GetRows()) + "," + std::to_string(mat.GetColumns()) + ")."); } - -#pragma omp parallel if (BaseVector::num_qubits_ > \ - BaseVector::omp_threshold_ && \ - BaseVector::omp_threads_ > 1) \ - num_threads(BaseVector::omp_threads_) - for (int_t row = 0; row < nrows; ++row) - for (int_t col = 0; col < nrows; ++col) { - BaseVector::data_[row + nrows * col] = mat(row, col); + auto initialize_proc = [this, &mat](int_t row) { + for (int_t col = 0; col < rows_; ++col) { + BaseVector::data_[row + rows_ * col] = mat(row, col); } + }; + Utils::apply_omp_parallel_for( + (BaseVector::num_qubits_ > BaseVector::omp_threshold_ && + BaseVector::omp_threads_ > 1), + 0, rows_, initialize_proc, BaseVector::omp_threads_); } template diff --git a/src/transpile/fusion.hpp b/src/transpile/fusion.hpp index d7c14ec8b..a3a1c8b59 100644 --- a/src/transpile/fusion.hpp +++ b/src/transpile/fusion.hpp @@ -851,11 +851,21 @@ void Fusion::optimize_circuit(Circuit &circ, Noise::NoiseModel &noise, if (circ.ops.size() % parallelization_) ++unit; -#pragma omp parallel for if (parallelization_ > 1) num_threads(parallelization_) - for (int_t i = 0; i < parallelization_; i++) { - int_t start = unit * i; - int_t end = std::min(start + unit, 
(int_t)circ.ops.size()); - optimize_circuit(circ, noise, allowed_opset, start, end, fuser, method); + if (parallelization_ > 1) { +#pragma omp parallel for num_threads(parallelization_) + for (int_t i = 0; i < parallelization_; i++) { + int_t start = unit * i; + int_t end = std::min(start + unit, (int_t)circ.ops.size()); + optimize_circuit(circ, noise, allowed_opset, start, end, fuser, + method); + } + } else { + for (int_t i = 0; i < parallelization_; i++) { + int_t start = unit * i; + int_t end = std::min(start + unit, (int_t)circ.ops.size()); + optimize_circuit(circ, noise, allowed_opset, start, end, fuser, + method); + } } result.metadata.add(parallelization_, "fusion", "parallelization"); }