Fix nested parallel performance (#1949)

2023-09-28 17:05:31 +09:00 · 2023-09-28 17:05:31 +09:00 · 1a5af5975c
parent b83abe2656
commit 1a5af5975c
5 changed files with 42 additions and 26 deletions
--- a/releasenotes/notes/fix_omp_nested_performance-a3d55f3e85366a5b.yaml
+++ b/releasenotes/notes/fix_omp_nested_performance-a3d55f3e85366a5b.yaml
@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    OpenMP nested parallel simulation for parallel experiments + parallel state
+    update was very slow because gate fusion internally uses the unitary
+    simulator, which used an OpenMP parallel region. This fix removes the
+    parallel region in gate fusion and speeds up nested parallel simulations.
--- a/src/controllers/aer_controller.hpp
+++ b/src/controllers/aer_controller.hpp
@ -541,7 +541,7 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,

      // nested should be set to zero if num_threads clause will be used
 #if _OPENMP >= 200805
-      omp_set_max_active_levels(2);
+      omp_set_max_active_levels(1);
 #else
      omp_set_nested(1);
 #endif
--- a/src/simulators/statevector/qubitvector.hpp
+++ b/src/simulators/statevector/qubitvector.hpp
@ -890,11 +890,10 @@ template <typename data_t>
 void QubitVector<data_t>::zero() {
  const int_t END = data_size_; // end for k loop

-#pragma omp parallel for if (num_qubits_ > omp_threshold_ && omp_threads_ > 1) \
-    num_threads(omp_threads_)
-  for (int_t k = 0; k < END; ++k) {
-    data_[k] = 0.0;
-  }
+  auto zero_proc = [this](int_t i) { data_[i] = 0.0; };
+  Utils::apply_omp_parallel_for(
+      (num_qubits_ > omp_threshold_ && omp_threads_ > 1), 0, END, zero_proc,
+      omp_threads_);
 }

 template <typename data_t>
--- a/src/simulators/unitary/unitarymatrix.hpp
+++ b/src/simulators/unitary/unitarymatrix.hpp
@ -238,13 +238,13 @@ void UnitaryMatrix<data_t>::initialize() {
  BaseVector::zero();
  // Set to be identity matrix
  const int_t nrows = rows_; // end for k loop
-#pragma omp parallel if (BaseVector::num_qubits_ >                             \
-                             BaseVector::omp_threshold_ &&                     \
-                         BaseVector::omp_threads_ > 1)                         \
-    num_threads(BaseVector::omp_threads_)
-  for (int_t k = 0; k < nrows; ++k) {
-    BaseVector::data_[k * (nrows + 1)] = 1.0;
-  }
+  auto initialize_proc = [this](int_t i) {
+    BaseVector::data_[i * (rows_ + 1)] = 1.0;
+  };
+  Utils::apply_omp_parallel_for(
+      (BaseVector::num_qubits_ > BaseVector::omp_threshold_ &&
+       BaseVector::omp_threads_ > 1),
+      0, rows_, initialize_proc, BaseVector::omp_threads_);
 }

 template <class data_t>
@ -260,15 +260,15 @@ void UnitaryMatrix<data_t>::initialize_from_matrix(
        std::to_string(mat.GetRows()) + "," + std::to_string(mat.GetColumns()) +
        ").");
  }
-
-#pragma omp parallel if (BaseVector::num_qubits_ >                             \
-                             BaseVector::omp_threshold_ &&                     \
-                         BaseVector::omp_threads_ > 1)                         \
-    num_threads(BaseVector::omp_threads_)
-  for (int_t row = 0; row < nrows; ++row)
-    for (int_t col = 0; col < nrows; ++col) {
-      BaseVector::data_[row + nrows * col] = mat(row, col);
+  auto initialize_proc = [this, &mat](int_t row) {
+    for (int_t col = 0; col < rows_; ++col) {
+      BaseVector::data_[row + rows_ * col] = mat(row, col);
    }
+  };
+  Utils::apply_omp_parallel_for(
+      (BaseVector::num_qubits_ > BaseVector::omp_threshold_ &&
+       BaseVector::omp_threads_ > 1),
+      0, rows_, initialize_proc, BaseVector::omp_threads_);
 }

 template <class data_t>
--- a/src/transpile/fusion.hpp
+++ b/src/transpile/fusion.hpp
@ -851,11 +851,21 @@ void Fusion::optimize_circuit(Circuit &circ, Noise::NoiseModel &noise,
      if (circ.ops.size() % parallelization_)
        ++unit;

-#pragma omp parallel for if (parallelization_ > 1) num_threads(parallelization_)
-      for (int_t i = 0; i < parallelization_; i++) {
-        int_t start = unit * i;
-        int_t end = std::min(start + unit, (int_t)circ.ops.size());
-        optimize_circuit(circ, noise, allowed_opset, start, end, fuser, method);
+      if (parallelization_ > 1) {
+#pragma omp parallel for num_threads(parallelization_)
+        for (int_t i = 0; i < parallelization_; i++) {
+          int_t start = unit * i;
+          int_t end = std::min(start + unit, (int_t)circ.ops.size());
+          optimize_circuit(circ, noise, allowed_opset, start, end, fuser,
+                           method);
+        }
+      } else {
+        for (int_t i = 0; i < parallelization_; i++) {
+          int_t start = unit * i;
+          int_t end = std::min(start + unit, (int_t)circ.ops.size());
+          optimize_circuit(circ, noise, allowed_opset, start, end, fuser,
+                           method);
+        }
      }
      result.metadata.add(parallelization_, "fusion", "parallelization");
    }