mirror of https://github.com/Qiskit/qiskit-aer.git
Fix nested parallel performance (#1949)
This commit is contained in:
parent
b83abe2656
commit
1a5af5975c
|
@ -0,0 +1,7 @@
|
|||
---
|
||||
fixes:
|
||||
- |
|
||||
OpenMP nested parallel simulation for parallel experiments + parallel state
|
||||
update was very slow because gate fusion uses the unitary simulator internally,
|
||||
which used an OpenMP parallel region. This fix removes the parallel region in
|
||||
gate fusion and improves the performance of nested parallel simulations.
|
|
@ -541,7 +541,7 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,
|
|||
|
||||
// nested should be set to zero if num_threads clause will be used
|
||||
#if _OPENMP >= 200805
|
||||
omp_set_max_active_levels(2);
|
||||
omp_set_max_active_levels(1);
|
||||
#else
|
||||
omp_set_nested(1);
|
||||
#endif
|
||||
|
|
|
@ -890,11 +890,10 @@ template <typename data_t>
|
|||
void QubitVector<data_t>::zero() {
|
||||
const int_t END = data_size_; // end for k loop
|
||||
|
||||
#pragma omp parallel for if (num_qubits_ > omp_threshold_ && omp_threads_ > 1) \
|
||||
num_threads(omp_threads_)
|
||||
for (int_t k = 0; k < END; ++k) {
|
||||
data_[k] = 0.0;
|
||||
}
|
||||
auto zero_proc = [this](int_t i) { data_[i] = 0.0; };
|
||||
Utils::apply_omp_parallel_for(
|
||||
(num_qubits_ > omp_threshold_ && omp_threads_ > 1), 0, END, zero_proc,
|
||||
omp_threads_);
|
||||
}
|
||||
|
||||
template <typename data_t>
|
||||
|
|
|
@ -238,13 +238,13 @@ void UnitaryMatrix<data_t>::initialize() {
|
|||
BaseVector::zero();
|
||||
// Set to be identity matrix
|
||||
const int_t nrows = rows_; // end for k loop
|
||||
#pragma omp parallel if (BaseVector::num_qubits_ > \
|
||||
BaseVector::omp_threshold_ && \
|
||||
BaseVector::omp_threads_ > 1) \
|
||||
num_threads(BaseVector::omp_threads_)
|
||||
for (int_t k = 0; k < nrows; ++k) {
|
||||
BaseVector::data_[k * (nrows + 1)] = 1.0;
|
||||
}
|
||||
auto initialize_proc = [this](int_t i) {
|
||||
BaseVector::data_[i * (rows_ + 1)] = 1.0;
|
||||
};
|
||||
Utils::apply_omp_parallel_for(
|
||||
(BaseVector::num_qubits_ > BaseVector::omp_threshold_ &&
|
||||
BaseVector::omp_threads_ > 1),
|
||||
0, rows_, initialize_proc, BaseVector::omp_threads_);
|
||||
}
|
||||
|
||||
template <class data_t>
|
||||
|
@ -260,15 +260,15 @@ void UnitaryMatrix<data_t>::initialize_from_matrix(
|
|||
std::to_string(mat.GetRows()) + "," + std::to_string(mat.GetColumns()) +
|
||||
").");
|
||||
}
|
||||
|
||||
#pragma omp parallel if (BaseVector::num_qubits_ > \
|
||||
BaseVector::omp_threshold_ && \
|
||||
BaseVector::omp_threads_ > 1) \
|
||||
num_threads(BaseVector::omp_threads_)
|
||||
for (int_t row = 0; row < nrows; ++row)
|
||||
for (int_t col = 0; col < nrows; ++col) {
|
||||
BaseVector::data_[row + nrows * col] = mat(row, col);
|
||||
auto initialize_proc = [this, &mat](int_t row) {
|
||||
for (int_t col = 0; col < rows_; ++col) {
|
||||
BaseVector::data_[row + rows_ * col] = mat(row, col);
|
||||
}
|
||||
};
|
||||
Utils::apply_omp_parallel_for(
|
||||
(BaseVector::num_qubits_ > BaseVector::omp_threshold_ &&
|
||||
BaseVector::omp_threads_ > 1),
|
||||
0, rows_, initialize_proc, BaseVector::omp_threads_);
|
||||
}
|
||||
|
||||
template <class data_t>
|
||||
|
|
|
@ -851,11 +851,21 @@ void Fusion::optimize_circuit(Circuit &circ, Noise::NoiseModel &noise,
|
|||
if (circ.ops.size() % parallelization_)
|
||||
++unit;
|
||||
|
||||
#pragma omp parallel for if (parallelization_ > 1) num_threads(parallelization_)
|
||||
for (int_t i = 0; i < parallelization_; i++) {
|
||||
int_t start = unit * i;
|
||||
int_t end = std::min(start + unit, (int_t)circ.ops.size());
|
||||
optimize_circuit(circ, noise, allowed_opset, start, end, fuser, method);
|
||||
if (parallelization_ > 1) {
|
||||
#pragma omp parallel for num_threads(parallelization_)
|
||||
for (int_t i = 0; i < parallelization_; i++) {
|
||||
int_t start = unit * i;
|
||||
int_t end = std::min(start + unit, (int_t)circ.ops.size());
|
||||
optimize_circuit(circ, noise, allowed_opset, start, end, fuser,
|
||||
method);
|
||||
}
|
||||
} else {
|
||||
for (int_t i = 0; i < parallelization_; i++) {
|
||||
int_t start = unit * i;
|
||||
int_t end = std::min(start + unit, (int_t)circ.ops.size());
|
||||
optimize_circuit(circ, noise, allowed_opset, start, end, fuser,
|
||||
method);
|
||||
}
|
||||
}
|
||||
result.metadata.add(parallelization_, "fusion", "parallelization");
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue