diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp --- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp 2020-06-05 21:07:57.000000000 +0800 +++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp 2021-04-19 18:04:05.541802882 +0800 @@ -17,18 +17,18 @@ { __m256d v[2]; - v[0] = load2(&psi[I]); - v[1] = load2(&psi[I + d0]); + v[0] = load2(psi + 2 * I); + v[1] = load2(psi + 2 * (I + d0)); - _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(mul(v[0], m[0], mt[0]), mul(v[1], m[1], mt[1]))); + _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(mul(v[0], m[0], mt[0]), mul(v[1], m[1], mt[1]))); } // bit indices id[.] are given from high to low (e.g. control first for CNOT) template -void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask) +void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len) { - std::size_t n = psi.size(); + std::size_t n = len; std::size_t d0 = 1UL << id0; __m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1])}; diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp --- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp 2020-06-05 21:07:57.000000000 +0800 +++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp 2021-04-19 18:04:05.541802882 +0800 @@ -17,21 +17,21 @@ { __m256d v[4]; - v[0] = load2(&psi[I]); - v[1] = load2(&psi[I + d0]); - v[2] = load2(&psi[I + d1]); - v[3] = load2(&psi[I + d0 + d1]); + v[0] = load2(psi + 2 * I); + v[1] = load2(psi + 2 * (I + d0)); + v[2] = load2(psi + 2 * (I + d1)); + v[3] = load2(psi + 2 * (I + d0 + d1)); - _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(mul(v[0], m[0], mt[0]), add(mul(v[1], m[1], mt[1]), add(mul(v[2], m[2], mt[2]), mul(v[3], m[3], mt[3]))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(mul(v[0], m[4], mt[4]), add(mul(v[1], m[5], mt[5]), add(mul(v[2], m[6], mt[6]), mul(v[3], m[7], mt[7]))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(mul(v[0], m[0], mt[0]), add(mul(v[1], m[1], mt[1]), add(mul(v[2], m[2], mt[2]), mul(v[3], m[3], mt[3]))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(mul(v[0], m[4], mt[4]), add(mul(v[1], m[5], mt[5]), add(mul(v[2], m[6], mt[6]), mul(v[3], m[7], mt[7]))))); } // bit indices id[.] are given from high to low (e.g. control first for CNOT) template -void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask) +void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len) { - std::size_t n = psi.size(); + std::size_t n = len; std::size_t d0 = 1UL << id0; std::size_t d1 = 1UL << id1; diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp --- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp 2020-06-05 21:07:57.000000000 +0800 +++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp 2021-04-19 18:04:05.541802882 +0800 @@ -17,10 +17,10 @@ { __m256d v[4]; - v[0] = load2(&psi[I]); - v[1] = load2(&psi[I + d0]); - v[2] = load2(&psi[I + d1]); - v[3] = load2(&psi[I + d0 + d1]); + v[0] = load2(psi + 2 * I); + v[1] = load2(psi + 2 * (I + d0)); + v[2] = load2(psi + 2 * (I + d1)); + v[3] = load2(psi + 2 * (I + d0 + d1)); __m256d tmp[4]; @@ -29,23 +29,23 @@ tmp[2] = add(mul(v[0], m[8], mt[8]), add(mul(v[1], m[9], mt[9]), add(mul(v[2], m[10], mt[10]), mul(v[3], m[11], mt[11])))); tmp[3] = add(mul(v[0], m[12], mt[12]), add(mul(v[1], m[13], mt[13]), add(mul(v[2], m[14], mt[14]), mul(v[3], m[15], mt[15])))); - v[0] = load2(&psi[I + d2]); - v[1] = load2(&psi[I + d0 + d2]); - v[2] = load2(&psi[I + d1 + d2]); - v[3] = load2(&psi[I + d0 + d1 + d2]); - - _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[16], mt[16]), add(mul(v[1], m[17], mt[17]), add(mul(v[2], m[18], mt[18]), mul(v[3], m[19], mt[19])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[20], mt[20]), add(mul(v[1], m[21], mt[21]), add(mul(v[2], m[22], mt[22]), mul(v[3], m[23], mt[23])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31])))))); + v[0] = load2(psi + 2 * (I + d2)); + v[1] = load2(psi + 2 * (I + d0 + d2)); + v[2] = load2(psi + 2 * (I + d1 + d2)); + v[3] = load2(psi + 2 * (I + d0 + d1 + d2)); + + _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(tmp[0], add(mul(v[0], m[16], mt[16]), add(mul(v[1], m[17], mt[17]), add(mul(v[2], m[18], mt[18]), mul(v[3], m[19], mt[19])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(tmp[1], add(mul(v[0], m[20], mt[20]), add(mul(v[1], m[21], mt[21]), add(mul(v[2], m[22], mt[22]), mul(v[3], m[23], mt[23])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2), psi + 2 * (I + d2), add(tmp[2], add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2), psi + 2 * (I + d1 + d2), add(tmp[3], add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31])))))); } // bit indices id[.] are given from high to low (e.g. control first for CNOT) template -void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask) +void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len) { - std::size_t n = psi.size(); + std::size_t n = len; std::size_t d0 = 1UL << id0; std::size_t d1 = 1UL << id1; std::size_t d2 = 1UL << id2; diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp --- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp 2020-06-05 21:07:57.000000000 +0800 +++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp 2021-04-19 18:04:05.541802882 +0800 @@ -17,10 +17,10 @@ { __m256d v[4]; - v[0] = load2(&psi[I]); - v[1] = load2(&psi[I + d0]); - v[2] = load2(&psi[I + d1]); - v[3] = load2(&psi[I + d0 + d1]); + v[0] = load2(psi + 2 * I); + v[1] = load2(psi + 2 * (I + d0)); + v[2] = load2(psi + 2 * (I + d1)); + v[3] = load2(psi + 2 * (I + d0 + d1)); __m256d tmp[8]; @@ -33,10 +33,10 @@ tmp[6] = add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27])))); tmp[7] = add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31])))); - v[0] = load2(&psi[I + d2]); - v[1] = load2(&psi[I + d0 + d2]); - v[2] = load2(&psi[I + d1 + d2]); - v[3] = load2(&psi[I + d0 + d1 + d2]); + v[0] = load2(psi + 2 * (I + d2)); + v[1] = load2(psi + 2 * (I + d0 + d2)); + v[2] = load2(psi + 2 * (I + d1 + d2)); + v[3] = load2(psi + 2 * (I + d0 + d1 + d2)); tmp[0] = add(tmp[0], add(mul(v[0], m[32], mt[32]), add(mul(v[1], m[33], mt[33]), add(mul(v[2], m[34], mt[34]), mul(v[3], m[35], mt[35]))))); tmp[1] = add(tmp[1], add(mul(v[0], m[36], mt[36]), add(mul(v[1], m[37], mt[37]), add(mul(v[2], m[38], mt[38]), mul(v[3], m[39], mt[39]))))); @@ -47,10 +47,10 @@ tmp[6] = add(tmp[6], add(mul(v[0], m[56], mt[56]), add(mul(v[1], m[57], mt[57]), add(mul(v[2], m[58], mt[58]), mul(v[3], m[59], mt[59]))))); tmp[7] = add(tmp[7], add(mul(v[0], m[60], mt[60]), add(mul(v[1], m[61], mt[61]), add(mul(v[2], m[62], mt[62]), mul(v[3], m[63], mt[63]))))); - v[0] = load2(&psi[I + d3]); - v[1] = load2(&psi[I + d0 + d3]); - v[2] = load2(&psi[I + d1 + d3]); - v[3] = load2(&psi[I + d0 + d1 + d3]); + v[0] = load2(psi + 2 * (I + d3)); + v[1] = load2(psi + 2 * (I + d0 + d3)); + v[2] = load2(psi + 2 * (I + d1 + d3)); + v[3] = load2(psi + 2 * (I + d0 + d1 + d3)); tmp[0] = add(tmp[0], add(mul(v[0], m[64], mt[64]), add(mul(v[1], m[65], mt[65]), add(mul(v[2], m[66], mt[66]), mul(v[3], m[67], mt[67]))))); tmp[1] = add(tmp[1], add(mul(v[0], m[68], mt[68]), add(mul(v[1], m[69], mt[69]), add(mul(v[2], m[70], mt[70]), mul(v[3], m[71], mt[71]))))); @@ -61,27 +61,27 @@ tmp[6] = add(tmp[6], add(mul(v[0], m[88], mt[88]), add(mul(v[1], m[89], mt[89]), add(mul(v[2], m[90], mt[90]), mul(v[3], m[91], mt[91]))))); tmp[7] = add(tmp[7], add(mul(v[0], m[92], mt[92]), add(mul(v[1], m[93], mt[93]), add(mul(v[2], m[94], mt[94]), mul(v[3], m[95], mt[95]))))); - v[0] = load2(&psi[I + d2 + d3]); - v[1] = load2(&psi[I + d0 + d2 + d3]); - v[2] = load2(&psi[I + d1 + d2 + d3]); - v[3] = load2(&psi[I + d0 + d1 + d2 + d3]); - - _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[96], mt[96]), add(mul(v[1], m[97], mt[97]), add(mul(v[2], m[98], mt[98]), mul(v[3], m[99], mt[99])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[100], mt[100]), add(mul(v[1], m[101], mt[101]), add(mul(v[2], m[102], mt[102]), mul(v[3], m[103], mt[103])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[104], mt[104]), add(mul(v[1], m[105], mt[105]), add(mul(v[2], m[106], mt[106]), mul(v[3], m[107], mt[107])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[108], mt[108]), add(mul(v[1], m[109], mt[109]), add(mul(v[2], m[110], mt[110]), mul(v[3], m[111], mt[111])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d3], (double*)&psi[I + d3], add(tmp[4], add(mul(v[0], m[112], mt[112]), add(mul(v[1], m[113], mt[113]), add(mul(v[2], m[114], mt[114]), mul(v[3], m[115], mt[115])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], add(tmp[5], add(mul(v[0], m[116], mt[116]), add(mul(v[1], m[117], mt[117]), add(mul(v[2], m[118], mt[118]), mul(v[3], m[119], mt[119])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], add(tmp[6], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], add(tmp[7], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127])))))); + v[0] = load2(psi + 2 * (I + d2 + d3)); + v[1] = load2(psi + 2 * (I + d0 + d2 + d3)); + v[2] = load2(psi + 2 * (I + d1 + d2 + d3)); + v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d3)); + + _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(tmp[0], add(mul(v[0], m[96], mt[96]), add(mul(v[1], m[97], mt[97]), add(mul(v[2], m[98], mt[98]), mul(v[3], m[99], mt[99])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(tmp[1], add(mul(v[0], m[100], mt[100]), add(mul(v[1], m[101], mt[101]), add(mul(v[2], m[102], mt[102]), mul(v[3], m[103], mt[103])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2), psi + 2 * (I + d2), add(tmp[2], add(mul(v[0], m[104], mt[104]), add(mul(v[1], m[105], mt[105]), add(mul(v[2], m[106], mt[106]), mul(v[3], m[107], mt[107])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2), psi + 2 * (I + d1 + d2), add(tmp[3], add(mul(v[0], m[108], mt[108]), add(mul(v[1], m[109], mt[109]), add(mul(v[2], m[110], mt[110]), mul(v[3], m[111], mt[111])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d3), psi + 2 * (I + d3), add(tmp[4], add(mul(v[0], m[112], mt[112]), add(mul(v[1], m[113], mt[113]), add(mul(v[2], m[114], mt[114]), mul(v[3], m[115], mt[115])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d3), psi + 2 * (I + d1 + d3), add(tmp[5], add(mul(v[0], m[116], mt[116]), add(mul(v[1], m[117], mt[117]), add(mul(v[2], m[118], mt[118]), mul(v[3], m[119], mt[119])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d3), psi + 2 * (I + d2 + d3), add(tmp[6], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d3), psi + 2 * (I + d1 + d2 + d3), add(tmp[7], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127])))))); } // bit indices id[.] are given from high to low (e.g. control first for CNOT) template -void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask) +void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len) { - std::size_t n = psi.size(); + std::size_t n = len; std::size_t d0 = 1UL << id0; std::size_t d1 = 1UL << id1; std::size_t d2 = 1UL << id2; diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp --- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp 2020-06-05 21:07:57.000000000 +0800 +++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp 2021-04-19 18:04:05.541802882 +0800 @@ -17,10 +17,10 @@ { __m256d v[4]; - v[0] = load2(&psi[I]); - v[1] = load2(&psi[I + d0]); - v[2] = load2(&psi[I + d1]); - v[3] = load2(&psi[I + d0 + d1]); + v[0] = load2(psi + 2 * I); + v[1] = load2(psi + 2 * (I + d0)); + v[2] = load2(psi + 2 * (I + d1)); + v[3] = load2(psi + 2 * (I + d0 + d1)); __m256d tmp[16]; @@ -41,10 +41,10 @@ tmp[14] = add(mul(v[0], m[56], mt[56]), add(mul(v[1], m[57], mt[57]), add(mul(v[2], m[58], mt[58]), mul(v[3], m[59], mt[59])))); tmp[15] = add(mul(v[0], m[60], mt[60]), add(mul(v[1], m[61], mt[61]), add(mul(v[2], m[62], mt[62]), mul(v[3], m[63], mt[63])))); - v[0] = load2(&psi[I + d2]); - v[1] = load2(&psi[I + d0 + d2]); - v[2] = load2(&psi[I + d1 + d2]); - v[3] = load2(&psi[I + d0 + d1 + d2]); + v[0] = load2(psi + 2 * (I + d2)); + v[1] = load2(psi + 2 * (I + d0 + d2)); + v[2] = load2(psi + 2 * (I + d1 + d2)); + v[3] = load2(psi + 2 * (I + d0 + d1 + d2)); tmp[0] = add(tmp[0], add(mul(v[0], m[64], mt[64]), add(mul(v[1], m[65], mt[65]), add(mul(v[2], m[66], mt[66]), mul(v[3], m[67], mt[67]))))); tmp[1] = add(tmp[1], add(mul(v[0], m[68], mt[68]), add(mul(v[1], m[69], mt[69]), add(mul(v[2], m[70], mt[70]), mul(v[3], m[71], mt[71]))))); @@ -63,10 +63,10 @@ tmp[14] = add(tmp[14], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123]))))); tmp[15] = add(tmp[15], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127]))))); - v[0] = load2(&psi[I + d3]); - v[1] = load2(&psi[I + d0 + d3]); - v[2] = load2(&psi[I + d1 + d3]); - v[3] = load2(&psi[I + d0 + d1 + d3]); + v[0] = load2(psi + 2 * (I + d3)); + v[1] = load2(psi + 2 * (I + d0 + d3)); + v[2] = load2(psi + 2 * (I + d1 + d3)); + v[3] = load2(psi + 2 * (I + d0 + d1 + d3)); tmp[0] = add(tmp[0], add(mul(v[0], m[128], mt[128]), add(mul(v[1], m[129], mt[129]), add(mul(v[2], m[130], mt[130]), mul(v[3], m[131], mt[131]))))); tmp[1] = add(tmp[1], add(mul(v[0], m[132], mt[132]), add(mul(v[1], m[133], mt[133]), add(mul(v[2], m[134], mt[134]), mul(v[3], m[135], mt[135]))))); @@ -85,10 +85,10 @@ tmp[14] = add(tmp[14], add(mul(v[0], m[184], mt[184]), add(mul(v[1], m[185], mt[185]), add(mul(v[2], m[186], mt[186]), mul(v[3], m[187], mt[187]))))); tmp[15] = add(tmp[15], add(mul(v[0], m[188], mt[188]), add(mul(v[1], m[189], mt[189]), add(mul(v[2], m[190], mt[190]), mul(v[3], m[191], mt[191]))))); - v[0] = load2(&psi[I + d2 + d3]); - v[1] = load2(&psi[I + d0 + d2 + d3]); - v[2] = load2(&psi[I + d1 + d2 + d3]); - v[3] = load2(&psi[I + d0 + d1 + d2 + d3]); + v[0] = load2(psi + 2 * (I + d2 + d3)); + v[1] = load2(psi + 2 * (I + d0 + d2 + d3)); + v[2] = load2(psi + 2 * (I + d1 + d2 + d3)); + v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d3)); tmp[0] = add(tmp[0], add(mul(v[0], m[192], mt[192]), add(mul(v[1], m[193], mt[193]), add(mul(v[2], m[194], mt[194]), mul(v[3], m[195], mt[195]))))); tmp[1] = add(tmp[1], add(mul(v[0], m[196], mt[196]), add(mul(v[1], m[197], mt[197]), add(mul(v[2], m[198], mt[198]), mul(v[3], m[199], mt[199]))))); @@ -107,10 +107,10 @@ tmp[14] = add(tmp[14], add(mul(v[0], m[248], mt[248]), add(mul(v[1], m[249], mt[249]), add(mul(v[2], m[250], mt[250]), mul(v[3], m[251], mt[251]))))); tmp[15] = add(tmp[15], add(mul(v[0], m[252], mt[252]), add(mul(v[1], m[253], mt[253]), add(mul(v[2], m[254], mt[254]), mul(v[3], m[255], mt[255]))))); - v[0] = load2(&psi[I + d4]); - v[1] = load2(&psi[I + d0 + d4]); - v[2] = load2(&psi[I + d1 + d4]); - v[3] = load2(&psi[I + d0 + d1 + d4]); + v[0] = load2(psi + 2 * (I + d4)); + v[1] = load2(psi + 2 * (I + d0 + d4)); + v[2] = load2(psi + 2 * (I + d1 + d4)); + v[3] = load2(psi + 2 * (I + d0 + d1 + d4)); tmp[0] = add(tmp[0], add(mul(v[0], m[256], mt[256]), add(mul(v[1], m[257], mt[257]), add(mul(v[2], m[258], mt[258]), mul(v[3], m[259], mt[259]))))); tmp[1] = add(tmp[1], add(mul(v[0], m[260], mt[260]), add(mul(v[1], m[261], mt[261]), add(mul(v[2], m[262], mt[262]), mul(v[3], m[263], mt[263]))))); @@ -129,10 +129,10 @@ tmp[14] = add(tmp[14], add(mul(v[0], m[312], mt[312]), add(mul(v[1], m[313], mt[313]), add(mul(v[2], m[314], mt[314]), mul(v[3], m[315], mt[315]))))); tmp[15] = add(tmp[15], add(mul(v[0], m[316], mt[316]), add(mul(v[1], m[317], mt[317]), add(mul(v[2], m[318], mt[318]), mul(v[3], m[319], mt[319]))))); - v[0] = load2(&psi[I + d2 + d4]); - v[1] = load2(&psi[I + d0 + d2 + d4]); - v[2] = load2(&psi[I + d1 + d2 + d4]); - v[3] = load2(&psi[I + d0 + d1 + d2 + d4]); + v[0] = load2(psi + 2 * (I + d2 + d4)); + v[1] = load2(psi + 2 * (I + d0 + d2 + d4)); + v[2] = load2(psi + 2 * (I + d1 + d2 + d4)); + v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d4)); tmp[0] = add(tmp[0], add(mul(v[0], m[320], mt[320]), add(mul(v[1], m[321], mt[321]), add(mul(v[2], m[322], mt[322]), mul(v[3], m[323], mt[323]))))); tmp[1] = add(tmp[1], add(mul(v[0], m[324], mt[324]), add(mul(v[1], m[325], mt[325]), add(mul(v[2], m[326], mt[326]), mul(v[3], m[327], mt[327]))))); @@ -151,10 +151,10 @@ tmp[14] = add(tmp[14], add(mul(v[0], m[376], mt[376]), add(mul(v[1], m[377], mt[377]), add(mul(v[2], m[378], mt[378]), mul(v[3], m[379], mt[379]))))); tmp[15] = add(tmp[15], add(mul(v[0], m[380], mt[380]), add(mul(v[1], m[381], mt[381]), add(mul(v[2], m[382], mt[382]), mul(v[3], m[383], mt[383]))))); - v[0] = load2(&psi[I + d3 + d4]); - v[1] = load2(&psi[I + d0 + d3 + d4]); - v[2] = load2(&psi[I + d1 + d3 + d4]); - v[3] = load2(&psi[I + d0 + d1 + d3 + d4]); + v[0] = load2(psi + 2 * (I + d3 + d4)); + v[1] = load2(psi + 2 * (I + d0 + d3 + d4)); + v[2] = load2(psi + 2 * (I + d1 + d3 + d4)); + v[3] = load2(psi + 2 * (I + d0 + d1 + d3 + d4)); tmp[0] = add(tmp[0], add(mul(v[0], m[384], mt[384]), add(mul(v[1], m[385], mt[385]), add(mul(v[2], m[386], mt[386]), mul(v[3], m[387], mt[387]))))); tmp[1] = add(tmp[1], add(mul(v[0], m[388], mt[388]), add(mul(v[1], m[389], mt[389]), add(mul(v[2], m[390], mt[390]), mul(v[3], m[391], mt[391]))))); @@ -173,35 +173,35 @@ tmp[14] = add(tmp[14], add(mul(v[0], m[440], mt[440]), add(mul(v[1], m[441], mt[441]), add(mul(v[2], m[442], mt[442]), mul(v[3], m[443], mt[443]))))); tmp[15] = add(tmp[15], add(mul(v[0], m[444], mt[444]), add(mul(v[1], m[445], mt[445]), add(mul(v[2], m[446], mt[446]), mul(v[3], m[447], mt[447]))))); - v[0] = load2(&psi[I + d2 + d3 + d4]); - v[1] = load2(&psi[I + d0 + d2 + d3 + d4]); - v[2] = load2(&psi[I + d1 + d2 + d3 + d4]); - v[3] = load2(&psi[I + d0 + d1 + d2 + d3 + d4]); - - _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[448], mt[448]), add(mul(v[1], m[449], mt[449]), add(mul(v[2], m[450], mt[450]), mul(v[3], m[451], mt[451])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[452], mt[452]), add(mul(v[1], m[453], mt[453]), add(mul(v[2], m[454], mt[454]), mul(v[3], m[455], mt[455])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[456], mt[456]), add(mul(v[1], m[457], mt[457]), add(mul(v[2], m[458], mt[458]), mul(v[3], m[459], mt[459])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[460], mt[460]), add(mul(v[1], m[461], mt[461]), add(mul(v[2], m[462], mt[462]), mul(v[3], m[463], mt[463])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d3], (double*)&psi[I + d3], add(tmp[4], add(mul(v[0], m[464], mt[464]), add(mul(v[1], m[465], mt[465]), add(mul(v[2], m[466], mt[466]), mul(v[3], m[467], mt[467])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], add(tmp[5], add(mul(v[0], m[468], mt[468]), add(mul(v[1], m[469], mt[469]), add(mul(v[2], m[470], mt[470]), mul(v[3], m[471], mt[471])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], add(tmp[6], add(mul(v[0], m[472], mt[472]), add(mul(v[1], m[473], mt[473]), add(mul(v[2], m[474], mt[474]), mul(v[3], m[475], mt[475])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], add(tmp[7], add(mul(v[0], m[476], mt[476]), add(mul(v[1], m[477], mt[477]), add(mul(v[2], m[478], mt[478]), mul(v[3], m[479], mt[479])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d4], (double*)&psi[I + d4], add(tmp[8], add(mul(v[0], m[480], mt[480]), add(mul(v[1], m[481], mt[481]), add(mul(v[2], m[482], mt[482]), mul(v[3], m[483], mt[483])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], add(tmp[9], add(mul(v[0], m[484], mt[484]), add(mul(v[1], m[485], mt[485]), add(mul(v[2], m[486], mt[486]), mul(v[3], m[487], mt[487])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], add(tmp[10], add(mul(v[0], m[488], mt[488]), add(mul(v[1], m[489], mt[489]), add(mul(v[2], m[490], mt[490]), mul(v[3], m[491], mt[491])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], add(tmp[11], add(mul(v[0], m[492], mt[492]), add(mul(v[1], m[493], mt[493]), add(mul(v[2], m[494], mt[494]), mul(v[3], m[495], mt[495])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], add(tmp[12], add(mul(v[0], m[496], mt[496]), add(mul(v[1], m[497], mt[497]), add(mul(v[2], m[498], mt[498]), mul(v[3], m[499], mt[499])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], add(tmp[13], add(mul(v[0], m[500], mt[500]), add(mul(v[1], m[501], mt[501]), add(mul(v[2], m[502], mt[502]), mul(v[3], m[503], mt[503])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], add(tmp[14], add(mul(v[0], m[504], mt[504]), add(mul(v[1], m[505], mt[505]), add(mul(v[2], m[506], mt[506]), mul(v[3], m[507], mt[507])))))); - _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], add(tmp[15], add(mul(v[0], m[508], mt[508]), add(mul(v[1], m[509], mt[509]), add(mul(v[2], m[510], mt[510]), mul(v[3], m[511], mt[511])))))); + v[0] = load2(psi + 2 * (I + d2 + d3 + d4)); + v[1] = load2(psi + 2 * (I + d0 + d2 + d3 + d4)); + v[2] = load2(psi + 2 * (I + d1 + d2 + d3 + d4)); + v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d3 + d4)); + + _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(tmp[0], add(mul(v[0], m[448], mt[448]), add(mul(v[1], m[449], mt[449]), add(mul(v[2], m[450], mt[450]), mul(v[3], m[451], mt[451])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(tmp[1], add(mul(v[0], m[452], mt[452]), add(mul(v[1], m[453], mt[453]), add(mul(v[2], m[454], mt[454]), mul(v[3], m[455], mt[455])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2), psi + 2 * (I + d2), add(tmp[2], add(mul(v[0], m[456], mt[456]), add(mul(v[1], m[457], mt[457]), add(mul(v[2], m[458], mt[458]), mul(v[3], m[459], mt[459])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2), psi + 2 * (I + d1 + d2), add(tmp[3], add(mul(v[0], m[460], mt[460]), add(mul(v[1], m[461], mt[461]), add(mul(v[2], m[462], mt[462]), mul(v[3], m[463], mt[463])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d3), psi + 2 * (I + d3), add(tmp[4], add(mul(v[0], m[464], mt[464]), add(mul(v[1], m[465], mt[465]), add(mul(v[2], m[466], mt[466]), mul(v[3], m[467], mt[467])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d3), psi + 2 * (I + d1 + d3), add(tmp[5], add(mul(v[0], m[468], mt[468]), add(mul(v[1], m[469], mt[469]), add(mul(v[2], m[470], mt[470]), mul(v[3], m[471], mt[471])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d3), psi + 2 * (I + d2 + d3), add(tmp[6], add(mul(v[0], m[472], mt[472]), add(mul(v[1], m[473], mt[473]), add(mul(v[2], m[474], mt[474]), mul(v[3], m[475], mt[475])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d3), psi + 2 * (I + d1 + d2 + d3), add(tmp[7], add(mul(v[0], m[476], mt[476]), add(mul(v[1], m[477], mt[477]), add(mul(v[2], m[478], mt[478]), mul(v[3], m[479], mt[479])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d4), psi + 2 * (I + d4), add(tmp[8], add(mul(v[0], m[480], mt[480]), add(mul(v[1], m[481], mt[481]), add(mul(v[2], m[482], mt[482]), mul(v[3], m[483], mt[483])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d4), psi + 2 * (I + d1 + d4), add(tmp[9], add(mul(v[0], m[484], mt[484]), add(mul(v[1], m[485], mt[485]), add(mul(v[2], m[486], mt[486]), mul(v[3], m[487], mt[487])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d4), psi + 2 * (I + d2 + d4), add(tmp[10], add(mul(v[0], m[488], mt[488]), add(mul(v[1], m[489], mt[489]), add(mul(v[2], m[490], mt[490]), mul(v[3], m[491], mt[491])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d4), psi + 2 * (I + d1 + d2 + d4), add(tmp[11], add(mul(v[0], m[492], mt[492]), add(mul(v[1], m[493], mt[493]), add(mul(v[2], m[494], mt[494]), mul(v[3], m[495], mt[495])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d3 + d4), psi + 2 * (I + d3 + d4), add(tmp[12], add(mul(v[0], m[496], mt[496]), add(mul(v[1], m[497], mt[497]), add(mul(v[2], m[498], mt[498]), mul(v[3], m[499], mt[499])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d3 + d4), psi + 2 * (I + d1 + d3 + d4), add(tmp[13], add(mul(v[0], m[500], mt[500]), add(mul(v[1], m[501], mt[501]), add(mul(v[2], m[502], mt[502]), mul(v[3], m[503], mt[503])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d3 + d4), psi + 2 * (I + d2 + d3 + d4), add(tmp[14], add(mul(v[0], m[504], mt[504]), add(mul(v[1], m[505], mt[505]), add(mul(v[2], m[506], mt[506]), mul(v[3], m[507], mt[507])))))); + _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d3 + d4), psi + 2 * (I + d1 + d2 + d3 + d4), add(tmp[15], add(mul(v[0], m[508], mt[508]), add(mul(v[1], m[509], mt[509]), add(mul(v[2], m[510], mt[510]), mul(v[3], m[511], mt[511])))))); } // bit indices id[.] are given from high to low (e.g. control first for CNOT) template -void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask) +void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len) { - std::size_t n = psi.size(); + std::size_t n = len; std::size_t d0 = 1UL << id0; std::size_t d1 = 1UL << id1; std::size_t d2 = 1UL << id2; diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/simulator.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/simulator.hpp --- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/simulator.hpp 2020-06-05 21:07:57.000000000 +0800 +++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/simulator.hpp 2021-04-20 11:46:27.115554725 +0800 @@ -18,11 +18,7 @@ #include #include -#if defined(NOINTRIN) || !defined(INTRIN) -#include "nointrin/kernels.hpp" -#else #include "intrin/kernels.hpp" -#endif #include "intrin/alignedallocator.hpp" #include "fusion.hpp" @@ -32,173 +28,29 @@ #include #include #include - +#include class Simulator{ public: using calc_type = double; using complex_type = std::complex; - using StateVector = std::vector>; + using StateVector = calc_type *; using Map = std::map; using RndEngine = std::mt19937; using Term = std::vector>; using TermsDict = std::vector>; using ComplexTermsDict = std::vector>; + StateVector vec_; - Simulator(unsigned seed = 1) : N_(0), vec_(1,0.), fusion_qubits_min_(4), + Simulator(unsigned seed = 1, unsigned N = 0) : N_(N), fusion_qubits_min_(4), fusion_qubits_max_(5), rnd_eng_(seed) { + len_ = 1UL << (N_ + 1); + vec_ = (StateVector)calloc(len_, sizeof(calc_type)); vec_[0]=1.; // all-zero initial state std::uniform_real_distribution dist(0., 1.); rng_ = std::bind(dist, std::ref(rnd_eng_)); - } - - void allocate_qubit(unsigned id){ - if (map_.count(id) == 0){ - map_[id] = N_++; - StateVector newvec; // avoid large memory allocations - if( tmpBuff1_.capacity() >= (1UL << N_) ) - std::swap(newvec, tmpBuff1_); - newvec.resize(1UL << N_); -#pragma omp parallel for schedule(static) - for (std::size_t i = 0; i < newvec.size(); ++i) - newvec[i] = (i < vec_.size())?vec_[i]:0.; - std::swap(vec_, newvec); - // recycle large memory - std::swap(tmpBuff1_, newvec); - if( tmpBuff1_.capacity() < tmpBuff2_.capacity() ) - std::swap(tmpBuff1_, tmpBuff2_); - } - else - throw(std::runtime_error( - "AllocateQubit: ID already exists. Qubit IDs should be unique.")); - } - - bool get_classical_value(unsigned id, calc_type tol = 1.e-12){ - run(); - unsigned pos = map_[id]; - std::size_t delta = (1UL << pos); - - for (std::size_t i = 0; i < vec_.size(); i += 2*delta){ - for (std::size_t j = 0; j < delta; ++j){ - if (std::norm(vec_[i+j]) > tol) - return false; - if (std::norm(vec_[i+j+delta]) > tol) - return true; - } - } - assert(false); // this will never happen - return false; // suppress 'control reaches end of non-void...' - } - - bool is_classical(unsigned id, calc_type tol = 1.e-12){ - run(); - unsigned pos = map_[id]; - std::size_t delta = (1UL << pos); - - short up = 0, down = 0; - #pragma omp parallel for schedule(static) reduction(|:up,down) - for (std::size_t i = 0; i < vec_.size(); i += 2*delta){ - for (std::size_t j = 0; j < delta; ++j){ - up = up | ((std::norm(vec_[i+j]) > tol)&1); - down = down | ((std::norm(vec_[i+j+delta]) > tol)&1); - } - } - - return 1 == (up^down); - } - - void collapse_vector(unsigned id, bool value = false, bool shrink = false){ - run(); - unsigned pos = map_[id]; - std::size_t delta = (1UL << pos); - - if (!shrink){ - #pragma omp parallel for schedule(static) - for (std::size_t i = 0; i < vec_.size(); i += 2*delta){ - for (std::size_t j = 0; j < delta; ++j) - vec_[i+j+static_cast(!value)*delta] = 0.; - } - } - else{ - StateVector newvec; // avoid costly memory reallocations - if( tmpBuff1_.capacity() >= (1UL << (N_-1)) ) - std::swap(tmpBuff1_, newvec); - newvec.resize((1UL << (N_-1))); - #pragma omp parallel for schedule(static) if(0) - for (std::size_t i = 0; i < vec_.size(); i += 2*delta) - std::copy_n(&vec_[i + static_cast(value)*delta], - delta, &newvec[i/2]); - std::swap(vec_, newvec); - std::swap(tmpBuff1_, newvec); - if( tmpBuff1_.capacity() < tmpBuff2_.capacity() ) - std::swap(tmpBuff1_, tmpBuff2_); - - for (auto& p : map_){ - if (p.second > pos) - p.second--; - } - map_.erase(id); - N_--; - } - } - - void measure_qubits(std::vector const& ids, std::vector &res){ - run(); - - std::vector positions(ids.size()); - for (unsigned i = 0; i < ids.size(); ++i) - positions[i] = map_[ids[i]]; - - calc_type P = 0.; - calc_type rnd = rng_(); - - // pick entry at random with probability |entry|^2 - std::size_t pick = 0; - while (P < rnd && pick < vec_.size()) - P += std::norm(vec_[pick++]); - - pick--; - // determine result vector (boolean values for each qubit) - // and create mask to detect bad entries (i.e., entries that don't agree with measurement) - res = std::vector(ids.size()); - std::size_t mask = 0; - std::size_t val = 0; - for (unsigned i = 0; i < ids.size(); ++i){ - bool r = ((pick >> positions[i]) & 1) == 1; - res[i] = r; - mask |= (1UL << positions[i]); - val |= (static_cast(r&1) << positions[i]); - } - // set bad entries to 0 - calc_type N = 0.; - #pragma omp parallel for reduction(+:N) schedule(static) - for (std::size_t i = 0; i < vec_.size(); ++i){ - if ((i & mask) != val) - vec_[i] = 0.; - else - N += std::norm(vec_[i]); - } - // re-normalize - N = 1./std::sqrt(N); - #pragma omp parallel for schedule(static) - for (std::size_t i = 0; i < vec_.size(); ++i) - vec_[i] *= N; - } - - std::vector measure_qubits_return(std::vector const& ids){ - std::vector ret; - measure_qubits(ids, ret); - return ret; - } - - void deallocate_qubit(unsigned id){ - run(); - assert(map_.count(id) == 1); - if (!is_classical(id)) - throw(std::runtime_error("Error: Qubit has not been measured / uncomputed! There is most likely a bug in your code.")); - - bool value = get_classical_value(id); - collapse_vector(id, value, true); + for (unsigned i = 0; i < N_; i++) + map_[i] = i; } template @@ -221,84 +73,13 @@ fused_gates_ = fused_gates; } - template - void emulate_math(F const& f, QuReg quregs, const std::vector& ctrl, - bool parallelize = false){ - run(); - auto ctrlmask = get_control_mask(ctrl); - - for (unsigned i = 0; i < quregs.size(); ++i) - for (unsigned j = 0; j < quregs[i].size(); ++j) - quregs[i][j] = map_[quregs[i][j]]; - - StateVector newvec; // avoid costly memory reallocations - if( tmpBuff1_.capacity() >= vec_.size() ) - std::swap(newvec, tmpBuff1_); - newvec.resize(vec_.size()); -#pragma omp parallel for schedule(static) - for (std::size_t i = 0; i < vec_.size(); i++) - newvec[i] = 0; - -//#pragma omp parallel reduction(+:newvec[:newvec.size()]) if(parallelize) // requires OpenMP 4.5 - { - std::vector res(quregs.size()); - //#pragma omp for schedule(static) - for (std::size_t i = 0; i < vec_.size(); ++i){ - if ((ctrlmask&i) == ctrlmask){ - for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){ - res[qr_i] = 0; - for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i) - res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i; - } - f(res); - auto new_i = i; - for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){ - for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){ - if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1))) - new_i ^= (1UL << quregs[qr_i][qb_i]); - } - } - newvec[new_i] += vec_[i]; - } - else - newvec[i] += vec_[i]; - } - } - std::swap(vec_, newvec); - std::swap(tmpBuff1_, newvec); - } - - // faster version without calling python - template - inline void emulate_math_addConstant(int a, const QuReg& quregs, const std::vector& ctrl) - { - emulate_math([a](std::vector &res){for(auto& x: res) x = x + a;}, quregs, ctrl, true); - } - - // faster version without calling python - template - inline void emulate_math_addConstantModN(int a, int N, const QuReg& quregs, const std::vector& ctrl) - { - emulate_math([a,N](std::vector &res){for(auto& x: res) x = (x + a) % N;}, quregs, ctrl, true); - } - - // faster version without calling python - template - inline void emulate_math_multiplyByConstantModN(int a, int N, const QuReg& quregs, const std::vector& ctrl) - { - emulate_math([a,N](std::vector &res){for(auto& x: res) x = (x * a) % N;}, quregs, ctrl, true); - } - calc_type get_expectation_value(TermsDict const& td, std::vector const& ids){ run(); calc_type expectation = 0.; - StateVector current_state; // avoid costly memory reallocations - if( tmpBuff1_.capacity() >= vec_.size() ) - std::swap(tmpBuff1_, current_state); - current_state.resize(vec_.size()); + StateVector current_state = (StateVector)malloc(len_ *sizeof(calc_type)); #pragma omp parallel for schedule(static) - for (std::size_t i = 0; i < vec_.size(); ++i) + for (std::size_t i = 0; i < len_; ++i) current_state[i] = vec_[i]; for (auto const& term : td){ @@ -306,81 +87,53 @@ apply_term(term.first, ids, {}); calc_type delta = 0.; #pragma omp parallel for reduction(+:delta) schedule(static) - for (std::size_t i = 0; i < vec_.size(); ++i){ - auto const a1 = std::real(current_state[i]); - auto const b1 = -std::imag(current_state[i]); - auto const a2 = std::real(vec_[i]); - auto const b2 = std::imag(vec_[i]); + for (std::size_t i = 0; i < (len_ >> 1); ++i){ + auto const a1 = current_state[2 * i]; + auto const b1 = -current_state[2 * i + 1]; + auto const a2 = vec_[2 * i]; + auto const b2 = vec_[2 * i + 1]; delta += a1 * a2 - b1 * b2; // reset vec_ - vec_[i] = current_state[i]; + vec_[2 * i] = current_state[2 * i]; + vec_[2 * i + 1] = current_state[2 * i + 1]; } expectation += coefficient * delta; } - std::swap(current_state, tmpBuff1_); + if (NULL != current_state){ + free(current_state); + current_state = NULL; + } return expectation; } void apply_qubit_operator(ComplexTermsDict const& td, std::vector const& ids){ run(); - StateVector new_state, current_state; // avoid costly memory reallocations - if( tmpBuff1_.capacity() >= vec_.size() ) - std::swap(tmpBuff1_, new_state); - if( tmpBuff2_.capacity() >= vec_.size() ) - std::swap(tmpBuff2_, current_state); - new_state.resize(vec_.size()); - current_state.resize(vec_.size()); + StateVector new_state = (StateVector)calloc(len_, sizeof(calc_type)); + StateVector current_state = (StateVector)malloc(len_ * sizeof(calc_type)); #pragma omp parallel for schedule(static) - for (std::size_t i = 0; i < vec_.size(); ++i){ - new_state[i] = 0; + for (std::size_t i = 0; i < len_; ++i){ current_state[i] = vec_[i]; } for (auto const& term : td){ auto const& coefficient = term.second; apply_term(term.first, ids, {}); #pragma omp parallel for schedule(static) - for (std::size_t i = 0; i < vec_.size(); ++i){ - new_state[i] += coefficient * vec_[i]; - vec_[i] = current_state[i]; + for (std::size_t i = 0; i < (len_ >> 1); ++i){ + new_state[2 * i] += coefficient.real() * vec_[2 * i] - coefficient.imag() * vec_[2 * i + 1]; + new_state[2 * i + 1] += coefficient.real() * vec_[2 * i + 1] + coefficient.imag() * vec_[2 * i]; + vec_[2 * i] = current_state[2 * i]; + vec_[2 * i + 1] = current_state[2 * i + 1]; } } - std::swap(vec_, new_state); - std::swap(tmpBuff1_, new_state); - std::swap(tmpBuff2_, current_state); - } - - calc_type get_probability(std::vector const& bit_string, - std::vector const& ids){ - run(); - if (!check_ids(ids)) - throw(std::runtime_error("get_probability(): Unknown qubit id. Please make sure you have called eng.flush().")); - std::size_t mask = 0, bit_str = 0; - for (unsigned i = 0; i < ids.size(); ++i){ - mask |= 1UL << map_[ids[i]]; - bit_str |= (bit_string[i]?1UL:0UL) << map_[ids[i]]; - } - calc_type probability = 0.; - #pragma omp parallel for reduction(+:probability) schedule(static) - for (std::size_t i = 0; i < vec_.size(); ++i) - if ((i & mask) == bit_str) - probability += std::norm(vec_[i]); - return probability; - } - - complex_type const& get_amplitude(std::vector const& bit_string, - std::vector const& ids){ - run(); - std::size_t chk = 0; - std::size_t index = 0; - for (unsigned i = 0; i < ids.size(); ++i){ - if (map_.count(ids[i]) == 0) - break; - chk |= 1UL << map_[ids[i]]; - index |= (bit_string[i]?1UL:0UL) << map_[ids[i]]; + if (NULL != vec_) + free(vec_); + vec_ = new_state; + if (NULL != new_state) + new_state = NULL; + if (NULL != current_state){ + free(current_state); + current_state = NULL; } - if (chk + 1 != vec_.size()) - throw(std::runtime_error("The second argument to get_amplitude() must be a permutation of all allocated qubits. Please make sure you have called eng.flush().")); - return vec_[index]; } void emulate_time_evolution(TermsDict const& tdict, calc_type const& time, @@ -400,87 +153,71 @@ } unsigned s = std::abs(time) * op_nrm + 1.; complex_type correction = std::exp(-time * I * tr / (double)s); - auto output_state = vec_; + auto output_state = copy(vec_, len_); auto ctrlmask = get_control_mask(ctrl); for (unsigned i = 0; i < s; ++i){ calc_type nrm_change = 1.; for (unsigned k = 0; nrm_change > 1.e-12; ++k){ auto coeff = (-time * I) / double(s * (k + 1)); - auto current_state = vec_; - auto update = StateVector(vec_.size(), 0.); + auto current_state = copy(vec_, len_); + auto update = (StateVector)calloc(len_, sizeof(calc_type)); for (auto const& tup : td){ apply_term(tup.first, ids, {}); #pragma omp parallel for schedule(static) - for (std::size_t j = 0; j < vec_.size(); ++j){ + for (std::size_t j = 0; j < len_; ++j){ update[j] += vec_[j] * tup.second; vec_[j] = current_state[j]; } } nrm_change = 0.; #pragma omp parallel for reduction(+:nrm_change) schedule(static) - for (std::size_t j = 0; j < vec_.size(); ++j){ - update[j] *= coeff; - vec_[j] = update[j]; + for (std::size_t j = 0; j < (len_ >> 1); ++j){ + complex_type tmp(update[2 * j], update[2 * j + 1]); + tmp *= coeff; + update[2 * j] *= std::real(tmp); + update[2 * j + 1] *= std::imag(tmp); + vec_[2 * j] = update[2 * j]; + vec_[2 * j + 1] = update[2 * j + 1]; if ((j & ctrlmask) == ctrlmask){ - output_state[j] += update[j]; - nrm_change += std::norm(update[j]); + output_state[2 * j] += update[2 * j]; + output_state[2 * j + 1] += update[2 * j + 1]; + nrm_change += std::sqrt(update[2 * j] * update[2 * j] + update[2 * j + 1] * update[2 * j + 1]); } } nrm_change = std::sqrt(nrm_change); + if (NULL != current_state){ + free(current_state); + current_state = NULL; + } + if (NULL != update){ + free(update); + update = NULL; + } } #pragma omp parallel for schedule(static) - for (std::size_t j = 0; j < vec_.size(); ++j){ - if ((j & ctrlmask) == ctrlmask) - output_state[j] *= correction; - vec_[j] = output_state[j]; + for (std::size_t j = 0; j < (len_ >>1); ++j){ + if ((j & ctrlmask) == ctrlmask){ + complex_type tmp(output_state[2 * j], output_state[2 * j + 1]); + tmp *= correction; + output_state[2 * j] = std::real(tmp); + output_state[2 * j + 1] =std::imag(tmp); + } + vec_[2 * j] = output_state[2 * j]; + vec_[2 * j + 1] = output_state[2 * j + 1]; } } + if (NULL != output_state){ + free(output_state); + output_state = NULL; + } } void set_wavefunction(StateVector const& wavefunction, std::vector const& ordering){ run(); - // make sure there are 2^n amplitudes for n qubits - assert(wavefunction.size() == (1UL << ordering.size())); - // check that all qubits have been allocated previously - if (map_.size() != ordering.size() || !check_ids(ordering)) - throw(std::runtime_error("set_wavefunction(): Invalid mapping provided. Please make sure all qubits have been allocated previously (call eng.flush()).")); - - // set mapping and wavefunction - for (unsigned i = 0; i < ordering.size(); ++i) - map_[ordering[i]] = i; - #pragma omp parallel for schedule(static) - for (std::size_t i = 0; i < wavefunction.size(); ++i) - vec_[i] = wavefunction[i]; - } - - void collapse_wavefunction(std::vector const& ids, std::vector const& values){ - run(); - assert(ids.size() == values.size()); - if (!check_ids(ids)) - throw(std::runtime_error("collapse_wavefunction(): Unknown qubit id(s) provided. Try calling eng.flush() before invoking this function.")); - std::size_t mask = 0, val = 0; - for (unsigned i = 0; i < ids.size(); ++i){ - mask |= (1UL << map_[ids[i]]); - val |= ((values[i]?1UL:0UL) << map_[ids[i]]); - } - // set bad entries to 0 and compute probability of outcome to renormalize - calc_type N = 0.; - #pragma omp parallel for reduction(+:N) schedule(static) - for (std::size_t i = 0; i < vec_.size(); ++i){ - if ((i & mask) == val) - N += std::norm(vec_[i]); - } - if (N < 1.e-12) - throw(std::runtime_error("collapse_wavefunction(): Invalid collapse! Probability is ~0.")); - // re-normalize (if possible) - N = 1./std::sqrt(N); - #pragma omp parallel for schedule(static) - for (std::size_t i = 0; i < vec_.size(); ++i){ - if ((i & mask) != val) - vec_[i] = 0.; - else - vec_[i] *= N; + if (NULL != vec_){ + free(vec_); } + vec_ = copy(wavefunction, len_); } void run(){ @@ -500,23 +237,23 @@ switch (ids.size()){ case 1: #pragma omp parallel - kernel(vec_, ids[0], m, ctrlmask); + kernel(vec_, ids[0], m, ctrlmask, len_ >> 1); break; case 2: #pragma omp parallel - kernel(vec_, ids[1], ids[0], m, ctrlmask); + kernel(vec_, ids[1], ids[0], m, ctrlmask, len_ >> 1); break; case 3: #pragma omp parallel - kernel(vec_, ids[2], ids[1], ids[0], m, ctrlmask); + kernel(vec_, ids[2], ids[1], ids[0], m, ctrlmask, len_ >> 1); break; case 4: #pragma omp parallel - kernel(vec_, ids[3], ids[2], ids[1], ids[0], m, ctrlmask); + kernel(vec_, ids[3], ids[2], ids[1], ids[0], m, ctrlmask, len_ >> 1); break; case 5: #pragma omp parallel - kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask); + kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask, len_ >> 1); break; default: throw std::invalid_argument("Gates with more than 5 qubits are not supported!"); @@ -525,12 +262,27 @@ fused_gates_ = Fusion(); } - std::tuple cheat(){ + std::vector cheat(){ run(); - return make_tuple(map_, std::ref(vec_)); + std::vector result; + for (unsigned int i = 0; i < (len_ >> 1); i++){ + result.push_back({vec_[2 * i], vec_[2 * i + 1]}); + } + return result; + } + + inline StateVector copy(StateVector source, unsigned len){ + StateVector result = (StateVector)malloc(len * sizeof(calc_type)); +#pragma omp parallel for schedule(static) + for (std::size_t i = 0; i < len; ++i) { + result[i] = source[i]; } + return result; +} ~Simulator(){ + if (NULL != vec_) + free(vec_); } private: @@ -562,18 +314,13 @@ } unsigned N_; // #qubits - StateVector vec_; + unsigned len_; Map map_; Fusion fused_gates_; unsigned fusion_qubits_min_, fusion_qubits_max_; RndEngine rnd_eng_; std::function rng_; - - // large array buffers to avoid costly reallocations - static StateVector tmpBuff1_, tmpBuff2_; }; -Simulator::StateVector Simulator::tmpBuff1_; -Simulator::StateVector Simulator::tmpBuff2_; #endif