Skip to content

Commit 5c985ab

Browse files
authored
Merge pull request #10 from TensoriumCore/bssn_perf_optimization
2 parents e592541 + b3bb9cd commit 5c985ab

15 files changed

Lines changed: 1463 additions & 367 deletions

File tree

Tests/bssn/bowen_york_boost_stability.cpp

Lines changed: 442 additions & 30 deletions
Large diffs are not rendered by default.

includes/Tensorium/Physics/DiffGeometry/BSSN_Grid/Derivatives/BSSNGridDerivatives.hpp

Lines changed: 25 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -131,44 +131,39 @@ template <typename T> inline double Dyz_ptr(const T *p, ptrdiff_t sy, double inv
131131
// Mixed Derivatives (Order 4)
// Dxy4_ptr: mixed x-y derivative stencil evaluated around the sample p[0].
// Reads the 16 neighbours p[a * sx + b * sy] for a, b in {-2, -1, +1, +2}
// (the centre row and column are not used), weights each one by the product of
// the 1-D coefficients {-1, 8, -8, 1}, and multiplies the sum by inv_144dxdy.
//   p            pointer to the centre sample; indexed with negative offsets
//   sx, sy       element strides of the x and y directions
//   inv_144dxdy  scale factor applied to the weighted sum (presumably
//                1 / (144 * dx * dy), supplied by the caller - confirm)
// The removed (-) lines build the weights in a 4x4 loop; the added (+) lines
// write out the same 16 terms explicitly, in the same a-outer / b-inner order.
// No bounds checks are performed: the caller must guarantee two valid samples
// on every side of p along both strides.
132132
template <typename T>
133133
inline double Dxy4_ptr(const T *p, ptrdiff_t sx, ptrdiff_t sy, double inv_144dxdy) {
134-
double sum = 0.0;
135-
const ptrdiff_t idx_x[4] = {-2 * sx, -sx, sx, 2 * sx};
136-
const ptrdiff_t idx_y[4] = {-2 * sy, -sy, sy, 2 * sy};
137-
const double w[4] = {-1.0, 8.0, -8.0, 1.0};
138-
139-
for (int a = 0; a < 4; ++a) {
140-
for (int b = 0; b < 4; ++b) {
141-
sum += w[a] * w[b] * p[idx_x[a] + idx_y[b]];
142-
}
143-
}
134+
const ptrdiff_t sx2 = 2 * sx;
135+
const ptrdiff_t sy2 = 2 * sy;
136+
const double sum =
137+
(+1.0) * p[-sx2 - sy2] + (-8.0) * p[-sx2 - sy] + (+8.0) * p[-sx2 + sy] +
138+
(-1.0) * p[-sx2 + sy2] + (-8.0) * p[-sx - sy2] + (+64.0) * p[-sx - sy] +
139+
(-64.0) * p[-sx + sy] + (+8.0) * p[-sx + sy2] + (+8.0) * p[sx - sy2] +
140+
(-64.0) * p[sx - sy] + (+64.0) * p[sx + sy] + (-8.0) * p[sx + sy2] +
141+
(-1.0) * p[sx2 - sy2] + (+8.0) * p[sx2 - sy] + (-8.0) * p[sx2 + sy] +
142+
(+1.0) * p[sx2 + sy2];
144143
return sum * inv_144dxdy;
145144
}
146145

// Dxz4_ptr: mixed x-z derivative stencil evaluated around the sample p[0].
// Same 16-point weighted sum as the x-y variant, except that the z neighbours
// are addressed with unit stride (offsets -2, -1, +1, +2) while the x
// neighbours use the stride sx. The weighted sum is multiplied by inv_144dxdz
// (presumably 1 / (144 * dx * dz), supplied by the caller - confirm).
// The removed (-) lines compute the weights w[a] * w[b] in a 4x4 loop; the
// added (+) lines list the same 16 terms explicitly in the same order.
// No bounds checks are performed on p.
147146
template <typename T> inline double Dxz4_ptr(const T *p, ptrdiff_t sx, double inv_144dxdz) {
148-
double sum = 0.0;
149-
const ptrdiff_t idx_x[4] = {-2 * sx, -sx, sx, 2 * sx};
150-
const ptrdiff_t idx_z[4] = {-2, -1, 1, 2};
151-
const double w[4] = {-1.0, 8.0, -8.0, 1.0};
152-
153-
for (int a = 0; a < 4; ++a) {
154-
for (int b = 0; b < 4; ++b) {
155-
sum += w[a] * w[b] * p[idx_x[a] + idx_z[b]];
156-
}
157-
}
147+
const ptrdiff_t sx2 = 2 * sx;
148+
const double sum =
149+
(+1.0) * p[-sx2 - 2] + (-8.0) * p[-sx2 - 1] + (+8.0) * p[-sx2 + 1] +
150+
(-1.0) * p[-sx2 + 2] + (-8.0) * p[-sx - 2] + (+64.0) * p[-sx - 1] +
151+
(-64.0) * p[-sx + 1] + (+8.0) * p[-sx + 2] + (+8.0) * p[sx - 2] +
152+
(-64.0) * p[sx - 1] + (+64.0) * p[sx + 1] + (-8.0) * p[sx + 2] +
153+
(-1.0) * p[sx2 - 2] + (+8.0) * p[sx2 - 1] + (-8.0) * p[sx2 + 1] +
154+
(+1.0) * p[sx2 + 2];
158155
return sum * inv_144dxdz;
159156
}
160157

// Dyz4_ptr: mixed y-z derivative stencil evaluated around the sample p[0].
// Sums the 16 neighbours p[a * sy + b] for a, b in {-2, -1, +1, +2}: y
// neighbours use the stride sy, z neighbours use unit stride. Each sample is
// weighted by the product of the 1-D coefficients {-1, 8, -8, 1} and the total
// is multiplied by inv_144dydz (presumably 1 / (144 * dy * dz), supplied by
// the caller - confirm).
// The removed (-) lines compute the weights in a 4x4 loop; the added (+) lines
// list the same 16 terms explicitly in the same order.
// No bounds checks are performed on p.
161158
template <typename T> inline double Dyz4_ptr(const T *p, ptrdiff_t sy, double inv_144dydz) {
162-
double sum = 0.0;
163-
const ptrdiff_t idx_y[4] = {-2 * sy, -sy, sy, 2 * sy};
164-
const ptrdiff_t idx_z[4] = {-2, -1, 1, 2};
165-
const double w[4] = {-1.0, 8.0, -8.0, 1.0};
166-
167-
for (int a = 0; a < 4; ++a) {
168-
for (int b = 0; b < 4; ++b) {
169-
sum += w[a] * w[b] * p[idx_y[a] + idx_z[b]];
170-
}
171-
}
159+
const ptrdiff_t sy2 = 2 * sy;
160+
const double sum =
161+
(+1.0) * p[-sy2 - 2] + (-8.0) * p[-sy2 - 1] + (+8.0) * p[-sy2 + 1] +
162+
(-1.0) * p[-sy2 + 2] + (-8.0) * p[-sy - 2] + (+64.0) * p[-sy - 1] +
163+
(-64.0) * p[-sy + 1] + (+8.0) * p[-sy + 2] + (+8.0) * p[sy - 2] +
164+
(-64.0) * p[sy - 1] + (+64.0) * p[sy + 1] + (-8.0) * p[sy + 2] +
165+
(-1.0) * p[sy2 - 2] + (+8.0) * p[sy2 - 1] + (-8.0) * p[sy2 + 1] +
166+
(+1.0) * p[sy2 + 2];
172167
return sum * inv_144dydz;
173168
}
174169

includes/Tensorium/Physics/DiffGeometry/BSSN_Grid/Evolution/BSSNEvolutionATilde.hpp

Lines changed: 31 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -33,18 +33,15 @@ namespace tensorium_RG::bssn {
3333
*/
3434
template <typename T>
3535
inline void compute_rhs_A_tilde(const BSSNGridSoA<T> &G, Field3D<T> rhs[6], size_t padding = 4,
36-
const GaugeParameters<T> &params = {}) {
36+
const GaugeParameters<T> &params = {},
37+
Field3D<T> *z4_conformal_trace_cache = nullptr) {
3738
BSSN_PROFILE_KERNEL(ATilde);
3839
using namespace tensorium_RG::fd;
3940
const T ko_sigma = scaled_ko_sigma(params.ko_sigma);
4041

4142
size_t I0, I1, J0, J1, K0, K1;
4243
G.domain_bounds(I0, I1, J0, J1, K0, K1);
4344

44-
const size_t total = G.A_tilde[0].st.nx_tot * G.A_tilde[0].st.ny_tot * G.A_tilde[0].st.nz_tot;
45-
for (int s = 0; s < 6; ++s)
46-
std::fill(rhs[s].ptr(), rhs[s].ptr() + total, T(0));
47-
4845
const size_t i0 = clamped_lower(I0, padding, I1);
4946
const size_t j0 = clamped_lower(J0, padding, J1);
5047
const size_t k0 = clamped_lower(K0, padding, K1);
@@ -77,7 +74,10 @@ inline void compute_rhs_A_tilde(const BSSNGridSoA<T> &G, Field3D<T> rhs[6], size
7774
const ptrdiff_t sy = G.A_tilde[0].st.sy;
7875

7976
// Map (row, col) -> symmetric index 0..5
80-
const int map_s[3][3] = {{0, 1, 2}, {1, 3, 4}, {2, 4, 5}};
77+
constexpr int map_s[3][3] = {{XX, XY, XZ}, {XY, YY, YZ}, {XZ, YZ, ZZ}};
78+
constexpr int sym_row[6] = {0, 0, 0, 1, 1, 2};
79+
constexpr int sym_col[6] = {0, 1, 2, 1, 2, 2};
80+
const T ko_scale = T(ko_sigma / G.dx);
8181

8282
#pragma omp parallel for collapse(2)
8383
for (size_t i = i0; i < i1; ++i) {
@@ -106,6 +106,8 @@ inline void compute_rhs_A_tilde(const BSSNGridSoA<T> &G, Field3D<T> rhs[6], size
106106
p_Ricci[s] = G.Ricci[s].ptr() + idx_start;
107107
p_rhs[s] = rhs[s].ptr() + idx_start;
108108
}
109+
T *p_z4_trace =
110+
z4_conformal_trace_cache ? (z4_conformal_trace_cache->ptr() + idx_start) : nullptr;
109111

110112
for (size_t k = k0; k < k1; ++k) {
111113
const T alpha = *p_alpha;
@@ -114,7 +116,19 @@ inline void compute_rhs_A_tilde(const BSSNGridSoA<T> &G, Field3D<T> rhs[6], size
114116
const T inv_chi = T(1) / chi_guarded;
115117
const T K = *p_K;
116118
T RicciZ4[6];
117-
compute_RicciZ4(G, i, j, k, RicciZ4, params.chi_div_floor);
119+
compute_RicciZ4_core(G, i, j, k, RicciZ4, params.chi_div_floor, inv_12dx,
120+
inv_12dy, inv_12dz, sx, sy);
121+
if (p_z4_trace) {
122+
const T g_xx = *p_gam_inv[0];
123+
const T g_xy = *p_gam_inv[1];
124+
const T g_xz = *p_gam_inv[2];
125+
const T g_yy = *p_gam_inv[3];
126+
const T g_yz = *p_gam_inv[4];
127+
const T g_zz = *p_gam_inv[5];
128+
*p_z4_trace = g_xx * RicciZ4[0] + g_yy * RicciZ4[3] + g_zz * RicciZ4[5] +
129+
T(2) * (g_xy * RicciZ4[1] + g_xz * RicciZ4[2] +
130+
g_yz * RicciZ4[4]);
131+
}
118132

119133
// --- 1. Load Local Tensors ---
120134
T gamma_tilde_inv[3][3];
@@ -124,26 +138,8 @@ inline void compute_rhs_A_tilde(const BSSNGridSoA<T> &G, Field3D<T> rhs[6], size
124138
T Ricci_mat[3][3];
125139

126140
for (int s = 0; s < 6; ++s) {
127-
int a, b;
128-
if (s == 0) {
129-
a = 0;
130-
b = 0;
131-
} else if (s == 1) {
132-
a = 0;
133-
b = 1;
134-
} else if (s == 2) {
135-
a = 0;
136-
b = 2;
137-
} else if (s == 3) {
138-
a = 1;
139-
b = 1;
140-
} else if (s == 4) {
141-
a = 1;
142-
b = 2;
143-
} else {
144-
a = 2;
145-
b = 2;
146-
}
141+
const int a = sym_row[s];
142+
const int b = sym_col[s];
147143

148144
const T gt = *p_gam[s];
149145
const T gt_inv = *p_gam_inv[s];
@@ -185,26 +181,8 @@ inline void compute_rhs_A_tilde(const BSSNGridSoA<T> &G, Field3D<T> rhs[6], size
185181
T d_g_phys[3][3][3]; // dir, row, col
186182

187183
for (int s = 0; s < 6; ++s) {
188-
int row, col;
189-
if (s == 0) {
190-
row = 0;
191-
col = 0;
192-
} else if (s == 1) {
193-
row = 0;
194-
col = 1;
195-
} else if (s == 2) {
196-
row = 0;
197-
col = 2;
198-
} else if (s == 3) {
199-
row = 1;
200-
col = 1;
201-
} else if (s == 4) {
202-
row = 1;
203-
col = 2;
204-
} else {
205-
row = 2;
206-
col = 2;
207-
}
184+
const int row = sym_row[s];
185+
const int col = sym_col[s];
208186

209187
const T *p_g = p_gam[s];
210188
const T d_gt_x = Dx_ptr(p_g, sx, inv_12dx);
@@ -304,13 +282,9 @@ inline void compute_rhs_A_tilde(const BSSNGridSoA<T> &G, Field3D<T> rhs[6], size
304282
const int s = map_s[a][b];
305283
const T *p_field = p_A[s];
306284

307-
const T bx = beta[0];
308-
const T by = beta[1];
309-
const T bz = beta[2];
310-
311-
const T adv = bx * Dx_upwind_ptr(p_field, sx, inv_2dx, bx) +
312-
by * Dy_upwind_ptr(p_field, sy, inv_2dy, by) +
313-
bz * Dz_upwind_ptr(p_field, inv_2dz, bz);
285+
const T adv = beta[0] * Dx_upwind_ptr(p_field, sx, inv_2dx, beta[0]) +
286+
beta[1] * Dy_upwind_ptr(p_field, sy, inv_2dy, beta[1]) +
287+
beta[2] * Dz_upwind_ptr(p_field, inv_2dz, beta[2]);
314288

315289
T lie = T(0);
316290
for (int m = 0; m < 3; ++m) {
@@ -324,7 +298,7 @@ inline void compute_rhs_A_tilde(const BSSNGridSoA<T> &G, Field3D<T> rhs[6], size
324298
alpha * (K * A_mat[a][b] - T(2) * A_contracted[a][b]);
325299
const T diss = KO6_axis_ptr(p_field, sx) + KO6_axis_ptr(p_field, sy) +
326300
KO6_axis_ptr(p_field, 1);
327-
const T diss_scaled = (ko_sigma / G.dx) * diss;
301+
const T diss_scaled = ko_scale * diss;
328302

329303
*p_rhs[s] = adv + lie + term_geom + term_quad + diss_scaled;
330304
}
@@ -342,6 +316,8 @@ inline void compute_rhs_A_tilde(const BSSNGridSoA<T> &G, Field3D<T> rhs[6], size
342316
++p_Ricci[s];
343317
++p_rhs[s];
344318
}
319+
if (p_z4_trace)
320+
++p_z4_trace;
345321
}
346322
}
347323
}

includes/Tensorium/Physics/DiffGeometry/BSSN_Grid/Evolution/BSSNEvolutionCommon.hpp

Lines changed: 21 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -124,15 +124,13 @@ template <typename T> inline T guard_chi_div(T chi, T chi_div_floor) {
124124
}
125125

126126
template <typename T>
127-
inline void compute_RicciZ4(const BSSNGridSoA<T> &G, size_t i, size_t j, size_t k, T Z4corr[6],
128-
T chi_div_floor = T(-1000.0)) {
127+
inline void compute_RicciZ4_core(const BSSNGridSoA<T> &G, size_t i, size_t j, size_t k,
128+
T Z4corr[6], T chi_div_floor, const double inv_12dx,
129+
const double inv_12dy, const double inv_12dz,
130+
const ptrdiff_t sx, const ptrdiff_t sy) {
129131
using namespace tensorium_RG::fd;
130-
131-
const double inv_12dx = 1.0 / (60.0 * G.dx);
132-
const double inv_12dy = 1.0 / (60.0 * G.dy);
133-
const double inv_12dz = 1.0 / (60.0 * G.dz);
134-
const ptrdiff_t sx = G.alpha.st.sx;
135-
const ptrdiff_t sy = G.alpha.st.sy;
132+
constexpr int row_of_sym[6] = {0, 0, 0, 1, 1, 2};
133+
constexpr int col_of_sym[6] = {0, 1, 2, 1, 2, 2};
136134

137135
const size_t idx = G.alpha.idx(i, j, k);
138136

@@ -151,9 +149,8 @@ inline void compute_RicciZ4(const BSSNGridSoA<T> &G, size_t i, size_t j, size_t
151149
}
152150

153151
for (int s = 0; s < 6; ++s) {
154-
int row = 0;
155-
int col = 0;
156-
sym_index_to_pair(s, row, col);
152+
const int row = row_of_sym[s];
153+
const int col = col_of_sym[s];
157154
const T gt = *p_gamma[s];
158155
const T gt_inv = *p_gamma_inv[s];
159156
const T g_phys = gt * inv_chi;
@@ -170,9 +167,8 @@ inline void compute_RicciZ4(const BSSNGridSoA<T> &G, size_t i, size_t j, size_t
170167

171168
T d_g_phys[3][3][3];
172169
for (int s = 0; s < 6; ++s) {
173-
int row = 0;
174-
int col = 0;
175-
sym_index_to_pair(s, row, col);
170+
const int row = row_of_sym[s];
171+
const int col = col_of_sym[s];
176172
const T *p_g = p_gamma[s];
177173
const T d_gt_x = Dx_ptr(p_g, sx, inv_12dx);
178174
const T d_gt_y = Dy_ptr(p_g, sy, inv_12dy);
@@ -255,4 +251,15 @@ inline void compute_RicciZ4(const BSSNGridSoA<T> &G, size_t i, size_t j, size_t
255251
}
256252
}
257253

// compute_RicciZ4: convenience entry point that keeps the signature the
// function had before this commit (G, i, j, k, Z4corr, chi_div_floor with a
// default of T(-1000.0)). It derives the three derivative scale factors from
// G.dx / G.dy / G.dz and the x/y strides from G.alpha.st, then forwards
// everything to compute_RicciZ4_core, so callers that already hold those
// values can call the core directly instead of recomputing them per point.
// NOTE(review): the locals are named inv_12d* but hold 1 / (60 * h), not
// 1 / (12 * h). The same expressions appear in the removed lines of the old
// body, so this is carried over unchanged; presumably it matches the stencil
// used by Dx_ptr / Dy_ptr - confirm, and consider renaming.
254+
template <typename T>
255+
inline void compute_RicciZ4(const BSSNGridSoA<T> &G, size_t i, size_t j, size_t k, T Z4corr[6],
256+
T chi_div_floor = T(-1000.0)) {
257+
const double inv_12dx = 1.0 / (60.0 * G.dx);
258+
const double inv_12dy = 1.0 / (60.0 * G.dy);
259+
const double inv_12dz = 1.0 / (60.0 * G.dz);
260+
const ptrdiff_t sx = G.alpha.st.sx;
261+
const ptrdiff_t sy = G.alpha.st.sy;
262+
compute_RicciZ4_core(G, i, j, k, Z4corr, chi_div_floor, inv_12dx, inv_12dy, inv_12dz, sx, sy);
263+
}
264+
258265
} // namespace tensorium_RG::bssn

0 commit comments

Comments
 (0)