From 48130822f256dcadadd4107977197622f3df80d2 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Thu, 16 Apr 2026 10:39:20 +0100 Subject: [PATCH 01/23] Matrix multiplication does not change MatrixCSR --- cpp/dolfinx/la/MatrixCSR.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index 6094a32335..58df422174 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -489,7 +489,7 @@ class MatrixCSR /// /// @param[in] x Vector to apply `A` to. /// @param[in,out] y Vector to accumulate the result into. - void mult(Vector& x, Vector& y); + void mult(Vector& x, Vector& y) const; /// @brief Get MPI communicator that matrix is defined on. MPI_Comm comm() const { return _comm.comm(); } @@ -838,7 +838,7 @@ MatrixCSR::MatrixCSR(const SparsityType& p, BlockMode mode) /// x,y template void MatrixCSR::mult(la::Vector& x, - la::Vector& y) + la::Vector& y) const { // start communication (update ghosts) x.scatter_fwd_begin(); From 0b0b958dbfbc08f44edb2c5033f54be0325342ec Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Thu, 16 Apr 2026 16:47:29 +0100 Subject: [PATCH 02/23] Add transpose implementation --- cpp/dolfinx/la/MatrixCSR.h | 29 ++++++++ cpp/dolfinx/la/matrix_csr_impl.h | 37 ++++++++++ python/dolfinx/wrappers/dolfinx_wrappers/la.h | 1 + python/test/unit/la/test_matrix_vector.py | 67 +++++++++++++++++-- 4 files changed, 130 insertions(+), 4 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index 58df422174..87e9806173 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -491,6 +491,9 @@ class MatrixCSR /// @param[in,out] y Vector to accumulate the result into. void mult(Vector& x, Vector& y) const; + void multT(Vector& x, Vector& y) const; + + /// @brief Get MPI communicator that matrix is defined on. 
MPI_Comm comm() const { return _comm.comm(); } @@ -886,4 +889,30 @@ void MatrixCSR::mult(la::Vector& x, } } +template +void MatrixCSR::multT(la::Vector& x, + la::Vector& y) const +{ + std::int32_t nrowslocal = num_owned_rows(); + std::span Arow_ptr(row_ptr().data(), nrowslocal + 1); + std::span Acols(cols().data(), Arow_ptr[nrowslocal]); + std::span Aoff_diag_offset(off_diag_offset().data(), + nrowslocal); + std::span Avalues(values().data(), Arow_ptr[nrowslocal]); + + std::span _x = x.array(); + std::span _y = y.array(); + + std::span Arow_begin(Arow_ptr.data(), nrowslocal); + std::span Arow_end(Arow_ptr.data() + 1, nrowslocal); + + // Compute ghost region contribution and scatter back + int ncolslocal = index_map(1)->size_local(); + std::fill(std::next(_y.begin(), ncolslocal), _y.end(), 0.0); + impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); + y.scatter_rev(std::plus()); + impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); + +} + } // namespace dolfinx::la diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h index 143b6c0d4f..ffed5aedd9 100644 --- a/cpp/dolfinx/la/matrix_csr_impl.h +++ b/cpp/dolfinx/la/matrix_csr_impl.h @@ -259,5 +259,42 @@ void spmv(std::span values, std::span row_begin, } } + +/// @brief Sparse matrix-vector transpose product implementation. 
+/// @tparam T +/// @tparam BS1 +/// @param values +/// @param row_begin +/// @param row_end +/// @param indices +/// @param x +/// @param y +/// @param bs0 +/// @param bs1 +template +void spmvT(std::span values, std::span row_begin, + std::span row_end, + std::span indices, std::span x, + std::span y, int bs0, int bs1) +{ + assert(row_begin.size() == row_end.size()); + for (int k0 = 0; k0 < bs0; ++k0) + { + for (std::size_t i = 0; i < row_begin.size(); i++) + { + const T xval = x[i * bs0 + k0]; + for (std::int32_t j = row_begin[i]; j < row_end[i]; j++) + { + for (int k1 = 0; k1 < bs1; ++k1) + { + y[indices[j] * bs1 + k1] += values[j * bs1 * bs0 + k1 * bs0 + k0] + * xval; + } + } + } + } +} + + } // namespace impl } // namespace dolfinx::la diff --git a/python/dolfinx/wrappers/dolfinx_wrappers/la.h b/python/dolfinx/wrappers/dolfinx_wrappers/la.h index 57ede85a46..4ac7a36730 100644 --- a/python/dolfinx/wrappers/dolfinx_wrappers/la.h +++ b/python/dolfinx/wrappers/dolfinx_wrappers/la.h @@ -145,6 +145,7 @@ void declare_la_objects(nanobind::module_& m, const std::string& type) }) .def("scatter_reverse", &dolfinx::la::MatrixCSR::scatter_rev) .def("mult", &dolfinx::la::MatrixCSR::mult) + .def("multT", &dolfinx::la::MatrixCSR::multT) .def("to_dense", [](const dolfinx::la::MatrixCSR& self) { diff --git a/python/test/unit/la/test_matrix_vector.py b/python/test/unit/la/test_matrix_vector.py index fc5847a109..13f4d8f267 100644 --- a/python/test/unit/la/test_matrix_vector.py +++ b/python/test/unit/la/test_matrix_vector.py @@ -9,6 +9,7 @@ import numpy as np import pytest +from scipy.sparse import csr_matrix from dolfinx import cpp as _cpp from dolfinx import la @@ -16,6 +17,19 @@ from dolfinx.mesh import create_unit_square +def mat_gather(A): + # Gather full matrix on all processes for scipy + nr = A.index_map(0).size_local + gatheredvals = np.concatenate(MPI.COMM_WORLD.allgather(A.data[: A.indptr[nr]])) + gatheredptrs = MPI.COMM_WORLD.allgather(A.indptr[: nr + 1]) + cols = 
A.index_map(1).local_to_global(A.indices[: A.indptr[nr]]) + gatheredcols = np.concatenate(MPI.COMM_WORLD.allgather(cols)) + indptr = gatheredptrs[0] + for i in range(1, len(gatheredptrs)): + indptr = np.concatenate((indptr, (gatheredptrs[i][1:] + indptr[-1]))) + return csr_matrix((gatheredvals, gatheredcols, indptr)) + + def test_create_matrix_csr(): """Test creation of CSR matrix with specified types.""" mesh = create_unit_square(MPI.COMM_WORLD, 10, 11) @@ -58,22 +72,67 @@ def test_matvec(dtype): imap = mesh.topology.index_map(0) sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) rows = np.arange(0, imap.size_local) - cols = np.arange(0, imap.size_local) + cols = np.arange(0, imap.size_local + imap.num_ghosts) sp.insert(rows, cols) sp.finalize() # Identity A = la.matrix_csr(sp, dtype=dtype) - for i in range(imap.size_local): - A.add(np.array([2.0], dtype=dtype), np.array([i]), np.array([i])) + rng = np.random.default_rng(12345) + A.data[:] = rng.random(len(A.data)) A.scatter_reverse() + Ascipy = mat_gather(A) + lr0, lr1 = A.index_map(0).local_range + nr = A.index_map(0).size_local + # Check gathered matrix + assert np.allclose(A.to_dense()[:nr, :], Ascipy.todense()[lr0:lr1]) + b = la.vector(imap, dtype=dtype) u = la.vector(imap, dtype=dtype) b.array[:] = 1.0 A.mult(b, u) u.scatter_forward() - assert np.allclose(u.array[: imap.size_local], 2.0) + + bs = np.ones(A.index_map(0).size_global) + us = Ascipy @ bs + assert np.allclose(u.array[:nr], us[lr0:lr1]) + + +def test_matvec_transpose(): + dtype = np.float64 + mesh = create_unit_square(MPI.COMM_WORLD, 3, 3) + imap = mesh.topology.index_map(0) + sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) + rows = np.arange(0, imap.size_local) + cols = np.arange(0, imap.size_local + imap.num_ghosts) + sp.insert(rows, cols) + sp.finalize() + + # Identity + A = la.matrix_csr(sp, dtype=dtype) + rng = np.random.default_rng(12345) + A.data[:] = rng.random(len(A.data)) + A.scatter_reverse() + + Ascipy = 
mat_gather(A) + lr0, lr1 = A.index_map(0).local_range + nr = A.index_map(0).size_local + # Check gathered matrix + assert np.allclose(A.to_dense()[:nr, :], Ascipy.todense()[lr0:lr1]) + + b = la.vector(imap, dtype=dtype) + u = la.vector(imap, dtype=dtype) + b.array[:] = 1.0 + A._cpp_object.multT(b._cpp_object, u._cpp_object) + u.scatter_forward() + print(u.array[: imap.size_local]) + # assert np.allclose(u.array[: imap.size_local], 2.0) + + bs = np.ones(A.index_map(0).size_global) + us = Ascipy.T @ bs + print(us) + assert np.allclose(u.array[:nr], us[lr0:lr1]) @pytest.mark.parametrize( From 7c065753ab8a95ef1f7106f408905cd609658cfa Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Thu, 16 Apr 2026 17:38:25 +0100 Subject: [PATCH 03/23] fix up test a bit --- cpp/dolfinx/la/MatrixCSR.h | 15 +++++++++++++-- cpp/dolfinx/la/matrix_csr_impl.h | 12 ++++++++++++ python/dolfinx/la/__init__.py | 10 +++++++--- python/test/unit/la/test_matrix_vector.py | 6 +----- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index 87e9806173..971f18ce6f 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -889,6 +889,9 @@ void MatrixCSR::mult(la::Vector& x, } } +/// @brief Compute transpose y += A^T x +/// @param x +/// @param y template void MatrixCSR::multT(la::Vector& x, la::Vector& y) const @@ -909,9 +912,17 @@ void MatrixCSR::multT(la::Vector& x, // Compute ghost region contribution and scatter back int ncolslocal = index_map(1)->size_local(); std::fill(std::next(_y.begin(), ncolslocal), _y.end(), 0.0); - impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); + if (_bs[1] == 1) + impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); + else + impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); + y.scatter_rev(std::plus()); - impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); + + if 
(_bs[1] == 1) + impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); + else + impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); } diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h index ffed5aedd9..8af8c903fb 100644 --- a/cpp/dolfinx/la/matrix_csr_impl.h +++ b/cpp/dolfinx/la/matrix_csr_impl.h @@ -278,6 +278,7 @@ void spmvT(std::span values, std::span row_begin, std::span y, int bs0, int bs1) { assert(row_begin.size() == row_end.size()); + for (int k0 = 0; k0 < bs0; ++k0) { for (std::size_t i = 0; i < row_begin.size(); i++) @@ -285,11 +286,22 @@ void spmvT(std::span values, std::span row_begin, const T xval = x[i * bs0 + k0]; for (std::int32_t j = row_begin[i]; j < row_end[i]; j++) { + if constexpr(BS1 == -1) + { for (int k1 = 0; k1 < bs1; ++k1) { y[indices[j] * bs1 + k1] += values[j * bs1 * bs0 + k1 * bs0 + k0] * xval; } + } + else + { + for (int k1 = 0; k1 < BS1; ++k1) + { + y[indices[j] * BS1 + k1] += values[j * BS1 * bs0 + k1 * bs0 + k0] + * xval; + } + } } } } diff --git a/python/dolfinx/la/__init__.py b/python/dolfinx/la/__init__.py index 8c3f40aaf4..88054df14a 100644 --- a/python/dolfinx/la/__init__.py +++ b/python/dolfinx/la/__init__.py @@ -155,14 +155,18 @@ def index_map(self, i: int) -> IndexMap: """ return self._cpp_object.index_map(i) - def mult(self, x: Vector, y: Vector) -> None: - """Compute ``y += Ax``. + def mult(self, x: Vector, y: Vector, transpose: bool = False) -> None: + """Compute ``y += Ax`` or ``y += A^T x``. 
Args: x: Input Vector y: Output Vector + transpose: if True, compute y += A^T x """ - self._cpp_object.mult(x._cpp_object, y._cpp_object) + if transpose: + self._cpp_object.multT(x._cpp_object, y._cpp_object) + else: + self._cpp_object.mult(x._cpp_object, y._cpp_object) @property def block_size(self) -> list: diff --git a/python/test/unit/la/test_matrix_vector.py b/python/test/unit/la/test_matrix_vector.py index 13f4d8f267..9175c6ee16 100644 --- a/python/test/unit/la/test_matrix_vector.py +++ b/python/test/unit/la/test_matrix_vector.py @@ -92,7 +92,6 @@ def test_matvec(dtype): u = la.vector(imap, dtype=dtype) b.array[:] = 1.0 A.mult(b, u) - u.scatter_forward() bs = np.ones(A.index_map(0).size_global) us = Ascipy @ bs @@ -124,10 +123,7 @@ def test_matvec_transpose(): b = la.vector(imap, dtype=dtype) u = la.vector(imap, dtype=dtype) b.array[:] = 1.0 - A._cpp_object.multT(b._cpp_object, u._cpp_object) - u.scatter_forward() - print(u.array[: imap.size_local]) - # assert np.allclose(u.array[: imap.size_local], 2.0) + A.mult(b, u, True) bs = np.ones(A.index_map(0).size_global) us = Ascipy.T @ bs From 3fdebefad84af29bf05e4d8709607a85a7f94cd8 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 09:54:37 +0100 Subject: [PATCH 04/23] Update tests --- cpp/dolfinx/la/MatrixCSR.h | 4 ++-- python/test/unit/la/test_matrix_vector.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index 971f18ce6f..cab4e01c5b 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -913,14 +913,14 @@ void MatrixCSR::multT(la::Vector& x, int ncolslocal = index_map(1)->size_local(); std::fill(std::next(_y.begin(), ncolslocal), _y.end(), 0.0); if (_bs[1] == 1) - impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); + impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], 1); else impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, 
_x, _y, _bs[0], _bs[1]); y.scatter_rev(std::plus()); if (_bs[1] == 1) - impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); + impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], 1); else impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); diff --git a/python/test/unit/la/test_matrix_vector.py b/python/test/unit/la/test_matrix_vector.py index 9175c6ee16..c20558dcfe 100644 --- a/python/test/unit/la/test_matrix_vector.py +++ b/python/test/unit/la/test_matrix_vector.py @@ -90,10 +90,11 @@ def test_matvec(dtype): b = la.vector(imap, dtype=dtype) u = la.vector(imap, dtype=dtype) - b.array[:] = 1.0 + b.array[:] = np.arange(len(b.array)) A.mult(b, u) - bs = np.ones(A.index_map(0).size_global) + bs = np.concatenate(mesh.comm.allgather(b.array[:nr])) + print(Ascipy.shape, bs.shape) us = Ascipy @ bs assert np.allclose(u.array[:nr], us[lr0:lr1]) @@ -122,12 +123,11 @@ def test_matvec_transpose(): b = la.vector(imap, dtype=dtype) u = la.vector(imap, dtype=dtype) - b.array[:] = 1.0 + b.array[:] = np.arange(len(b.array)) A.mult(b, u, True) - bs = np.ones(A.index_map(0).size_global) + bs = np.concatenate(mesh.comm.allgather(b.array[:nr])) us = Ascipy.T @ bs - print(us) assert np.allclose(u.array[:nr], us[lr0:lr1]) From 1a541ad4b197bd690b5370d1c981726bb7188811 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 11:32:31 +0100 Subject: [PATCH 05/23] Use non-blocking MPI for y update --- cpp/dolfinx/la/MatrixCSR.h | 4 +++- cpp/dolfinx/la/Vector.h | 29 ++++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index cab4e01c5b..d95ab873c3 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -917,13 +917,15 @@ void MatrixCSR::multT(la::Vector& x, else impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); - y.scatter_rev(std::plus()); + 
y.scatter_rev_begin(); if (_bs[1] == 1) impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], 1); else impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); + y.scatter_rev_end(std::plus()); + } } // namespace dolfinx::la diff --git a/cpp/dolfinx/la/Vector.h b/cpp/dolfinx/la/Vector.h index afbdb10cd2..e29e6f38b1 100644 --- a/cpp/dolfinx/la/Vector.h +++ b/cpp/dolfinx/la/Vector.h @@ -354,6 +354,33 @@ class Vector _x.begin()); } + /// @brief End scatter (send) of local data values + /// to the owning process and assign/accumulate into the owned data entries + /// (simplified CPU version). + /// + /// Suitable for scatter operations on a CPU with `std::vector` + /// storage. The receive buffer is unpacked internally by a function + /// that is suitable for use on a CPU. + /// + /// @note Collective MPI operation. + template + void scatter_rev_end(BinaryOperation op) + requires requires(Container c) { + { c.data() } -> std::same_as; + } + { + // Inline the kernel overload directly to avoid ambiguity: the lambda + // produced by get_unpack_op satisfies both VectorPackKernel and the + // unconstrained BinaryOperation parameter, making a delegating call to + // scatter_rev_end(get_unpack_op(op)) ambiguous. + auto unpack = get_unpack_op(op); + _scatterer->scatter_end(_request); + unpack(_scatterer->local_indices().begin(), + _scatterer->local_indices().end(), _buffer_local.begin(), + _x.begin()); + } + + /// @brief Scatter (send) of ghost data values to the owning process /// and assign/accumulate into the owned data entries (simplified CPU /// version). 
@@ -372,7 +399,7 @@ class Vector void scatter_rev(BinaryOperation op) { this->scatter_rev_begin(); - this->scatter_rev_end(get_unpack_op(op)); + this->scatter_rev_end(op); } /// Get IndexMap From 109843c0e89309758592c62cf8a70e68b83162b9 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 11:37:04 +0100 Subject: [PATCH 06/23] revert and update test --- cpp/dolfinx/la/MatrixCSR.h | 4 +--- python/test/unit/la/test_matrix_vector.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index d95ab873c3..619fa54397 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -917,15 +917,13 @@ void MatrixCSR::multT(la::Vector& x, else impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); - y.scatter_rev_begin(); + y.scatter_rev(); if (_bs[1] == 1) impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], 1); else impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); - y.scatter_rev_end(std::plus()); - } } // namespace dolfinx::la diff --git a/python/test/unit/la/test_matrix_vector.py b/python/test/unit/la/test_matrix_vector.py index c20558dcfe..e064b6b04a 100644 --- a/python/test/unit/la/test_matrix_vector.py +++ b/python/test/unit/la/test_matrix_vector.py @@ -90,6 +90,7 @@ def test_matvec(dtype): b = la.vector(imap, dtype=dtype) u = la.vector(imap, dtype=dtype) + u.array[:] = 0.0 b.array[:] = np.arange(len(b.array)) A.mult(b, u) @@ -99,8 +100,16 @@ def test_matvec(dtype): assert np.allclose(u.array[:nr], us[lr0:lr1]) -def test_matvec_transpose(): - dtype = np.float64 +@pytest.mark.parametrize( + "dtype", + [ + np.float32, + np.float64, + np.complex64, + np.complex128, + ], +) +def test_matvec_transpose(dtype): mesh = create_unit_square(MPI.COMM_WORLD, 3, 3) imap = mesh.topology.index_map(0) sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) @@ -123,6 +132,7 @@ def 
test_matvec_transpose(): b = la.vector(imap, dtype=dtype) u = la.vector(imap, dtype=dtype) + u.array[:] = 0.0 b.array[:] = np.arange(len(b.array)) A.mult(b, u, True) From f822b92ca0cb3391118e556c2cdb8190fc8ef142 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 15:34:32 +0100 Subject: [PATCH 07/23] Add matmul implementation --- cpp/dolfinx/la/CMakeLists.txt | 1 + cpp/dolfinx/la/MatrixCSR.h | 4 +- cpp/dolfinx/la/matmul_impl.h | 528 ++++++++++++++++++ python/dolfinx/la/__init__.py | 11 + python/dolfinx/wrappers/dolfinx_wrappers/la.h | 4 + python/test/unit/la/test_matmul.py | 73 +++ 6 files changed, 620 insertions(+), 1 deletion(-) create mode 100644 cpp/dolfinx/la/matmul_impl.h create mode 100644 python/test/unit/la/test_matmul.py diff --git a/cpp/dolfinx/la/CMakeLists.txt b/cpp/dolfinx/la/CMakeLists.txt index eeebf8897d..48c97917f3 100644 --- a/cpp/dolfinx/la/CMakeLists.txt +++ b/cpp/dolfinx/la/CMakeLists.txt @@ -2,6 +2,7 @@ set(HEADERS_la ${CMAKE_CURRENT_SOURCE_DIR}/dolfinx_la.h ${CMAKE_CURRENT_SOURCE_DIR}/MatrixCSR.h ${CMAKE_CURRENT_SOURCE_DIR}/matrix_csr_impl.h + ${CMAKE_CURRENT_SOURCE_DIR}/matmul_impl.h ${CMAKE_CURRENT_SOURCE_DIR}/SparsityPattern.h ${CMAKE_CURRENT_SOURCE_DIR}/superlu_dist.h ${CMAKE_CURRENT_SOURCE_DIR}/Vector.h diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index 619fa54397..a1389c2e87 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -917,7 +917,7 @@ void MatrixCSR::multT(la::Vector& x, else impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); - y.scatter_rev(); + y.scatter_rev(std::plus{}); if (_bs[1] == 1) impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], 1); @@ -926,4 +926,6 @@ void MatrixCSR::multT(la::Vector& x, } + + } // namespace dolfinx::la diff --git a/cpp/dolfinx/la/matmul_impl.h b/cpp/dolfinx/la/matmul_impl.h new file mode 100644 index 0000000000..086843d293 --- /dev/null +++ b/cpp/dolfinx/la/matmul_impl.h @@ -0,0 
+1,528 @@ +// Copyright (C) 2026 Chris Richardson +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include +#include + +#include +#include +#include +#include + +namespace dolfinx::la +{ + +/// @brief Fetch the rows of B that correspond to the ghost columns of A. +/// +/// For computing the product A*B, each rank needs the rows of B whose +/// global indices match the ghost columns of A. Those ghost columns are +/// owned by remote ranks, so we request those rows via neighbourhood +/// communication. +/// +/// @param A Matrix whose ghost column indices determine which rows of B +/// are needed. +/// @param B Matrix whose rows are fetched. +/// @return A new MatrixCSR containing the fetched ghost rows of B, with +/// an extended column IndexMap covering any new ghost columns +/// introduced by those rows. +namespace impl +{ + +// Lightweight struct satisfying SparsityImplementation for MatrixCSR +// constructor. +struct Sparsity +{ + std::shared_ptr _row_map; + std::shared_ptr _col_map; + std::span _cols; + std::span _offsets; + std::span _off_diag; + + std::shared_ptr index_map(int dim) const + { + return dim == 0 ? _row_map : _col_map; + } + + int block_size(int) const { return 1; } + + std::pair, std::span> + graph() const + { + return {_cols, _offsets}; + } + + std::span off_diagonal_offsets() const + { + return _off_diag; + } +}; + +template +std::tuple, std::vector, + std::vector, std::vector> +fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, + const dolfinx::la::MatrixCSR& B) +{ + dolfinx::common::Timer tsp1("{*}fetch"); + // The ghost columns of A are global row indices into B. + auto col_map_A = A.index_map(1); // column IndexMap of A + auto row_map_B = B.index_map(0); // row IndexMap of B + + // Serial fast-path: with a single rank there are no ghost columns and + // no MPI communication to perform. 
We must NOT apply this check per-rank + // in a parallel run: some ranks may have no ghost columns while others do, + // and the neighbourhood collectives below must be reached by every rank. + { + int comm_size = 1; + MPI_Comm_size(col_map_A->comm(), &comm_size); + if (comm_size == 1) + { + auto col_map_B = B.index_map(1); + auto new_col_map = std::make_shared( + col_map_B->comm(), col_map_B->size_local(), col_map_B->ghosts(), + col_map_B->owners()); + return {new_col_map, std::vector{0}, + std::vector{}, std::vector{}}; + } + } + + // Create neighborhood comms for col_map_A + std::span src = col_map_A->src(); + std::span dest = col_map_A->dest(); + MPI_Comm comm = col_map_A->comm(); + + // No-neighbour parallel fast-path: A has no ghost columns (src empty) and + // no rank ghosts our A-owned columns (dest empty), so no remote rows of B + // are needed and no rank is waiting for us to participate in a collective. + // Symmetric topology guarantees the early return is safe without any global + // synchronisation (same argument as for transpose()). + // Use col_map_B — not col_map_A — for new_col_map so that any existing + // ghost columns in B are preserved in the product's column map. 
+ if (src.empty() && dest.empty()) + { + auto col_map_B = B.index_map(1); + auto new_col_map = std::make_shared( + col_map_B->comm(), col_map_B->size_local(), col_map_B->ghosts(), + col_map_B->owners()); + return {new_col_map, std::vector{0}, + std::vector{}, std::vector{}}; + } + + std::map src_rank_to_nbr; + for (std::size_t i = 0; i < src.size(); ++i) + src_rank_to_nbr.insert({src[i], i}); + MPI_Comm neigh_comm_fwd; + MPI_Comm neigh_comm_rev; + MPI_Dist_graph_create_adjacent(comm, dest.size(), dest.data(), MPI_UNWEIGHTED, + src.size(), src.data(), MPI_UNWEIGHTED, + MPI_INFO_NULL, false, &neigh_comm_fwd); + MPI_Dist_graph_create_adjacent(comm, src.size(), src.data(), MPI_UNWEIGHTED, + dest.size(), dest.data(), MPI_UNWEIGHTED, + MPI_INFO_NULL, false, &neigh_comm_rev); + + // Global indices of the rows of B we need (= ghost cols of A) + std::span required_globals_map = col_map_A->ghosts(); + + // Owning rank for each ghost col of A (= source rank for each needed row) + std::span ghost_owners = col_map_A->owners(); + + std::vector send_count(src.size(), 0); + std::vector recv_count(dest.size(), 0); + + for (int gh : ghost_owners) + ++send_count[src_rank_to_nbr[gh]]; + MPI_Neighbor_alltoall(send_count.data(), 1, MPI_INT, recv_count.data(), 1, + MPI_INT, neigh_comm_fwd); + + // Send and recv displacements + std::vector send_disp(src.size() + 1, 0); + std::partial_sum(send_count.begin(), send_count.end(), + std::next(send_disp.begin())); + std::vector recv_disp(dest.size() + 1, 0); + std::partial_sum(recv_count.begin(), recv_count.end(), + std::next(recv_disp.begin())); + + // perm[i] = position in the send buffer (and received data) of original + // ghost column i, so we can invert the ordering after receiving. 
+ std::vector perm(ghost_owners.size()); + std::vector required_globals(required_globals_map.size()); + for (std::size_t i = 0; i < ghost_owners.size(); ++i) + { + int pos = src_rank_to_nbr[ghost_owners[i]]; + perm[i] = send_disp[pos]; // position before increment + required_globals[send_disp[pos]++] = required_globals_map[i]; + } + + // Reset send_disp + send_disp[0] = 0; + std::partial_sum(send_count.begin(), send_count.end(), + std::next(send_disp.begin())); + + // Send the global row indices we need; receive the indices others need from + // us send buffer = required_globals (already ordered by src[]) + std::vector recv_row_globals(recv_disp.back()); + + MPI_Neighbor_alltoallv(required_globals.data(), send_count.data(), + send_disp.data(), MPI_INT64_T, recv_row_globals.data(), + recv_count.data(), recv_disp.data(), MPI_INT64_T, + neigh_comm_fwd); + + // Convert received global row indices to local row indices in B. + // The received indices are global rows of B that remote ranks need. + // They must all be owned by this rank (by construction). 
+ std::vector recv_row_locals(recv_row_globals.size()); + row_map_B->global_to_local(recv_row_globals, recv_row_locals); + + // Compute size of send_data and communicate + auto row_ptr_B = B.row_ptr(); + + std::vector send_entry_count(dest.size(), 0); + std::vector send_entry_disp(dest.size() + 1, 0); + std::vector recv_entry_count(src.size(), 0); + std::vector recv_entry_disp(src.size() + 1, 0); + assert(recv_count.size() == dest.size()); + for (std::size_t p = 0; p < recv_count.size(); ++p) + { + for (std::int32_t i = recv_disp[p]; i < recv_disp[p + 1]; ++i) + { + int r = recv_row_locals[i]; + send_entry_count[p] += row_ptr_B[r + 1] - row_ptr_B[r]; + } + } + std::partial_sum(send_entry_count.begin(), send_entry_count.end(), + std::next(send_entry_disp.begin())); + + MPI_Neighbor_alltoall(send_entry_count.data(), 1, MPI_INT, + recv_entry_count.data(), 1, MPI_INT, neigh_comm_rev); + + std::partial_sum(recv_entry_count.begin(), recv_entry_count.end(), + std::next(recv_entry_disp.begin())); + + // Pack and send data values + auto values_B = B.values(); + std::vector send_vals(send_entry_disp.back()); + std::vector recv_vals(recv_entry_disp.back()); + std::size_t k = 0; + for (int r : recv_row_locals) + { + std::size_t row_size = row_ptr_B[r + 1] - row_ptr_B[r]; + std::copy(std::next(values_B.begin(), row_ptr_B[r]), + std::next(values_B.begin(), row_ptr_B[r + 1]), + std::next(send_vals.begin(), k)); + k += row_size; + } + MPI_Datatype mpi_T = dolfinx::MPI::mpi_t; + MPI_Neighbor_alltoallv(send_vals.data(), send_entry_count.data(), + send_entry_disp.data(), mpi_T, recv_vals.data(), + recv_entry_count.data(), recv_entry_disp.data(), mpi_T, + neigh_comm_rev); + + // Pack and send column indices + auto cols_B = B.cols(); + auto col_map_B = B.index_map(1); + const std::int32_t num_owned_cols_B = col_map_B->size_local(); + std::span ghost_owners_B = col_map_B->owners(); + + // Send column indices (global indexing) and their owner ranks + std::vector 
send_col_data(send_entry_disp.back()); + std::vector recv_col_data(recv_entry_disp.back()); + std::vector send_col_owners(send_entry_disp.back()); + std::vector recv_col_owners(recv_entry_disp.back()); + + // Send row sizes so receivers can reconstruct the CSR row_ptr + std::vector send_row_size(recv_disp.back()); + std::vector recv_row_size(send_disp.back()); + + k = 0; + int rank = dolfinx::MPI::rank(col_map_A->comm()); + for (std::size_t p = 0; p < dest.size(); ++p) + { + for (std::int32_t i = recv_disp[p]; i < recv_disp[p + 1]; ++i) + { + int r = recv_row_locals[i]; + std::size_t row_size = row_ptr_B[r + 1] - row_ptr_B[r]; + col_map_B->local_to_global( + std::span(std::next(cols_B.begin(), row_ptr_B[r]), row_size), + std::span(std::next(send_col_data.begin(), k), row_size)); + for (std::size_t j = 0; j < row_size; ++j) + { + std::int32_t local_col = cols_B[row_ptr_B[r] + j]; + send_col_owners[k + j] + = (local_col < num_owned_cols_B) + ? rank + : ghost_owners_B[local_col - num_owned_cols_B]; + } + k += row_size; + send_row_size[i] = row_size; + } + } + + MPI_Neighbor_alltoallv(send_col_data.data(), send_entry_count.data(), + send_entry_disp.data(), MPI_INT64_T, + recv_col_data.data(), recv_entry_count.data(), + recv_entry_disp.data(), MPI_INT64_T, neigh_comm_rev); + MPI_Neighbor_alltoallv(send_col_owners.data(), send_entry_count.data(), + send_entry_disp.data(), MPI_INT, + recv_col_owners.data(), recv_entry_count.data(), + recv_entry_disp.data(), MPI_INT, neigh_comm_rev); + MPI_Neighbor_alltoallv(send_row_size.data(), recv_count.data(), + recv_disp.data(), MPI_INT, recv_row_size.data(), + send_count.data(), send_disp.data(), MPI_INT, + neigh_comm_rev); + + MPI_Comm_free(&neigh_comm_fwd); + MPI_Comm_free(&neigh_comm_rev); + + // Build CSR row pointer for the received ghost rows. + // recv_vals and recv_col_data are the values and (global) column indices; + // recv_row_size[i] is the number of entries in ghost row i. 
+ std::vector ghost_row_ptr(recv_row_size.size() + 1, 0); + std::partial_sum(recv_row_size.begin(), recv_row_size.end(), + std::next(ghost_row_ptr.begin())); + + // ghost_row_ptr, recv_vals, recv_col_data now form a CSR structure for + // the ghost rows of B needed by this rank. + + // Collect ghost column indices with their owners. + // Start from the received ghost rows, then add existing ghosts of col_map_B. + // Exclude anything in the locally owned range of col_map_B, then deduplicate. + auto [col_B_local_start, col_B_local_end] = col_map_B->local_range(); + auto is_local = [col_B_local_start, col_B_local_end](std::int64_t idx) + { return idx >= col_B_local_start && idx < col_B_local_end; }; + + // Merge received (global_col, owner) pairs with the existing ghosts. + std::vector> col_owner_pairs; + col_owner_pairs.reserve(recv_col_data.size() + col_map_B->ghosts().size()); + for (std::size_t i = 0; i < recv_col_data.size(); ++i) + if (!is_local(recv_col_data[i])) + col_owner_pairs.push_back({recv_col_data[i], recv_col_owners[i]}); + + std::span existing_ghosts = col_map_B->ghosts(); + std::span existing_ghost_owners = col_map_B->owners(); + for (std::size_t i = 0; i < existing_ghosts.size(); ++i) + col_owner_pairs.push_back({existing_ghosts[i], existing_ghost_owners[i]}); + + // Sort by global index and deduplicate. + std::sort(col_owner_pairs.begin(), col_owner_pairs.end()); + col_owner_pairs.erase( + std::unique(col_owner_pairs.begin(), col_owner_pairs.end()), + col_owner_pairs.end()); + + std::vector unique_cols(col_owner_pairs.size()); + std::vector unique_col_owners(col_owner_pairs.size()); + for (std::size_t i = 0; i < col_owner_pairs.size(); ++i) + { + unique_cols[i] = col_owner_pairs[i].first; + unique_col_owners[i] = col_owner_pairs[i].second; + } + + // Build the extended column IndexMap for B. 
+ auto new_col_map = std::make_shared( + comm, col_map_B->size_local(), unique_cols, unique_col_owners); + + // Convert received global column indices to local indices using new_col_map. + std::vector recv_col_local(recv_col_data.size()); + new_col_map->global_to_local(recv_col_data, recv_col_local); + + // The received data is in send-buffer order (grouped by src rank). + // Reorder it back to the original ghost column order of col_map_A so that + // sparsity_pattern and matmul can index by (j - num_local_rows_B). + const std::size_t num_ghosts = perm.size(); + std::vector orig_ghost_row_ptr(num_ghosts + 1, 0); + for (std::size_t i = 0; i < num_ghosts; ++i) + orig_ghost_row_ptr[i + 1] = recv_row_size[perm[i]]; + std::partial_sum(orig_ghost_row_ptr.begin(), orig_ghost_row_ptr.end(), + orig_ghost_row_ptr.begin()); + + std::vector orig_recv_col_local(recv_col_local.size()); + std::vector orig_recv_vals(recv_vals.size()); + for (std::size_t i = 0; i < num_ghosts; ++i) + { + std::int32_t src_start = ghost_row_ptr[perm[i]]; + std::int32_t src_end = ghost_row_ptr[perm[i] + 1]; + std::int32_t dst_start = orig_ghost_row_ptr[i]; + std::copy(recv_col_local.begin() + src_start, + recv_col_local.begin() + src_end, + orig_recv_col_local.begin() + dst_start); + std::copy(recv_vals.begin() + src_start, recv_vals.begin() + src_end, + orig_recv_vals.begin() + dst_start); + } + + return {new_col_map, orig_ghost_row_ptr, orig_recv_col_local, orig_recv_vals}; +} + +/// @brief Compute the sparsity pattern and values of C = A*B in a single pass. +/// +/// Uses a dense accumulator (one entry per possible output column) to +/// simultaneously detect nonzero columns and accumulate A[i,j]*B[j,k], +/// eliminating the separate value-fill loop and the per-entry lower_bound +/// search that a two-pass approach requires. +/// +/// @param A Left matrix. +/// @param B Right matrix (local rows only). +/// @param new_col_map Extended column IndexMap for C, from fetch_ghost_rows. 
+/// @param ghost_row_ptr CSR row pointer for the ghost rows of B. +/// @param ghost_cols Local column indices (w.r.t. new_col_map) for ghost +/// rows. +/// @param ghost_vals Values for the ghost rows of B. +/// @return (row_ptr, cols, vals) — the CSR structure and values of C. +template +std::tuple, std::vector, std::vector> +matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B, + std::shared_ptr new_col_map, + std::span ghost_row_ptr, + std::span ghost_cols, std::span ghost_vals) +{ + dolfinx::common::Timer tsp("{*}sparsity_and_mult"); + auto col_map_B = B.index_map(1); + const std::int32_t num_local_rows_A = A.index_map(0)->size_local(); + const std::int32_t num_local_rows_B = B.index_map(0)->size_local(); + const std::int32_t num_owned_cols_B = col_map_B->size_local(); + + auto row_ptr_A = A.row_ptr(); + auto offdiag_ptr_A = A.off_diag_offset(); + auto cols_A = A.cols(); + auto vals_A = A.values(); + auto row_ptr_B = B.row_ptr(); + auto cols_B = B.cols(); + auto vals_B = B.values(); + + // Remap B's ghost column indices into new_col_map's index space. + // Owned column indices (0..num_owned_cols_B-1) are identical in both maps. + std::span b_ghost_globals = col_map_B->ghosts(); + std::vector b_ghost_remap(b_ghost_globals.size()); + new_col_map->global_to_local(b_ghost_globals, b_ghost_remap); + + const std::int32_t num_cols_C + = new_col_map->size_local() + + static_cast(new_col_map->ghosts().size()); + + std::vector C_row_ptr = {0}; + C_row_ptr.reserve(num_local_rows_A + 1); + std::vector C_cols; + std::vector C_vals; + + // Dense accumulator: acc[k] holds the running sum for column k. + // in_row[k] marks whether column k was touched in the current row. + // Both are reset via row_cols at the end of each row (O(nnz/row), not + // O(num_cols_C)). 
+ std::vector acc(num_cols_C, T(0)); + std::vector in_row(num_cols_C, false); + std::vector row_cols; + + for (std::int32_t i = 0; i < num_local_rows_A; ++i) + { + row_cols.clear(); + + // Local columns of A → local rows of B. + for (std::int32_t ka = row_ptr_A[i]; ka < offdiag_ptr_A[i]; ++ka) + { + const std::int32_t j = cols_A[ka]; + const T a = vals_A[ka]; + for (std::int32_t kb = row_ptr_B[j]; kb < row_ptr_B[j + 1]; ++kb) + { + const std::int32_t c = cols_B[kb]; + const std::int32_t k + = c < num_owned_cols_B ? c : b_ghost_remap[c - num_owned_cols_B]; + if (!in_row[k]) + { + in_row[k] = true; + row_cols.push_back(k); + } + acc[k] += a * vals_B[kb]; + } + } + + // Ghost columns of A → ghost rows of B. + for (std::int32_t ka = offdiag_ptr_A[i]; ka < row_ptr_A[i + 1]; ++ka) + { + const std::int32_t j = cols_A[ka]; + const T a = vals_A[ka]; + const std::int32_t g = j - num_local_rows_B; + for (std::int32_t kb = ghost_row_ptr[g]; kb < ghost_row_ptr[g + 1]; ++kb) + { + const std::int32_t k = ghost_cols[kb]; + if (!in_row[k]) + { + in_row[k] = true; + row_cols.push_back(k); + } + acc[k] += a * ghost_vals[kb]; + } + } + + // Sort touched indices to satisfy the CSR sorted-column invariant + // required by off_diag_offset and MatrixCSR. + std::sort(row_cols.begin(), row_cols.end()); + + // Flush accumulator to output and reset both acc and in_row. + for (const std::int32_t c : row_cols) + { + C_cols.push_back(c); + C_vals.push_back(acc[c]); + acc[c] = T(0); + in_row[c] = false; + } + C_row_ptr.push_back(static_cast(C_cols.size())); + } + + return {std::move(C_row_ptr), std::move(C_cols), std::move(C_vals)}; +} + +} // namespace impl + +/// @brief Compute C = A*B as a distributed MatrixCSR. +/// +/// @param A Left matrix. +/// @param B Right matrix. +/// @return The product C = A*B as a MatrixCSR with row distribution +/// matching A and column distribution determined by B. 
+template +dolfinx::la::MatrixCSR matmul(const dolfinx::la::MatrixCSR& A, + const dolfinx::la::MatrixCSR& B) +{ + dolfinx::common::Timer t_spgemm("MatrixCSR SpGEMM"); + + // Fetch ghost rows of B needed to multiply against A's ghost columns. + spdlog::info("Fetch remote rows of B in C=A*B"); + auto [new_col_map, ghost_row_ptr, ghost_cols, ghost_vals] + = impl::fetch_ghost_rows(A, B); + + // Single pass: compute sparsity and values simultaneously. + spdlog::info("Compute sparsity and values of C=A*B"); + auto [C_row_ptr_32, C_cols_32, C_vals_vec] = impl::matmul( + A, B, new_col_map, std::span(ghost_row_ptr), + std::span(ghost_cols), + std::span(ghost_vals)); + + const std::int32_t num_local_rows = A.index_map(0)->size_local(); + const std::int32_t num_owned_cols = new_col_map->size_local(); + + // Per-row diagonal block boundary, needed by MatrixCSR constructor. + std::vector off_diag_offsets(num_local_rows); + for (std::int32_t i = 0; i < num_local_rows; ++i) + { + const auto begin = C_cols_32.cbegin() + C_row_ptr_32[i]; + const auto end = C_cols_32.cbegin() + C_row_ptr_32[i + 1]; + off_diag_offsets[i] = static_cast( + std::lower_bound(begin, end, num_owned_cols) - begin); + } + + auto C_row_map = std::make_shared(A.index_map(0)->comm(), + num_local_rows); + std::vector C_row_ptr_64(C_row_ptr_32.begin(), + C_row_ptr_32.end()); + + impl::Sparsity sp{C_row_map, new_col_map, C_cols_32, C_row_ptr_64, + off_diag_offsets}; + dolfinx::la::MatrixCSR C(sp); + std::copy(C_vals_vec.begin(), C_vals_vec.end(), C.values().begin()); + + return C; +} + +} // namespace dolfinx::la diff --git a/python/dolfinx/la/__init__.py b/python/dolfinx/la/__init__.py index 88054df14a..a4b8d44bd0 100644 --- a/python/dolfinx/la/__init__.py +++ b/python/dolfinx/la/__init__.py @@ -168,6 +168,17 @@ def mult(self, x: Vector, y: Vector, transpose: bool = False) -> None: else: self._cpp_object.mult(x._cpp_object, y._cpp_object) + def matmul(self, B): + """Compute ``C = A * B``, where `A` is this 
matrix. + + Args: + B: Input Matrix + C: Output Matrix + """ + if self.index_map(1).size_local != B.index_map(0).size_local: + raise RuntimeError("Invalid matrix sizes for matmult.") + return self._cpp_object.mult(B._cpp_object) + @property def block_size(self) -> list: """Block sizes for the matrix.""" diff --git a/python/dolfinx/wrappers/dolfinx_wrappers/la.h b/python/dolfinx/wrappers/dolfinx_wrappers/la.h index 4ac7a36730..6427a032f7 100644 --- a/python/dolfinx/wrappers/dolfinx_wrappers/la.h +++ b/python/dolfinx/wrappers/dolfinx_wrappers/la.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -145,6 +146,9 @@ void declare_la_objects(nanobind::module_& m, const std::string& type) }) .def("scatter_reverse", &dolfinx::la::MatrixCSR::scatter_rev) .def("mult", &dolfinx::la::MatrixCSR::mult) + .def("mult", [](const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR&B){ + return dolfinx::la::matmul(A, B); + }) .def("multT", &dolfinx::la::MatrixCSR::multT) .def("to_dense", [](const dolfinx::la::MatrixCSR& self) diff --git a/python/test/unit/la/test_matmul.py b/python/test/unit/la/test_matmul.py new file mode 100644 index 0000000000..9d8da204e7 --- /dev/null +++ b/python/test/unit/la/test_matmul.py @@ -0,0 +1,73 @@ +# +# This file is part of DOLFINx (https://www.fenicsproject.org) +# +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Unit tests for MatrixCSR.""" + +from mpi4py import MPI + +import numpy as np +import pytest +from scipy.sparse import csr_matrix + +from dolfinx import cpp as _cpp +from dolfinx import la +from dolfinx.mesh import create_unit_square + + +def mat_gather(A): + # Gather full matrix on all processes for scipy + nr = A.index_map(0).size_local + gatheredvals = np.concatenate(MPI.COMM_WORLD.allgather(A.data[: A.indptr[nr]])) + gatheredptrs = MPI.COMM_WORLD.allgather(A.indptr[: nr + 1]) + cols = A.index_map(1).local_to_global(A.indices[: A.indptr[nr]]) + gatheredcols = 
np.concatenate(MPI.COMM_WORLD.allgather(cols)) + indptr = gatheredptrs[0] + for i in range(1, len(gatheredptrs)): + indptr = np.concatenate((indptr, (gatheredptrs[i][1:] + indptr[-1]))) + return csr_matrix((gatheredvals, gatheredcols, indptr)) + + +@pytest.mark.parametrize( + "dtype", + [ + np.float32, + np.float64, + np.complex64, + np.complex128, + ], +) +def test_matmul(dtype): + mesh = create_unit_square(MPI.COMM_WORLD, 3, 3) + imap = mesh.topology.index_map(0) + sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) + rows = np.arange(0, imap.size_local) + cols = np.arange(0, imap.size_local + imap.num_ghosts) + sp.insert(rows, cols) + sp.finalize() + + # Identity + A = la.matrix_csr(sp, dtype=dtype) + B = la.matrix_csr(sp, dtype=dtype) + rng = np.random.default_rng(12345) + A.data[:] = rng.random(len(A.data)) + A.scatter_reverse() + B.data[:] = rng.random(len(B.data)) + B.scatter_reverse() + + Ascipy = mat_gather(A) + Bscipy = mat_gather(B) + lr0, lr1 = A.index_map(0).local_range + nr = A.index_map(0).size_local + # Check gathered matrix + assert np.allclose(A.to_dense()[:nr, :], Ascipy.todense()[lr0:lr1]) + lrB0, lrB1 = B.index_map(0).local_range + nrB = B.index_map(0).size_local + # Check gathered matrix + assert np.allclose(B.to_dense()[:nrB, :], Bscipy.todense()[lrB0:lrB1]) + + C = la.MatrixCSR(A.matmul(B)) + lrC0, lrC1 = C.index_map(0).local_range + nrC = C.index_map(0).size_local + Cscipy = Ascipy @ Bscipy + assert np.allclose(C.to_dense()[:nrC, :], Cscipy.todense()[lrC0:lrC1]) From 594c2c0374e3a8e9d0e00467de09d4f267a6e500 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 15:51:51 +0100 Subject: [PATCH 08/23] Update documentation and simplify fetch --- cpp/dolfinx/la/MatrixCSR.h | 13 ++++++++++++- cpp/dolfinx/la/matmul_impl.h | 26 +++++++------------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index a1389c2e87..cd38f3c20d 100644 --- 
a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -491,7 +491,18 @@ class MatrixCSR /// @param[in,out] y Vector to accumulate the result into. void mult(Vector& x, Vector& y) const; - void multT(Vector& x, Vector& y) const; + + /// @brief Compute the product `y += A^T x`. + /// + /// The vectors `x` and `y` must have parallel layouts that are + /// compatible with `A`. In detail, `x` must have the same `IndexMap` as + /// the matrix rows, `A.index_map(0)` and `y` must have the same owned + /// indices as the matrix columns in `A.index_map(1)`. Ghost entries will + /// be overwritten, and owned entries of `y` will be updated. + /// + /// @param[in] x Vector to apply `A^T` to. + /// @param[in,out] y Vector to accumulate the result into. + void multT(Vector& x, Vector& y) const; /// @brief Get MPI communicator that matrix is defined on. diff --git a/cpp/dolfinx/la/matmul_impl.h b/cpp/dolfinx/la/matmul_impl.h index 086843d293..d473d18a57 100644 --- a/cpp/dolfinx/la/matmul_impl.h +++ b/cpp/dolfinx/la/matmul_impl.h @@ -72,29 +72,13 @@ fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, auto col_map_A = A.index_map(1); // column IndexMap of A auto row_map_B = B.index_map(0); // row IndexMap of B - // Serial fast-path: with a single rank there are no ghost columns and - // no MPI communication to perform. We must NOT apply this check per-rank - // in a parallel run: some ranks may have no ghost columns while others do, - // and the neighbourhood collectives below must be reached by every rank. 
- { - int comm_size = 1; - MPI_Comm_size(col_map_A->comm(), &comm_size); - if (comm_size == 1) - { - auto col_map_B = B.index_map(1); - auto new_col_map = std::make_shared( - col_map_B->comm(), col_map_B->size_local(), col_map_B->ghosts(), - col_map_B->owners()); - return {new_col_map, std::vector{0}, - std::vector{}, std::vector{}}; - } - } - // Create neighborhood comms for col_map_A std::span src = col_map_A->src(); std::span dest = col_map_A->dest(); MPI_Comm comm = col_map_A->comm(); + // Serial fast-path: with a single rank there are no ghost columns and + // no MPI communication to perform. // No-neighbour parallel fast-path: A has no ghost columns (src empty) and // no rank ghosts our A-owned columns (dest empty), so no remote rows of B // are needed and no rank is waiting for us to participate in a collective. @@ -102,7 +86,9 @@ fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, // synchronisation (same argument as for transpose()). // Use col_map_B — not col_map_A — for new_col_map so that any existing // ghost columns in B are preserved in the product's column map. - if (src.empty() && dest.empty()) + + int comm_size = dolfinx::MPI::size(col_map_A->comm()); + if (comm_size == 1 or (src.empty() && dest.empty())) { auto col_map_B = B.index_map(1); auto new_col_map = std::make_shared( @@ -481,6 +467,8 @@ matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B, /// @param B Right matrix. /// @return The product C = A*B as a MatrixCSR with row distribution /// matching A and column distribution determined by B. +/// The row IndexMap of C has no ghosts, and the column IndexMap of C +/// will generally be a larger superset of the column IndexMap of B. 
template dolfinx::la::MatrixCSR matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B) From 0eca1511bbe54ec03e7c6e57c7a3ee6876055126 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 16:53:26 +0100 Subject: [PATCH 09/23] bug fix --- cpp/dolfinx/la/Vector.h | 29 +---------------------------- cpp/dolfinx/la/superlu_dist.cpp | 2 +- 2 files changed, 2 insertions(+), 29 deletions(-) diff --git a/cpp/dolfinx/la/Vector.h b/cpp/dolfinx/la/Vector.h index e29e6f38b1..afbdb10cd2 100644 --- a/cpp/dolfinx/la/Vector.h +++ b/cpp/dolfinx/la/Vector.h @@ -354,33 +354,6 @@ class Vector _x.begin()); } - /// @brief End scatter (send) of local data values - /// to the owning process and assign/accumulate into the owned data entries - /// (simplified CPU version). - /// - /// Suitable for scatter operations on a CPU with `std::vector` - /// storage. The receive buffer is unpacked internally by a function - /// that is suitable for use on a CPU. - /// - /// @note Collective MPI operation. - template - void scatter_rev_end(BinaryOperation op) - requires requires(Container c) { - { c.data() } -> std::same_as; - } - { - // Inline the kernel overload directly to avoid ambiguity: the lambda - // produced by get_unpack_op satisfies both VectorPackKernel and the - // unconstrained BinaryOperation parameter, making a delegating call to - // scatter_rev_end(get_unpack_op(op)) ambiguous. - auto unpack = get_unpack_op(op); - _scatterer->scatter_end(_request); - unpack(_scatterer->local_indices().begin(), - _scatterer->local_indices().end(), _buffer_local.begin(), - _x.begin()); - } - - /// @brief Scatter (send) of ghost data values to the owning process /// and assign/accumulate into the owned data entries (simplified CPU /// version). 
@@ -399,7 +372,7 @@ class Vector void scatter_rev(BinaryOperation op) { this->scatter_rev_begin(); - this->scatter_rev_end(op); + this->scatter_rev_end(get_unpack_op(op)); } /// Get IndexMap diff --git a/cpp/dolfinx/la/superlu_dist.cpp b/cpp/dolfinx/la/superlu_dist.cpp index eb72f13c3f..8cd3a88dd1 100644 --- a/cpp/dolfinx/la/superlu_dist.cpp +++ b/cpp/dolfinx/la/superlu_dist.cpp @@ -458,7 +458,7 @@ void SuperLUDistSolver::set_option(std::string name, std::string value) } else { - std::runtime_error("Unsupported option name"); + throw std::runtime_error("Unsupported option name"); } } //---------------------------------------------------------------------------- From 254d5616ee2fd2eaff49b1dfc5d7cd9a96549897 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 17:31:20 +0100 Subject: [PATCH 10/23] Update header --- cpp/dolfinx/la/matmul_impl.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/dolfinx/la/matmul_impl.h b/cpp/dolfinx/la/matmul_impl.h index d473d18a57..d7c5f586a8 100644 --- a/cpp/dolfinx/la/matmul_impl.h +++ b/cpp/dolfinx/la/matmul_impl.h @@ -1,5 +1,8 @@ // Copyright (C) 2026 Chris Richardson -// SPDX-License-Identifier: MIT +// +// This file is part of DOLFINx (https://www.fenicsproject.org) +// +// SPDX-License-Identifier: LGPL-3.0-or-later #pragma once From d3d760008f834bc5350f6c96d4570d72a5d39427 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 17:32:05 +0100 Subject: [PATCH 11/23] update copyright --- python/test/unit/la/test_matmul.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/test/unit/la/test_matmul.py b/python/test/unit/la/test_matmul.py index 9d8da204e7..97e748715f 100644 --- a/python/test/unit/la/test_matmul.py +++ b/python/test/unit/la/test_matmul.py @@ -1,3 +1,4 @@ +# Copyright (C) 2026 Chris Richardson # # This file is part of DOLFINx (https://www.fenicsproject.org) # From 463f9e6afd79c15c2cf101c900b729e962782301 Mon Sep 17 00:00:00 2001 From: Chris Richardson 
Date: Fri, 17 Apr 2026 17:35:58 +0100 Subject: [PATCH 12/23] Factor out mat_gather into conftest.py --- python/test/unit/la/conftest.py | 33 +++++++++++++++++++++++ python/test/unit/la/test_matmul.py | 16 +---------- python/test/unit/la/test_matrix_vector.py | 18 ++----------- 3 files changed, 36 insertions(+), 31 deletions(-) create mode 100644 python/test/unit/la/conftest.py diff --git a/python/test/unit/la/conftest.py b/python/test/unit/la/conftest.py new file mode 100644 index 0000000000..c431b44f6f --- /dev/null +++ b/python/test/unit/la/conftest.py @@ -0,0 +1,33 @@ +# Copyright (C) 2026 Chris Richardson +# +# This file is part of DOLFINx (https://www.fenicsproject.org) +# +# SPDX-License-Identifier: LGPL-3.0-or-later + +"""Shared pytest fixtures for dolfinx.la unit tests.""" + +from mpi4py import MPI + +import numpy as np +import pytest +from scipy.sparse import csr_matrix + + +@pytest.fixture +def mat_gather(): + """Return a function that gathers a distributed MatrixCSR onto all + processes as a scipy CSR matrix (in global column indexing). 
+ """ + + def _mat_gather(A): + nr = A.index_map(0).size_local + gatheredvals = np.concatenate(MPI.COMM_WORLD.allgather(A.data[: A.indptr[nr]])) + gatheredptrs = MPI.COMM_WORLD.allgather(A.indptr[: nr + 1]) + cols = A.index_map(1).local_to_global(A.indices[: A.indptr[nr]]) + gatheredcols = np.concatenate(MPI.COMM_WORLD.allgather(cols)) + indptr = gatheredptrs[0] + for i in range(1, len(gatheredptrs)): + indptr = np.concatenate((indptr, (gatheredptrs[i][1:] + indptr[-1]))) + return csr_matrix((gatheredvals, gatheredcols, indptr)) + + return _mat_gather diff --git a/python/test/unit/la/test_matmul.py b/python/test/unit/la/test_matmul.py index 97e748715f..a74073fd70 100644 --- a/python/test/unit/la/test_matmul.py +++ b/python/test/unit/la/test_matmul.py @@ -9,26 +9,12 @@ import numpy as np import pytest -from scipy.sparse import csr_matrix from dolfinx import cpp as _cpp from dolfinx import la from dolfinx.mesh import create_unit_square -def mat_gather(A): - # Gather full matrix on all processes for scipy - nr = A.index_map(0).size_local - gatheredvals = np.concatenate(MPI.COMM_WORLD.allgather(A.data[: A.indptr[nr]])) - gatheredptrs = MPI.COMM_WORLD.allgather(A.indptr[: nr + 1]) - cols = A.index_map(1).local_to_global(A.indices[: A.indptr[nr]]) - gatheredcols = np.concatenate(MPI.COMM_WORLD.allgather(cols)) - indptr = gatheredptrs[0] - for i in range(1, len(gatheredptrs)): - indptr = np.concatenate((indptr, (gatheredptrs[i][1:] + indptr[-1]))) - return csr_matrix((gatheredvals, gatheredcols, indptr)) - - @pytest.mark.parametrize( "dtype", [ @@ -38,7 +24,7 @@ def mat_gather(A): np.complex128, ], ) -def test_matmul(dtype): +def test_matmul(dtype, mat_gather): mesh = create_unit_square(MPI.COMM_WORLD, 3, 3) imap = mesh.topology.index_map(0) sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) diff --git a/python/test/unit/la/test_matrix_vector.py b/python/test/unit/la/test_matrix_vector.py index e064b6b04a..79ad1b3f0e 100644 --- 
a/python/test/unit/la/test_matrix_vector.py +++ b/python/test/unit/la/test_matrix_vector.py @@ -9,7 +9,6 @@ import numpy as np import pytest -from scipy.sparse import csr_matrix from dolfinx import cpp as _cpp from dolfinx import la @@ -17,19 +16,6 @@ from dolfinx.mesh import create_unit_square -def mat_gather(A): - # Gather full matrix on all processes for scipy - nr = A.index_map(0).size_local - gatheredvals = np.concatenate(MPI.COMM_WORLD.allgather(A.data[: A.indptr[nr]])) - gatheredptrs = MPI.COMM_WORLD.allgather(A.indptr[: nr + 1]) - cols = A.index_map(1).local_to_global(A.indices[: A.indptr[nr]]) - gatheredcols = np.concatenate(MPI.COMM_WORLD.allgather(cols)) - indptr = gatheredptrs[0] - for i in range(1, len(gatheredptrs)): - indptr = np.concatenate((indptr, (gatheredptrs[i][1:] + indptr[-1]))) - return csr_matrix((gatheredvals, gatheredcols, indptr)) - - def test_create_matrix_csr(): """Test creation of CSR matrix with specified types.""" mesh = create_unit_square(MPI.COMM_WORLD, 10, 11) @@ -67,7 +53,7 @@ def test_create_matrix_csr(): np.complex128, ], ) -def test_matvec(dtype): +def test_matvec(dtype, mat_gather): mesh = create_unit_square(MPI.COMM_WORLD, 3, 3) imap = mesh.topology.index_map(0) sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) @@ -109,7 +95,7 @@ def test_matvec(dtype): np.complex128, ], ) -def test_matvec_transpose(dtype): +def test_matvec_transpose(dtype, mat_gather): mesh = create_unit_square(MPI.COMM_WORLD, 3, 3) imap = mesh.topology.index_map(0) sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) From f905bb6bd4f5aca71f13f57e23d6bc4952284acd Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 17:39:49 +0100 Subject: [PATCH 13/23] docs --- cpp/dolfinx/la/matrix_csr_impl.h | 44 ++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h index 8af8c903fb..6190331279 100644 --- 
a/cpp/dolfinx/la/matrix_csr_impl.h +++ b/cpp/dolfinx/la/matrix_csr_impl.h @@ -260,17 +260,39 @@ void spmv(std::span values, std::span row_begin, } -/// @brief Sparse matrix-vector transpose product implementation. -/// @tparam T -/// @tparam BS1 -/// @param values -/// @param row_begin -/// @param row_end -/// @param indices -/// @param x -/// @param y -/// @param bs0 -/// @param bs1 +/// @brief Sparse matrix-vector transpose product implementation. +/// +/// Computes `y += A^T x` where A is given in CSR format. The transpose +/// is applied implicitly: for each nonzero A(i, indices[j]) the +/// contribution `A(i,j) * x[i]` is scattered into `y[indices[j]]`. +/// +/// @note `y` is accumulated into (not overwritten). Callers should +/// zero `y` before the first call if a fresh result is required. +/// +/// @note The value block layout is column-major within each block: for +/// block entry `j` the element at row-offset `k0` and column-offset `k1` +/// is stored at `values[j * bs1 * bs0 + k1 * bs0 + k0]`. +/// +/// @tparam T Scalar type of the matrix and vector entries. +/// @tparam BS1 Compile-time column block size. Pass `-1` to use the +/// runtime value `bs1` instead. +/// +/// @param[in] values Nonzero values of A, stored block-column-major. +/// Length: `nnz * bs0 * bs1`. +/// @param[in] row_begin Start positions in `values`/`indices` for each +/// row of A. Length: number of rows of A. +/// @param[in] row_end End positions in `values`/`indices` for each +/// row of A. Length: number of rows of A. +/// @param[in] indices Column indices of each nonzero block entry of A. +/// Length: `nnz`. +/// @param[in] x Input vector, indexed by the *rows* of A. +/// Length: `num_rows * bs0`. +/// @param[in,out] y Output vector, indexed by the *columns* of A, +/// accumulated into. +/// Length: `num_cols * bs1`. +/// @param[in] bs0 Row block size (runtime value). +/// @param[in] bs1 Column block size (runtime value, used when +/// `BS1 == -1`). 
template void spmvT(std::span values, std::span row_begin, std::span row_end, From 5bb2cb298601547191cd1d4448925f3a6b35628d Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 17:43:34 +0100 Subject: [PATCH 14/23] wrapping --- python/dolfinx/la/__init__.py | 2 +- python/test/unit/la/test_matmul.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dolfinx/la/__init__.py b/python/dolfinx/la/__init__.py index a4b8d44bd0..d6fcd57897 100644 --- a/python/dolfinx/la/__init__.py +++ b/python/dolfinx/la/__init__.py @@ -177,7 +177,7 @@ def matmul(self, B): """ if self.index_map(1).size_local != B.index_map(0).size_local: raise RuntimeError("Invalid matrix sizes for matmult.") - return self._cpp_object.mult(B._cpp_object) + return MatrixCSR(self._cpp_object.mult(B._cpp_object)) @property def block_size(self) -> list: diff --git a/python/test/unit/la/test_matmul.py b/python/test/unit/la/test_matmul.py index a74073fd70..e6a75058ea 100644 --- a/python/test/unit/la/test_matmul.py +++ b/python/test/unit/la/test_matmul.py @@ -53,7 +53,7 @@ def test_matmul(dtype, mat_gather): # Check gathered matrix assert np.allclose(B.to_dense()[:nrB, :], Bscipy.todense()[lrB0:lrB1]) - C = la.MatrixCSR(A.matmul(B)) + C = A.matmul(B) lrC0, lrC1 = C.index_map(0).local_range nrC = C.index_map(0).size_local Cscipy = Ascipy @ Bscipy From 82bc93af1e81a76ac5d9d6e689e4dde5903bd4ee Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 21:28:12 +0100 Subject: [PATCH 15/23] revert for this PR --- cpp/dolfinx/la/superlu_dist.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/dolfinx/la/superlu_dist.cpp b/cpp/dolfinx/la/superlu_dist.cpp index 8cd3a88dd1..eb72f13c3f 100644 --- a/cpp/dolfinx/la/superlu_dist.cpp +++ b/cpp/dolfinx/la/superlu_dist.cpp @@ -458,7 +458,7 @@ void SuperLUDistSolver::set_option(std::string name, std::string value) } else { - throw std::runtime_error("Unsupported option name"); + 
std::runtime_error("Unsupported option name"); } } //---------------------------------------------------------------------------- From 6f18d46212083cefde7f5da01bafef4633cab92a Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 21:37:47 +0100 Subject: [PATCH 16/23] clang-format --- cpp/dolfinx/la/MatrixCSR.h | 19 ++++++++++--------- cpp/dolfinx/la/matrix_csr_impl.h | 18 ++++++++---------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index f7e51685ab..cf54b04b93 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -903,13 +903,13 @@ void MatrixCSR::mult(la::Vector& x, /// @param y template void MatrixCSR::multT(la::Vector& x, - la::Vector& y) const + la::Vector& y) const { std::int32_t nrowslocal = num_owned_rows(); std::span Arow_ptr(row_ptr().data(), nrowslocal + 1); std::span Acols(cols().data(), Arow_ptr[nrowslocal]); std::span Aoff_diag_offset(off_diag_offset().data(), - nrowslocal); + nrowslocal); std::span Avalues(values().data(), Arow_ptr[nrowslocal]); std::span _x = x.array(); @@ -922,19 +922,20 @@ void MatrixCSR::multT(la::Vector& x, int ncolslocal = index_map(1)->size_local(); std::fill(std::next(_y.begin(), ncolslocal), _y.end(), 0.0); if (_bs[1] == 1) - impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], 1); + impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, + _bs[0], 1); else - impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); + impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, + _bs[0], _bs[1]); y.scatter_rev(std::plus{}); if (_bs[1] == 1) - impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], 1); + impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, + _bs[0], 1); else - impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); - + impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, + _y, 
_bs[0], _bs[1]); } - - } // namespace dolfinx::la diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h index 6190331279..2a6fa5bb1a 100644 --- a/cpp/dolfinx/la/matrix_csr_impl.h +++ b/cpp/dolfinx/la/matrix_csr_impl.h @@ -259,7 +259,6 @@ void spmv(std::span values, std::span row_begin, } } - /// @brief Sparse matrix-vector transpose product implementation. /// /// Computes `y += A^T x` where A is given in CSR format. The transpose @@ -295,9 +294,9 @@ void spmv(std::span values, std::span row_begin, /// `BS1 == -1`). template void spmvT(std::span values, std::span row_begin, - std::span row_end, - std::span indices, std::span x, - std::span y, int bs0, int bs1) + std::span row_end, + std::span indices, std::span x, + std::span y, int bs0, int bs1) { assert(row_begin.size() == row_end.size()); @@ -308,20 +307,20 @@ void spmvT(std::span values, std::span row_begin, const T xval = x[i * bs0 + k0]; for (std::int32_t j = row_begin[i]; j < row_end[i]; j++) { - if constexpr(BS1 == -1) + if constexpr (BS1 == -1) { for (int k1 = 0; k1 < bs1; ++k1) { - y[indices[j] * bs1 + k1] += values[j * bs1 * bs0 + k1 * bs0 + k0] - * xval; + y[indices[j] * bs1 + k1] + += values[j * bs1 * bs0 + k1 * bs0 + k0] * xval; } } else { for (int k1 = 0; k1 < BS1; ++k1) { - y[indices[j] * BS1 + k1] += values[j * BS1 * bs0 + k1 * bs0 + k0] - * xval; + y[indices[j] * BS1 + k1] + += values[j * BS1 * bs0 + k1 * bs0 + k0] * xval; } } } @@ -329,6 +328,5 @@ void spmvT(std::span values, std::span row_begin, } } - } // namespace impl } // namespace dolfinx::la From 17b342b64e02bd6d0b838b78cf53198824b5e738 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 21:39:17 +0100 Subject: [PATCH 17/23] And python wrappers --- python/dolfinx/wrappers/dolfinx_wrappers/la.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/dolfinx/wrappers/dolfinx_wrappers/la.h b/python/dolfinx/wrappers/dolfinx_wrappers/la.h index 6427a032f7..9085b2142d 100644 
--- a/python/dolfinx/wrappers/dolfinx_wrappers/la.h +++ b/python/dolfinx/wrappers/dolfinx_wrappers/la.h @@ -11,8 +11,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -146,9 +146,9 @@ void declare_la_objects(nanobind::module_& m, const std::string& type) }) .def("scatter_reverse", &dolfinx::la::MatrixCSR::scatter_rev) .def("mult", &dolfinx::la::MatrixCSR::mult) - .def("mult", [](const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR&B){ - return dolfinx::la::matmul(A, B); - }) + .def("mult", [](const dolfinx::la::MatrixCSR& A, + const dolfinx::la::MatrixCSR& B) + { return dolfinx::la::matmul(A, B); }) .def("multT", &dolfinx::la::MatrixCSR::multT) .def("to_dense", [](const dolfinx::la::MatrixCSR& self) From fcf3e3a90e89e04f2738d8a7091290c3109b1090 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 23:02:33 +0100 Subject: [PATCH 18/23] documentation --- cpp/dolfinx/la/CMakeLists.txt | 2 +- cpp/dolfinx/la/MatrixCSR.h | 41 ++++-- cpp/dolfinx/la/{matmul_impl.h => matmul.h} | 124 ++++++++++++------ python/dolfinx/wrappers/dolfinx_wrappers/la.h | 2 +- 4 files changed, 114 insertions(+), 55 deletions(-) rename cpp/dolfinx/la/{matmul_impl.h => matmul.h} (82%) diff --git a/cpp/dolfinx/la/CMakeLists.txt b/cpp/dolfinx/la/CMakeLists.txt index 48c97917f3..965e7cfe30 100644 --- a/cpp/dolfinx/la/CMakeLists.txt +++ b/cpp/dolfinx/la/CMakeLists.txt @@ -2,7 +2,7 @@ set(HEADERS_la ${CMAKE_CURRENT_SOURCE_DIR}/dolfinx_la.h ${CMAKE_CURRENT_SOURCE_DIR}/MatrixCSR.h ${CMAKE_CURRENT_SOURCE_DIR}/matrix_csr_impl.h - ${CMAKE_CURRENT_SOURCE_DIR}/matmul_impl.h + ${CMAKE_CURRENT_SOURCE_DIR}/matmul.h ${CMAKE_CURRENT_SOURCE_DIR}/SparsityPattern.h ${CMAKE_CURRENT_SOURCE_DIR}/superlu_dist.h ${CMAKE_CURRENT_SOURCE_DIR}/Vector.h diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index cf54b04b93..2f81cee27b 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -493,14 +493,36 @@ class MatrixCSR /// 
@brief Compute the product `y += A^T x`. /// - /// The vectors `x` and `y` must have parallel layouts that are - /// compatible with `A`. In detail, `x` must have the same `IndexMap` as - /// the matrix rows, `A.index_map(0)` and `y` must have the same owned - /// indices as the matrix columns in `A.index_map(1)`. Ghost entries will - /// be overwritten, and owned entries of `y` will be updated. + /// Performs the distributed sparse matrix–vector product with the + /// *transpose* of the matrix. The computation is split into two phases: /// - /// @param[in] x Vector to apply `A^T` to. - /// @param[in,out] y Vector to accumulate the result into. + /// 1. **Off-diagonal phase** — contributions from the off-diagonal block + /// (ghost columns of A, i.e. columns owned by remote ranks) are + /// accumulated into the ghost region of `y`. The ghost entries of + /// `y` are zeroed before this phase so that stale values do not + /// pollute the result. A reverse scatter (`scatter_rev`) then + /// reduces those ghost contributions back to the owning ranks. + /// + /// 2. **Diagonal phase** — contributions from the diagonal block + /// (locally owned columns of A) are accumulated into the owned + /// entries of `y`. + /// + /// **Layout requirements:** + /// - `x` must share the same `IndexMap` as the matrix *rows*, + /// `A.index_map(0)`. All ghost entries of `x` must be up-to-date + /// before calling (the implementation reads `x.array()` directly + /// without a forward scatter). + /// - `y` must share the same *owned* indices as the matrix *columns*, + /// `A.index_map(1)`. Only owned entries of `y` are meaningful after + /// the call; ghost entries are used as scratch and left in an + /// unspecified state. + /// + /// @note `y` is accumulated *into* (not overwritten) on the owned + /// entries. Zero `y` before the first call if a fresh result is + /// required. + /// + /// @param[in] x Vector to apply `A^T` to; must cover the row space of A. 
+ /// @param[in,out] y Vector accumulated into; covers the column space of A. void multT(Vector& x, Vector& y) const; /// @brief Get MPI communicator that matrix is defined on. @@ -898,9 +920,8 @@ void MatrixCSR::mult(la::Vector& x, } } -/// @brief Compute transpose y += A^T x -/// @param x -/// @param y +/// @brief Out-of-line implementation of MatrixCSR::multT. +/// @see MatrixCSR::multT for the full specification. template void MatrixCSR::multT(la::Vector& x, la::Vector& y) const diff --git a/cpp/dolfinx/la/matmul_impl.h b/cpp/dolfinx/la/matmul.h similarity index 82% rename from cpp/dolfinx/la/matmul_impl.h rename to cpp/dolfinx/la/matmul.h index d7c5f586a8..45a4f63299 100644 --- a/cpp/dolfinx/la/matmul_impl.h +++ b/cpp/dolfinx/la/matmul.h @@ -35,42 +35,78 @@ namespace dolfinx::la namespace impl { -// Lightweight struct satisfying SparsityImplementation for MatrixCSR -// constructor. +/// @brief Lightweight sparsity descriptor satisfying the +/// `SparsityImplementation` concept required by the `MatrixCSR` +/// constructor. +/// +/// Holds non-owning views into externally managed CSR arrays together with +/// the row and column `IndexMap`s that describe the parallel distribution. +/// It is intended as a short-lived helper: it is constructed from the +/// output of `impl::matmul` and passed to the +/// `MatrixCSR` constructor. +/// +/// @note All spans must remain valid for the lifetime of this object. struct Sparsity { + /// @brief Row `IndexMap` — describes the parallel distribution of rows. std::shared_ptr _row_map; + + /// @brief Column `IndexMap` — describes the parallel distribution of + /// columns, including any ghost columns needed for the + /// off-diagonal block. std::shared_ptr _col_map; + + /// @brief CSR column indices, length `nnz`. Values in + /// `[0, size_local(col) + num_ghosts(col))`, sorted within each row. std::span _cols; + + /// @brief CSR row pointers, length `num_rows + 1`. 
`_offsets[i]` is the + /// start of row `i` in `_cols`; `_offsets[num_rows]` equals `nnz`. std::span _offsets; + + /// @brief Per-row diagonal block size, length `num_rows`. + /// `_off_diag[i]` is the number of entries in row `i` whose column + /// index is strictly less than `_col_map->size_local()`, i.e. the + /// count of entries in the diagonal (owned-column) block. Entries + /// at positions `[_off_diag[i], row_end)` belong to the + /// off-diagonal block. std::span _off_diag; + /// @brief Return the row (`dim == 0`) or column (`dim == 1`) `IndexMap`. std::shared_ptr index_map(int dim) const { return dim == 0 ? _row_map : _col_map; } + /// @brief Return the block size. Always 1 — this struct does not yet support + /// block-structured sparsity patterns. int block_size(int) const { return 1; } + /// @brief Return the CSR graph as `(column_indices, row_pointers)`. std::pair, std::span> graph() const { return {_cols, _offsets}; } + /// @brief Return the per-row diagonal block sizes (see `_off_diag`). std::span off_diagonal_offsets() const { return _off_diag; } }; +/// @brief Fetch the rows of Matrix B which are referenced by the ghost +/// columns of Matrix A. +/// @param A MatrixCSR +/// @param B MatrixCSR +/// @returns Tuple containing [new index map, rowptr, cols, values] for the received rows template std::tuple, std::vector, std::vector, std::vector> fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B) { - dolfinx::common::Timer tsp1("{*}fetch"); // The ghost columns of A are global row indices into B. 
auto col_map_A = A.index_map(1); // column IndexMap of A auto row_map_B = B.index_map(0); // row IndexMap of B @@ -101,9 +137,14 @@ fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, std::vector{}, std::vector{}}; } - std::map src_rank_to_nbr; - for (std::size_t i = 0; i < src.size(); ++i) - src_rank_to_nbr.insert({src[i], i}); + // src is guaranteed sorted (Scatterer asserts this), so use binary search + // rather than a heap-allocated map. + auto rank_to_nbr = [&src](int r) -> int + { + auto it = std::lower_bound(src.begin(), src.end(), r); + assert(it != src.end() && *it == r); + return static_cast(std::distance(src.begin(), it)); + }; MPI_Comm neigh_comm_fwd; MPI_Comm neigh_comm_rev; MPI_Dist_graph_create_adjacent(comm, dest.size(), dest.data(), MPI_UNWEIGHTED, @@ -123,7 +164,7 @@ fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, std::vector recv_count(dest.size(), 0); for (int gh : ghost_owners) - ++send_count[src_rank_to_nbr[gh]]; + ++send_count[rank_to_nbr(gh)]; MPI_Neighbor_alltoall(send_count.data(), 1, MPI_INT, recv_count.data(), 1, MPI_INT, neigh_comm_fwd); @@ -141,7 +182,7 @@ fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, std::vector required_globals(required_globals_map.size()); for (std::size_t i = 0; i < ghost_owners.size(); ++i) { - int pos = src_rank_to_nbr[ghost_owners[i]]; + int pos = rank_to_nbr(ghost_owners[i]); perm[i] = send_disp[pos]; // position before increment required_globals[send_disp[pos]++] = required_globals_map[i]; } @@ -358,19 +399,22 @@ fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, /// @param ghost_cols Local column indices (w.r.t. new_col_map) for ghost /// rows. /// @param ghost_vals Values for the ghost rows of B. -/// @return (row_ptr, cols, vals) — the CSR structure and values of C. +/// @return Tuple (row_ptr, off_diag_offsets, cols, vals). off_diag_offsets[i] +/// is the number of diagonal-block entries in row i, computed during +/// the same sort step that establishes column order. 
template -std::tuple, std::vector, std::vector> +std::tuple, std::vector, + std::vector, std::vector> matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B, std::shared_ptr new_col_map, std::span ghost_row_ptr, std::span ghost_cols, std::span ghost_vals) { - dolfinx::common::Timer tsp("{*}sparsity_and_mult"); auto col_map_B = B.index_map(1); const std::int32_t num_local_rows_A = A.index_map(0)->size_local(); const std::int32_t num_local_rows_B = B.index_map(0)->size_local(); const std::int32_t num_owned_cols_B = col_map_B->size_local(); + const std::int32_t num_owned_cols_C = new_col_map->size_local(); auto row_ptr_A = A.row_ptr(); auto offdiag_ptr_A = A.off_diag_offset(); @@ -390,10 +434,12 @@ matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B, = new_col_map->size_local() + static_cast(new_col_map->ghosts().size()); - std::vector C_row_ptr = {0}; - C_row_ptr.reserve(num_local_rows_A + 1); - std::vector C_cols; - std::vector C_vals; + std::vector row_ptr_C = {0}; + row_ptr_C.reserve(num_local_rows_A + 1); + std::vector off_diag_offsets_C; + off_diag_offsets_C.reserve(num_local_rows_A); + std::vector cols_C; + std::vector vals_C; // Dense accumulator: acc[k] holds the running sum for column k. // in_row[k] marks whether column k was touched in the current row. @@ -448,18 +494,25 @@ matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B, // required by off_diag_offset and MatrixCSR. std::sort(row_cols.begin(), row_cols.end()); + // Diagonal block boundary: count entries in the owned column range. + // row_cols is sorted, so a single lower_bound suffices. + off_diag_offsets_C.push_back(static_cast( + std::lower_bound(row_cols.begin(), row_cols.end(), num_owned_cols_C) + - row_cols.begin())); + // Flush accumulator to output and reset both acc and in_row. 
for (const std::int32_t c : row_cols) { - C_cols.push_back(c); - C_vals.push_back(acc[c]); + cols_C.push_back(c); + vals_C.push_back(acc[c]); acc[c] = T(0); in_row[c] = false; } - C_row_ptr.push_back(static_cast(C_cols.size())); + row_ptr_C.push_back(static_cast(cols_C.size())); } - return {std::move(C_row_ptr), std::move(C_cols), std::move(C_vals)}; + return {std::move(row_ptr_C), std::move(off_diag_offsets_C), + std::move(cols_C), std::move(vals_C)}; } } // namespace impl @@ -468,6 +521,7 @@ matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B, /// /// @param A Left matrix. /// @param B Right matrix. +/// @note Currently only supports block-size 1 matrices. /// @return The product C = A*B as a MatrixCSR with row distribution /// matching A and column distribution determined by B. /// The row IndexMap of C has no ghosts, and the column IndexMap of C @@ -479,37 +533,21 @@ dolfinx::la::MatrixCSR matmul(const dolfinx::la::MatrixCSR& A, dolfinx::common::Timer t_spgemm("MatrixCSR SpGEMM"); // Fetch ghost rows of B needed to multiply against A's ghost columns. - spdlog::info("Fetch remote rows of B in C=A*B"); + spdlog::debug("Fetch remote rows of B in C=A*B"); auto [new_col_map, ghost_row_ptr, ghost_cols, ghost_vals] = impl::fetch_ghost_rows(A, B); - // Single pass: compute sparsity and values simultaneously. - spdlog::info("Compute sparsity and values of C=A*B"); - auto [C_row_ptr_32, C_cols_32, C_vals_vec] = impl::matmul( + // Single pass: compute sparsity, off-diagonal boundaries, and values. + spdlog::debug("Compute sparsity and values of C=A*B"); + auto [C_row_ptr, C_off_diag_offsets, C_cols, C_vals_vec] = impl::matmul( A, B, new_col_map, std::span(ghost_row_ptr), std::span(ghost_cols), std::span(ghost_vals)); - const std::int32_t num_local_rows = A.index_map(0)->size_local(); - const std::int32_t num_owned_cols = new_col_map->size_local(); - - // Per-row diagonal block boundary, needed by MatrixCSR constructor. 
- std::vector off_diag_offsets(num_local_rows); - for (std::int32_t i = 0; i < num_local_rows; ++i) - { - const auto begin = C_cols_32.cbegin() + C_row_ptr_32[i]; - const auto end = C_cols_32.cbegin() + C_row_ptr_32[i + 1]; - off_diag_offsets[i] = static_cast( - std::lower_bound(begin, end, num_owned_cols) - begin); - } - - auto C_row_map = std::make_shared(A.index_map(0)->comm(), - num_local_rows); - std::vector C_row_ptr_64(C_row_ptr_32.begin(), - C_row_ptr_32.end()); - - impl::Sparsity sp{C_row_map, new_col_map, C_cols_32, C_row_ptr_64, - off_diag_offsets}; + auto C_row_map = std::make_shared( + A.index_map(0)->comm(), A.index_map(0)->size_local()); + impl::Sparsity sp{C_row_map, new_col_map, C_cols, C_row_ptr, + C_off_diag_offsets}; dolfinx::la::MatrixCSR C(sp); std::copy(C_vals_vec.begin(), C_vals_vec.end(), C.values().begin()); diff --git a/python/dolfinx/wrappers/dolfinx_wrappers/la.h b/python/dolfinx/wrappers/dolfinx_wrappers/la.h index 9085b2142d..fd670548c6 100644 --- a/python/dolfinx/wrappers/dolfinx_wrappers/la.h +++ b/python/dolfinx/wrappers/dolfinx_wrappers/la.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include From 8aead8505c1ee6bfd5685f555180b4af8f35b1d0 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 23:03:36 +0100 Subject: [PATCH 19/23] clang-format --- cpp/dolfinx/la/matmul.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/dolfinx/la/matmul.h b/cpp/dolfinx/la/matmul.h index 45a4f63299..b5f929ff5a 100644 --- a/cpp/dolfinx/la/matmul.h +++ b/cpp/dolfinx/la/matmul.h @@ -100,7 +100,8 @@ struct Sparsity /// columns of Matrix A. 
/// @param A MatrixCSR /// @param B MatrixCSR -/// @returns Tuple containing [new index map, rowptr, cols, values] for the received rows +/// @returns Tuple containing [new index map, rowptr, cols, values] for the +/// received rows template std::tuple, std::vector, std::vector, std::vector> From 8e67100dccafd53cee9de244870d18ca534998a9 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Sat, 18 Apr 2026 06:42:29 +0100 Subject: [PATCH 20/23] not sure why superlu_dist docs are causing problem... --- cpp/dolfinx/la/superlu_dist.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/dolfinx/la/superlu_dist.h b/cpp/dolfinx/la/superlu_dist.h index a7736bcb0c..86aad636ae 100644 --- a/cpp/dolfinx/la/superlu_dist.h +++ b/cpp/dolfinx/la/superlu_dist.h @@ -42,6 +42,7 @@ class SuperLUDistStructs // the solver class to select the typed set based on T. namespace impl { +/// \cond template constexpr bool always_false_v = false; @@ -55,34 +56,29 @@ struct map template <> struct map { - /// \cond using ScalePermstruct_t = SuperLUDistStructs::dScalePermstruct_t; using LUstruct_t = SuperLUDistStructs::dLUstruct_t; using SOLVEstruct_t = SuperLUDistStructs::dSOLVEstruct_t; - /// \endcond }; /// Map float type to float 'typed' structs template <> struct map { - /// \cond using ScalePermstruct_t = SuperLUDistStructs::sScalePermstruct_t; using LUstruct_t = SuperLUDistStructs::sLUstruct_t; using SOLVEstruct_t = SuperLUDistStructs::sSOLVEstruct_t; - /// \endcond }; /// Map std::complex type to doublecomplex 'typed' structs template <> struct map> { - /// \cond using ScalePermstruct_t = SuperLUDistStructs::zScalePermstruct_t; using LUstruct_t = SuperLUDistStructs::zLUstruct_t; using SOLVEstruct_t = SuperLUDistStructs::zSOLVEstruct_t; - /// \endcond }; +/// \endcond } // namespace impl From ba1d0a3d9306b826e0f0c64d93d150cfbad35a86 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Sun, 19 Apr 2026 14:07:23 +0100 Subject: [PATCH 21/23] int type --- 
cpp/dolfinx/la/matrix_csr_impl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h index 2a6fa5bb1a..6fb1c97628 100644 --- a/cpp/dolfinx/la/matrix_csr_impl.h +++ b/cpp/dolfinx/la/matrix_csr_impl.h @@ -234,7 +234,7 @@ void spmv(std::span values, std::span row_begin, for (std::size_t i = 0; i < row_begin.size(); i++) { T vi{0}; - for (std::int32_t j = row_begin[i]; j < row_end[i]; j++) + for (std::int64_t j = row_begin[i]; j < row_end[i]; j++) { if constexpr (BS1 == -1) { @@ -305,7 +305,7 @@ void spmvT(std::span values, std::span row_begin, for (std::size_t i = 0; i < row_begin.size(); i++) { const T xval = x[i * bs0 + k0]; - for (std::int32_t j = row_begin[i]; j < row_end[i]; j++) + for (std::int64_t j = row_begin[i]; j < row_end[i]; j++) { if constexpr (BS1 == -1) { From 37f66efabeda9c58407dc375cc0214103f571e3d Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Mon, 20 Apr 2026 10:06:54 +0100 Subject: [PATCH 22/23] Add block size --- cpp/dolfinx/la/matmul.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/dolfinx/la/matmul.h b/cpp/dolfinx/la/matmul.h index b5f929ff5a..6521c7c16c 100644 --- a/cpp/dolfinx/la/matmul.h +++ b/cpp/dolfinx/la/matmul.h @@ -72,15 +72,17 @@ struct Sparsity /// off-diagonal block. std::span _off_diag; + /// @brief block size in each direction. + int _bs[2]; + /// @brief Return the row (`dim == 0`) or column (`dim == 1`) `IndexMap`. std::shared_ptr index_map(int dim) const { return dim == 0 ? _row_map : _col_map; } - /// @brief Return the block size. Always 1 — this struct does not yet support - /// block-structured sparsity patterns. - int block_size(int) const { return 1; } + /// @brief Return the block size. + int block_size(int i) const { return _bs[i]; } /// @brief Return the CSR graph as `(column_indices, row_pointers)`. 
std::pair, std::span> @@ -548,7 +550,7 @@ dolfinx::la::MatrixCSR matmul(const dolfinx::la::MatrixCSR& A, auto C_row_map = std::make_shared( A.index_map(0)->comm(), A.index_map(0)->size_local()); impl::Sparsity sp{C_row_map, new_col_map, C_cols, C_row_ptr, - C_off_diag_offsets}; + C_off_diag_offsets, {1, 1}}; dolfinx::la::MatrixCSR C(sp); std::copy(C_vals_vec.begin(), C_vals_vec.end(), C.values().begin()); From 703c6eb76d1a8ab260c9ce4e8a33903404ed1cc0 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Tue, 21 Apr 2026 11:47:02 +0100 Subject: [PATCH 23/23] clang-format --- cpp/dolfinx/la/matmul.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/dolfinx/la/matmul.h b/cpp/dolfinx/la/matmul.h index 6521c7c16c..b220ccc0d5 100644 --- a/cpp/dolfinx/la/matmul.h +++ b/cpp/dolfinx/la/matmul.h @@ -549,8 +549,8 @@ dolfinx::la::MatrixCSR matmul(const dolfinx::la::MatrixCSR& A, auto C_row_map = std::make_shared( A.index_map(0)->comm(), A.index_map(0)->size_local()); - impl::Sparsity sp{C_row_map, new_col_map, C_cols, C_row_ptr, - C_off_diag_offsets, {1, 1}}; + impl::Sparsity sp{C_row_map, new_col_map, C_cols, + C_row_ptr, C_off_diag_offsets, {1, 1}}; dolfinx::la::MatrixCSR C(sp); std::copy(C_vals_vec.begin(), C_vals_vec.end(), C.values().begin());