From 48130822f256dcadadd4107977197622f3df80d2 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Thu, 16 Apr 2026 10:39:20 +0100 Subject: [PATCH 01/23] Matrix multiplication does not change MatrixCSR --- cpp/dolfinx/la/MatrixCSR.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index 6094a32335..58df422174 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -489,7 +489,7 @@ class MatrixCSR /// /// @param[in] x Vector to apply `A` to. /// @param[in,out] y Vector to accumulate the result into. - void mult(Vector& x, Vector& y); + void mult(Vector& x, Vector& y) const; /// @brief Get MPI communicator that matrix is defined on. MPI_Comm comm() const { return _comm.comm(); } @@ -838,7 +838,7 @@ MatrixCSR::MatrixCSR(const SparsityType& p, BlockMode mode) /// x,y template void MatrixCSR::mult(la::Vector& x, - la::Vector& y) + la::Vector& y) const { // start communication (update ghosts) x.scatter_fwd_begin(); From 0b0b958dbfbc08f44edb2c5033f54be0325342ec Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Thu, 16 Apr 2026 16:47:29 +0100 Subject: [PATCH 02/23] Add transpose implementation --- cpp/dolfinx/la/MatrixCSR.h | 29 ++++++++ cpp/dolfinx/la/matrix_csr_impl.h | 37 ++++++++++ python/dolfinx/wrappers/dolfinx_wrappers/la.h | 1 + python/test/unit/la/test_matrix_vector.py | 67 +++++++++++++++++-- 4 files changed, 130 insertions(+), 4 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index 58df422174..87e9806173 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -491,6 +491,9 @@ class MatrixCSR /// @param[in,out] y Vector to accumulate the result into. void mult(Vector& x, Vector& y) const; + void multT(Vector& x, Vector& y) const; + + /// @brief Get MPI communicator that matrix is defined on. 
MPI_Comm comm() const { return _comm.comm(); } @@ -886,4 +889,30 @@ void MatrixCSR::mult(la::Vector& x, } } +template +void MatrixCSR::multT(la::Vector& x, + la::Vector& y) const +{ + std::int32_t nrowslocal = num_owned_rows(); + std::span Arow_ptr(row_ptr().data(), nrowslocal + 1); + std::span Acols(cols().data(), Arow_ptr[nrowslocal]); + std::span Aoff_diag_offset(off_diag_offset().data(), + nrowslocal); + std::span Avalues(values().data(), Arow_ptr[nrowslocal]); + + std::span _x = x.array(); + std::span _y = y.array(); + + std::span Arow_begin(Arow_ptr.data(), nrowslocal); + std::span Arow_end(Arow_ptr.data() + 1, nrowslocal); + + // Compute ghost region contribution and scatter back + int ncolslocal = index_map(1)->size_local(); + std::fill(std::next(_y.begin(), ncolslocal), _y.end(), 0.0); + impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); + y.scatter_rev(std::plus()); + impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); + +} + } // namespace dolfinx::la diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h index 143b6c0d4f..ffed5aedd9 100644 --- a/cpp/dolfinx/la/matrix_csr_impl.h +++ b/cpp/dolfinx/la/matrix_csr_impl.h @@ -259,5 +259,42 @@ void spmv(std::span values, std::span row_begin, } } + +/// @brief Sparse matrix-vector transpose product implementation. 
+/// @tparam T +/// @tparam BS1 +/// @param values +/// @param row_begin +/// @param row_end +/// @param indices +/// @param x +/// @param y +/// @param bs0 +/// @param bs1 +template +void spmvT(std::span values, std::span row_begin, + std::span row_end, + std::span indices, std::span x, + std::span y, int bs0, int bs1) +{ + assert(row_begin.size() == row_end.size()); + for (int k0 = 0; k0 < bs0; ++k0) + { + for (std::size_t i = 0; i < row_begin.size(); i++) + { + const T xval = x[i * bs0 + k0]; + for (std::int32_t j = row_begin[i]; j < row_end[i]; j++) + { + for (int k1 = 0; k1 < bs1; ++k1) + { + y[indices[j] * bs1 + k1] += values[j * bs1 * bs0 + k1 * bs0 + k0] + * xval; + } + } + } + } +} + + } // namespace impl } // namespace dolfinx::la diff --git a/python/dolfinx/wrappers/dolfinx_wrappers/la.h b/python/dolfinx/wrappers/dolfinx_wrappers/la.h index 57ede85a46..4ac7a36730 100644 --- a/python/dolfinx/wrappers/dolfinx_wrappers/la.h +++ b/python/dolfinx/wrappers/dolfinx_wrappers/la.h @@ -145,6 +145,7 @@ void declare_la_objects(nanobind::module_& m, const std::string& type) }) .def("scatter_reverse", &dolfinx::la::MatrixCSR::scatter_rev) .def("mult", &dolfinx::la::MatrixCSR::mult) + .def("multT", &dolfinx::la::MatrixCSR::multT) .def("to_dense", [](const dolfinx::la::MatrixCSR& self) { diff --git a/python/test/unit/la/test_matrix_vector.py b/python/test/unit/la/test_matrix_vector.py index fc5847a109..13f4d8f267 100644 --- a/python/test/unit/la/test_matrix_vector.py +++ b/python/test/unit/la/test_matrix_vector.py @@ -9,6 +9,7 @@ import numpy as np import pytest +from scipy.sparse import csr_matrix from dolfinx import cpp as _cpp from dolfinx import la @@ -16,6 +17,19 @@ from dolfinx.mesh import create_unit_square +def mat_gather(A): + # Gather full matrix on all processes for scipy + nr = A.index_map(0).size_local + gatheredvals = np.concatenate(MPI.COMM_WORLD.allgather(A.data[: A.indptr[nr]])) + gatheredptrs = MPI.COMM_WORLD.allgather(A.indptr[: nr + 1]) + cols = 
A.index_map(1).local_to_global(A.indices[: A.indptr[nr]]) + gatheredcols = np.concatenate(MPI.COMM_WORLD.allgather(cols)) + indptr = gatheredptrs[0] + for i in range(1, len(gatheredptrs)): + indptr = np.concatenate((indptr, (gatheredptrs[i][1:] + indptr[-1]))) + return csr_matrix((gatheredvals, gatheredcols, indptr)) + + def test_create_matrix_csr(): """Test creation of CSR matrix with specified types.""" mesh = create_unit_square(MPI.COMM_WORLD, 10, 11) @@ -58,22 +72,67 @@ def test_matvec(dtype): imap = mesh.topology.index_map(0) sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) rows = np.arange(0, imap.size_local) - cols = np.arange(0, imap.size_local) + cols = np.arange(0, imap.size_local + imap.num_ghosts) sp.insert(rows, cols) sp.finalize() # Identity A = la.matrix_csr(sp, dtype=dtype) - for i in range(imap.size_local): - A.add(np.array([2.0], dtype=dtype), np.array([i]), np.array([i])) + rng = np.random.default_rng(12345) + A.data[:] = rng.random(len(A.data)) A.scatter_reverse() + Ascipy = mat_gather(A) + lr0, lr1 = A.index_map(0).local_range + nr = A.index_map(0).size_local + # Check gathered matrix + assert np.allclose(A.to_dense()[:nr, :], Ascipy.todense()[lr0:lr1]) + b = la.vector(imap, dtype=dtype) u = la.vector(imap, dtype=dtype) b.array[:] = 1.0 A.mult(b, u) u.scatter_forward() - assert np.allclose(u.array[: imap.size_local], 2.0) + + bs = np.ones(A.index_map(0).size_global) + us = Ascipy @ bs + assert np.allclose(u.array[:nr], us[lr0:lr1]) + + +def test_matvec_transpose(): + dtype = np.float64 + mesh = create_unit_square(MPI.COMM_WORLD, 3, 3) + imap = mesh.topology.index_map(0) + sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) + rows = np.arange(0, imap.size_local) + cols = np.arange(0, imap.size_local + imap.num_ghosts) + sp.insert(rows, cols) + sp.finalize() + + # Identity + A = la.matrix_csr(sp, dtype=dtype) + rng = np.random.default_rng(12345) + A.data[:] = rng.random(len(A.data)) + A.scatter_reverse() + + Ascipy = 
mat_gather(A) + lr0, lr1 = A.index_map(0).local_range + nr = A.index_map(0).size_local + # Check gathered matrix + assert np.allclose(A.to_dense()[:nr, :], Ascipy.todense()[lr0:lr1]) + + b = la.vector(imap, dtype=dtype) + u = la.vector(imap, dtype=dtype) + b.array[:] = 1.0 + A._cpp_object.multT(b._cpp_object, u._cpp_object) + u.scatter_forward() + print(u.array[: imap.size_local]) + # assert np.allclose(u.array[: imap.size_local], 2.0) + + bs = np.ones(A.index_map(0).size_global) + us = Ascipy.T @ bs + print(us) + assert np.allclose(u.array[:nr], us[lr0:lr1]) @pytest.mark.parametrize( From 7c065753ab8a95ef1f7106f408905cd609658cfa Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Thu, 16 Apr 2026 17:38:25 +0100 Subject: [PATCH 03/23] fix up test a bit --- cpp/dolfinx/la/MatrixCSR.h | 15 +++++++++++++-- cpp/dolfinx/la/matrix_csr_impl.h | 12 ++++++++++++ python/dolfinx/la/__init__.py | 10 +++++++--- python/test/unit/la/test_matrix_vector.py | 6 +----- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index 87e9806173..971f18ce6f 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -889,6 +889,9 @@ void MatrixCSR::mult(la::Vector& x, } } +/// @brief Compute transpose y += A^T x +/// @param x +/// @param y template void MatrixCSR::multT(la::Vector& x, la::Vector& y) const @@ -909,9 +912,17 @@ void MatrixCSR::multT(la::Vector& x, // Compute ghost region contribution and scatter back int ncolslocal = index_map(1)->size_local(); std::fill(std::next(_y.begin(), ncolslocal), _y.end(), 0.0); - impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); + if (_bs[1] == 1) + impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); + else + impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); + y.scatter_rev(std::plus()); - impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); + + if 
(_bs[1] == 1) + impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); + else + impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); } diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h index ffed5aedd9..8af8c903fb 100644 --- a/cpp/dolfinx/la/matrix_csr_impl.h +++ b/cpp/dolfinx/la/matrix_csr_impl.h @@ -278,6 +278,7 @@ void spmvT(std::span values, std::span row_begin, std::span y, int bs0, int bs1) { assert(row_begin.size() == row_end.size()); + for (int k0 = 0; k0 < bs0; ++k0) { for (std::size_t i = 0; i < row_begin.size(); i++) @@ -285,11 +286,22 @@ void spmvT(std::span values, std::span row_begin, const T xval = x[i * bs0 + k0]; for (std::int32_t j = row_begin[i]; j < row_end[i]; j++) { + if constexpr(BS1 == -1) + { for (int k1 = 0; k1 < bs1; ++k1) { y[indices[j] * bs1 + k1] += values[j * bs1 * bs0 + k1 * bs0 + k0] * xval; } + } + else + { + for (int k1 = 0; k1 < BS1; ++k1) + { + y[indices[j] * BS1 + k1] += values[j * BS1 * bs0 + k1 * bs0 + k0] + * xval; + } + } } } } diff --git a/python/dolfinx/la/__init__.py b/python/dolfinx/la/__init__.py index 8c3f40aaf4..88054df14a 100644 --- a/python/dolfinx/la/__init__.py +++ b/python/dolfinx/la/__init__.py @@ -155,14 +155,18 @@ def index_map(self, i: int) -> IndexMap: """ return self._cpp_object.index_map(i) - def mult(self, x: Vector, y: Vector) -> None: - """Compute ``y += Ax``. + def mult(self, x: Vector, y: Vector, transpose: bool = False) -> None: + """Compute ``y += Ax`` or ``y += A^T x``. 
Args: x: Input Vector y: Output Vector + transpose: if True, compute y += A^T x """ - self._cpp_object.mult(x._cpp_object, y._cpp_object) + if transpose: + self._cpp_object.multT(x._cpp_object, y._cpp_object) + else: + self._cpp_object.mult(x._cpp_object, y._cpp_object) @property def block_size(self) -> list: diff --git a/python/test/unit/la/test_matrix_vector.py b/python/test/unit/la/test_matrix_vector.py index 13f4d8f267..9175c6ee16 100644 --- a/python/test/unit/la/test_matrix_vector.py +++ b/python/test/unit/la/test_matrix_vector.py @@ -92,7 +92,6 @@ def test_matvec(dtype): u = la.vector(imap, dtype=dtype) b.array[:] = 1.0 A.mult(b, u) - u.scatter_forward() bs = np.ones(A.index_map(0).size_global) us = Ascipy @ bs @@ -124,10 +123,7 @@ def test_matvec_transpose(): b = la.vector(imap, dtype=dtype) u = la.vector(imap, dtype=dtype) b.array[:] = 1.0 - A._cpp_object.multT(b._cpp_object, u._cpp_object) - u.scatter_forward() - print(u.array[: imap.size_local]) - # assert np.allclose(u.array[: imap.size_local], 2.0) + A.mult(b, u, True) bs = np.ones(A.index_map(0).size_global) us = Ascipy.T @ bs From 3fdebefad84af29bf05e4d8709607a85a7f94cd8 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 09:54:37 +0100 Subject: [PATCH 04/23] Update tests --- cpp/dolfinx/la/MatrixCSR.h | 4 ++-- python/test/unit/la/test_matrix_vector.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index 971f18ce6f..cab4e01c5b 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -913,14 +913,14 @@ void MatrixCSR::multT(la::Vector& x, int ncolslocal = index_map(1)->size_local(); std::fill(std::next(_y.begin(), ncolslocal), _y.end(), 0.0); if (_bs[1] == 1) - impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); + impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], 1); else impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, 
_x, _y, _bs[0], _bs[1]); y.scatter_rev(std::plus()); if (_bs[1] == 1) - impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); + impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], 1); else impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); diff --git a/python/test/unit/la/test_matrix_vector.py b/python/test/unit/la/test_matrix_vector.py index 9175c6ee16..c20558dcfe 100644 --- a/python/test/unit/la/test_matrix_vector.py +++ b/python/test/unit/la/test_matrix_vector.py @@ -90,10 +90,11 @@ def test_matvec(dtype): b = la.vector(imap, dtype=dtype) u = la.vector(imap, dtype=dtype) - b.array[:] = 1.0 + b.array[:] = np.arange(len(b.array)) A.mult(b, u) - bs = np.ones(A.index_map(0).size_global) + bs = np.concatenate(mesh.comm.allgather(b.array[:nr])) + print(Ascipy.shape, bs.shape) us = Ascipy @ bs assert np.allclose(u.array[:nr], us[lr0:lr1]) @@ -122,12 +123,11 @@ def test_matvec_transpose(): b = la.vector(imap, dtype=dtype) u = la.vector(imap, dtype=dtype) - b.array[:] = 1.0 + b.array[:] = np.arange(len(b.array)) A.mult(b, u, True) - bs = np.ones(A.index_map(0).size_global) + bs = np.concatenate(mesh.comm.allgather(b.array[:nr])) us = Ascipy.T @ bs - print(us) assert np.allclose(u.array[:nr], us[lr0:lr1]) From 1a541ad4b197bd690b5370d1c981726bb7188811 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 11:32:31 +0100 Subject: [PATCH 05/23] Use non-blocking MPI for y update --- cpp/dolfinx/la/MatrixCSR.h | 4 +++- cpp/dolfinx/la/Vector.h | 29 ++++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index cab4e01c5b..d95ab873c3 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -917,13 +917,15 @@ void MatrixCSR::multT(la::Vector& x, else impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); - y.scatter_rev(std::plus()); + 
y.scatter_rev_begin(); if (_bs[1] == 1) impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], 1); else impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); + y.scatter_rev_end(std::plus()); + } } // namespace dolfinx::la diff --git a/cpp/dolfinx/la/Vector.h b/cpp/dolfinx/la/Vector.h index afbdb10cd2..e29e6f38b1 100644 --- a/cpp/dolfinx/la/Vector.h +++ b/cpp/dolfinx/la/Vector.h @@ -354,6 +354,33 @@ class Vector _x.begin()); } + /// @brief End scatter (send) of local data values + /// to the owning process and assign/accumulate into the owned data entries + /// (simplified CPU version). + /// + /// Suitable for scatter operations on a CPU with `std::vector` + /// storage. The receive buffer is unpacked internally by a function + /// that is suitable for use on a CPU. + /// + /// @note Collective MPI operation. + template + void scatter_rev_end(BinaryOperation op) + requires requires(Container c) { + { c.data() } -> std::same_as; + } + { + // Inline the kernel overload directly to avoid ambiguity: the lambda + // produced by get_unpack_op satisfies both VectorPackKernel and the + // unconstrained BinaryOperation parameter, making a delegating call to + // scatter_rev_end(get_unpack_op(op)) ambiguous. + auto unpack = get_unpack_op(op); + _scatterer->scatter_end(_request); + unpack(_scatterer->local_indices().begin(), + _scatterer->local_indices().end(), _buffer_local.begin(), + _x.begin()); + } + + /// @brief Scatter (send) of ghost data values to the owning process /// and assign/accumulate into the owned data entries (simplified CPU /// version). 
@@ -372,7 +399,7 @@ class Vector void scatter_rev(BinaryOperation op) { this->scatter_rev_begin(); - this->scatter_rev_end(get_unpack_op(op)); + this->scatter_rev_end(op); } /// Get IndexMap From 109843c0e89309758592c62cf8a70e68b83162b9 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 11:37:04 +0100 Subject: [PATCH 06/23] revert and update test --- cpp/dolfinx/la/MatrixCSR.h | 4 +--- python/test/unit/la/test_matrix_vector.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index d95ab873c3..619fa54397 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -917,15 +917,13 @@ void MatrixCSR::multT(la::Vector& x, else impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); - y.scatter_rev_begin(); + y.scatter_rev(); if (_bs[1] == 1) impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], 1); else impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); - y.scatter_rev_end(std::plus()); - } } // namespace dolfinx::la diff --git a/python/test/unit/la/test_matrix_vector.py b/python/test/unit/la/test_matrix_vector.py index c20558dcfe..e064b6b04a 100644 --- a/python/test/unit/la/test_matrix_vector.py +++ b/python/test/unit/la/test_matrix_vector.py @@ -90,6 +90,7 @@ def test_matvec(dtype): b = la.vector(imap, dtype=dtype) u = la.vector(imap, dtype=dtype) + u.array[:] = 0.0 b.array[:] = np.arange(len(b.array)) A.mult(b, u) @@ -99,8 +100,16 @@ def test_matvec(dtype): assert np.allclose(u.array[:nr], us[lr0:lr1]) -def test_matvec_transpose(): - dtype = np.float64 +@pytest.mark.parametrize( + "dtype", + [ + np.float32, + np.float64, + np.complex64, + np.complex128, + ], +) +def test_matvec_transpose(dtype): mesh = create_unit_square(MPI.COMM_WORLD, 3, 3) imap = mesh.topology.index_map(0) sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) @@ -123,6 +132,7 @@ def 
test_matvec_transpose(): b = la.vector(imap, dtype=dtype) u = la.vector(imap, dtype=dtype) + u.array[:] = 0.0 b.array[:] = np.arange(len(b.array)) A.mult(b, u, True) From f822b92ca0cb3391118e556c2cdb8190fc8ef142 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 15:34:32 +0100 Subject: [PATCH 07/23] Add matmul implementation --- cpp/dolfinx/la/CMakeLists.txt | 1 + cpp/dolfinx/la/MatrixCSR.h | 4 +- cpp/dolfinx/la/matmul_impl.h | 528 ++++++++++++++++++ python/dolfinx/la/__init__.py | 11 + python/dolfinx/wrappers/dolfinx_wrappers/la.h | 4 + python/test/unit/la/test_matmul.py | 73 +++ 6 files changed, 620 insertions(+), 1 deletion(-) create mode 100644 cpp/dolfinx/la/matmul_impl.h create mode 100644 python/test/unit/la/test_matmul.py diff --git a/cpp/dolfinx/la/CMakeLists.txt b/cpp/dolfinx/la/CMakeLists.txt index eeebf8897d..48c97917f3 100644 --- a/cpp/dolfinx/la/CMakeLists.txt +++ b/cpp/dolfinx/la/CMakeLists.txt @@ -2,6 +2,7 @@ set(HEADERS_la ${CMAKE_CURRENT_SOURCE_DIR}/dolfinx_la.h ${CMAKE_CURRENT_SOURCE_DIR}/MatrixCSR.h ${CMAKE_CURRENT_SOURCE_DIR}/matrix_csr_impl.h + ${CMAKE_CURRENT_SOURCE_DIR}/matmul_impl.h ${CMAKE_CURRENT_SOURCE_DIR}/SparsityPattern.h ${CMAKE_CURRENT_SOURCE_DIR}/superlu_dist.h ${CMAKE_CURRENT_SOURCE_DIR}/Vector.h diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index 619fa54397..a1389c2e87 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -917,7 +917,7 @@ void MatrixCSR::multT(la::Vector& x, else impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); - y.scatter_rev(); + y.scatter_rev(std::plus{}); if (_bs[1] == 1) impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], 1); @@ -926,4 +926,6 @@ void MatrixCSR::multT(la::Vector& x, } + + } // namespace dolfinx::la diff --git a/cpp/dolfinx/la/matmul_impl.h b/cpp/dolfinx/la/matmul_impl.h new file mode 100644 index 0000000000..086843d293 --- /dev/null +++ b/cpp/dolfinx/la/matmul_impl.h @@ -0,0 
+1,528 @@ +// Copyright (C) 2026 Chris Richardson +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include +#include + +#include +#include +#include +#include + +namespace dolfinx::la +{ + +/// @brief Fetch the rows of B that correspond to the ghost columns of A. +/// +/// For computing the product A*B, each rank needs the rows of B whose +/// global indices match the ghost columns of A. Those ghost columns are +/// owned by remote ranks, so we request those rows via neighbourhood +/// communication. +/// +/// @param A Matrix whose ghost column indices determine which rows of B +/// are needed. +/// @param B Matrix whose rows are fetched. +/// @return A new MatrixCSR containing the fetched ghost rows of B, with +/// an extended column IndexMap covering any new ghost columns +/// introduced by those rows. +namespace impl +{ + +// Lightweight struct satisfying SparsityImplementation for MatrixCSR +// constructor. +struct Sparsity +{ + std::shared_ptr _row_map; + std::shared_ptr _col_map; + std::span _cols; + std::span _offsets; + std::span _off_diag; + + std::shared_ptr index_map(int dim) const + { + return dim == 0 ? _row_map : _col_map; + } + + int block_size(int) const { return 1; } + + std::pair, std::span> + graph() const + { + return {_cols, _offsets}; + } + + std::span off_diagonal_offsets() const + { + return _off_diag; + } +}; + +template +std::tuple, std::vector, + std::vector, std::vector> +fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, + const dolfinx::la::MatrixCSR& B) +{ + dolfinx::common::Timer tsp1("{*}fetch"); + // The ghost columns of A are global row indices into B. + auto col_map_A = A.index_map(1); // column IndexMap of A + auto row_map_B = B.index_map(0); // row IndexMap of B + + // Serial fast-path: with a single rank there are no ghost columns and + // no MPI communication to perform. 
We must NOT apply this check per-rank + // in a parallel run: some ranks may have no ghost columns while others do, + // and the neighbourhood collectives below must be reached by every rank. + { + int comm_size = 1; + MPI_Comm_size(col_map_A->comm(), &comm_size); + if (comm_size == 1) + { + auto col_map_B = B.index_map(1); + auto new_col_map = std::make_shared( + col_map_B->comm(), col_map_B->size_local(), col_map_B->ghosts(), + col_map_B->owners()); + return {new_col_map, std::vector{0}, + std::vector{}, std::vector{}}; + } + } + + // Create neighborhood comms for col_map_A + std::span src = col_map_A->src(); + std::span dest = col_map_A->dest(); + MPI_Comm comm = col_map_A->comm(); + + // No-neighbour parallel fast-path: A has no ghost columns (src empty) and + // no rank ghosts our A-owned columns (dest empty), so no remote rows of B + // are needed and no rank is waiting for us to participate in a collective. + // Symmetric topology guarantees the early return is safe without any global + // synchronisation (same argument as for transpose()). + // Use col_map_B — not col_map_A — for new_col_map so that any existing + // ghost columns in B are preserved in the product's column map. 
+ if (src.empty() && dest.empty()) + { + auto col_map_B = B.index_map(1); + auto new_col_map = std::make_shared( + col_map_B->comm(), col_map_B->size_local(), col_map_B->ghosts(), + col_map_B->owners()); + return {new_col_map, std::vector{0}, + std::vector{}, std::vector{}}; + } + + std::map src_rank_to_nbr; + for (std::size_t i = 0; i < src.size(); ++i) + src_rank_to_nbr.insert({src[i], i}); + MPI_Comm neigh_comm_fwd; + MPI_Comm neigh_comm_rev; + MPI_Dist_graph_create_adjacent(comm, dest.size(), dest.data(), MPI_UNWEIGHTED, + src.size(), src.data(), MPI_UNWEIGHTED, + MPI_INFO_NULL, false, &neigh_comm_fwd); + MPI_Dist_graph_create_adjacent(comm, src.size(), src.data(), MPI_UNWEIGHTED, + dest.size(), dest.data(), MPI_UNWEIGHTED, + MPI_INFO_NULL, false, &neigh_comm_rev); + + // Global indices of the rows of B we need (= ghost cols of A) + std::span required_globals_map = col_map_A->ghosts(); + + // Owning rank for each ghost col of A (= source rank for each needed row) + std::span ghost_owners = col_map_A->owners(); + + std::vector send_count(src.size(), 0); + std::vector recv_count(dest.size(), 0); + + for (int gh : ghost_owners) + ++send_count[src_rank_to_nbr[gh]]; + MPI_Neighbor_alltoall(send_count.data(), 1, MPI_INT, recv_count.data(), 1, + MPI_INT, neigh_comm_fwd); + + // Send and recv displacements + std::vector send_disp(src.size() + 1, 0); + std::partial_sum(send_count.begin(), send_count.end(), + std::next(send_disp.begin())); + std::vector recv_disp(dest.size() + 1, 0); + std::partial_sum(recv_count.begin(), recv_count.end(), + std::next(recv_disp.begin())); + + // perm[i] = position in the send buffer (and received data) of original + // ghost column i, so we can invert the ordering after receiving. 
+ std::vector perm(ghost_owners.size()); + std::vector required_globals(required_globals_map.size()); + for (std::size_t i = 0; i < ghost_owners.size(); ++i) + { + int pos = src_rank_to_nbr[ghost_owners[i]]; + perm[i] = send_disp[pos]; // position before increment + required_globals[send_disp[pos]++] = required_globals_map[i]; + } + + // Reset send_disp + send_disp[0] = 0; + std::partial_sum(send_count.begin(), send_count.end(), + std::next(send_disp.begin())); + + // Send the global row indices we need; receive the indices others need from + // us send buffer = required_globals (already ordered by src[]) + std::vector recv_row_globals(recv_disp.back()); + + MPI_Neighbor_alltoallv(required_globals.data(), send_count.data(), + send_disp.data(), MPI_INT64_T, recv_row_globals.data(), + recv_count.data(), recv_disp.data(), MPI_INT64_T, + neigh_comm_fwd); + + // Convert received global row indices to local row indices in B. + // The received indices are global rows of B that remote ranks need. + // They must all be owned by this rank (by construction). 
+ std::vector recv_row_locals(recv_row_globals.size()); + row_map_B->global_to_local(recv_row_globals, recv_row_locals); + + // Compute size of send_data and communicate + auto row_ptr_B = B.row_ptr(); + + std::vector send_entry_count(dest.size(), 0); + std::vector send_entry_disp(dest.size() + 1, 0); + std::vector recv_entry_count(src.size(), 0); + std::vector recv_entry_disp(src.size() + 1, 0); + assert(recv_count.size() == dest.size()); + for (std::size_t p = 0; p < recv_count.size(); ++p) + { + for (std::int32_t i = recv_disp[p]; i < recv_disp[p + 1]; ++i) + { + int r = recv_row_locals[i]; + send_entry_count[p] += row_ptr_B[r + 1] - row_ptr_B[r]; + } + } + std::partial_sum(send_entry_count.begin(), send_entry_count.end(), + std::next(send_entry_disp.begin())); + + MPI_Neighbor_alltoall(send_entry_count.data(), 1, MPI_INT, + recv_entry_count.data(), 1, MPI_INT, neigh_comm_rev); + + std::partial_sum(recv_entry_count.begin(), recv_entry_count.end(), + std::next(recv_entry_disp.begin())); + + // Pack and send data values + auto values_B = B.values(); + std::vector send_vals(send_entry_disp.back()); + std::vector recv_vals(recv_entry_disp.back()); + std::size_t k = 0; + for (int r : recv_row_locals) + { + std::size_t row_size = row_ptr_B[r + 1] - row_ptr_B[r]; + std::copy(std::next(values_B.begin(), row_ptr_B[r]), + std::next(values_B.begin(), row_ptr_B[r + 1]), + std::next(send_vals.begin(), k)); + k += row_size; + } + MPI_Datatype mpi_T = dolfinx::MPI::mpi_t; + MPI_Neighbor_alltoallv(send_vals.data(), send_entry_count.data(), + send_entry_disp.data(), mpi_T, recv_vals.data(), + recv_entry_count.data(), recv_entry_disp.data(), mpi_T, + neigh_comm_rev); + + // Pack and send column indices + auto cols_B = B.cols(); + auto col_map_B = B.index_map(1); + const std::int32_t num_owned_cols_B = col_map_B->size_local(); + std::span ghost_owners_B = col_map_B->owners(); + + // Send column indices (global indexing) and their owner ranks + std::vector 
send_col_data(send_entry_disp.back()); + std::vector recv_col_data(recv_entry_disp.back()); + std::vector send_col_owners(send_entry_disp.back()); + std::vector recv_col_owners(recv_entry_disp.back()); + + // Send row sizes so receivers can reconstruct the CSR row_ptr + std::vector send_row_size(recv_disp.back()); + std::vector recv_row_size(send_disp.back()); + + k = 0; + int rank = dolfinx::MPI::rank(col_map_A->comm()); + for (std::size_t p = 0; p < dest.size(); ++p) + { + for (std::int32_t i = recv_disp[p]; i < recv_disp[p + 1]; ++i) + { + int r = recv_row_locals[i]; + std::size_t row_size = row_ptr_B[r + 1] - row_ptr_B[r]; + col_map_B->local_to_global( + std::span(std::next(cols_B.begin(), row_ptr_B[r]), row_size), + std::span(std::next(send_col_data.begin(), k), row_size)); + for (std::size_t j = 0; j < row_size; ++j) + { + std::int32_t local_col = cols_B[row_ptr_B[r] + j]; + send_col_owners[k + j] + = (local_col < num_owned_cols_B) + ? rank + : ghost_owners_B[local_col - num_owned_cols_B]; + } + k += row_size; + send_row_size[i] = row_size; + } + } + + MPI_Neighbor_alltoallv(send_col_data.data(), send_entry_count.data(), + send_entry_disp.data(), MPI_INT64_T, + recv_col_data.data(), recv_entry_count.data(), + recv_entry_disp.data(), MPI_INT64_T, neigh_comm_rev); + MPI_Neighbor_alltoallv(send_col_owners.data(), send_entry_count.data(), + send_entry_disp.data(), MPI_INT, + recv_col_owners.data(), recv_entry_count.data(), + recv_entry_disp.data(), MPI_INT, neigh_comm_rev); + MPI_Neighbor_alltoallv(send_row_size.data(), recv_count.data(), + recv_disp.data(), MPI_INT, recv_row_size.data(), + send_count.data(), send_disp.data(), MPI_INT, + neigh_comm_rev); + + MPI_Comm_free(&neigh_comm_fwd); + MPI_Comm_free(&neigh_comm_rev); + + // Build CSR row pointer for the received ghost rows. + // recv_vals and recv_col_data are the values and (global) column indices; + // recv_row_size[i] is the number of entries in ghost row i. 
+ std::vector ghost_row_ptr(recv_row_size.size() + 1, 0); + std::partial_sum(recv_row_size.begin(), recv_row_size.end(), + std::next(ghost_row_ptr.begin())); + + // ghost_row_ptr, recv_vals, recv_col_data now form a CSR structure for + // the ghost rows of B needed by this rank. + + // Collect ghost column indices with their owners. + // Start from the received ghost rows, then add existing ghosts of col_map_B. + // Exclude anything in the locally owned range of col_map_B, then deduplicate. + auto [col_B_local_start, col_B_local_end] = col_map_B->local_range(); + auto is_local = [col_B_local_start, col_B_local_end](std::int64_t idx) + { return idx >= col_B_local_start && idx < col_B_local_end; }; + + // Merge received (global_col, owner) pairs with the existing ghosts. + std::vector> col_owner_pairs; + col_owner_pairs.reserve(recv_col_data.size() + col_map_B->ghosts().size()); + for (std::size_t i = 0; i < recv_col_data.size(); ++i) + if (!is_local(recv_col_data[i])) + col_owner_pairs.push_back({recv_col_data[i], recv_col_owners[i]}); + + std::span existing_ghosts = col_map_B->ghosts(); + std::span existing_ghost_owners = col_map_B->owners(); + for (std::size_t i = 0; i < existing_ghosts.size(); ++i) + col_owner_pairs.push_back({existing_ghosts[i], existing_ghost_owners[i]}); + + // Sort by global index and deduplicate. + std::sort(col_owner_pairs.begin(), col_owner_pairs.end()); + col_owner_pairs.erase( + std::unique(col_owner_pairs.begin(), col_owner_pairs.end()), + col_owner_pairs.end()); + + std::vector unique_cols(col_owner_pairs.size()); + std::vector unique_col_owners(col_owner_pairs.size()); + for (std::size_t i = 0; i < col_owner_pairs.size(); ++i) + { + unique_cols[i] = col_owner_pairs[i].first; + unique_col_owners[i] = col_owner_pairs[i].second; + } + + // Build the extended column IndexMap for B. 
+ auto new_col_map = std::make_shared( + comm, col_map_B->size_local(), unique_cols, unique_col_owners); + + // Convert received global column indices to local indices using new_col_map. + std::vector recv_col_local(recv_col_data.size()); + new_col_map->global_to_local(recv_col_data, recv_col_local); + + // The received data is in send-buffer order (grouped by src rank). + // Reorder it back to the original ghost column order of col_map_A so that + // sparsity_pattern and matmul can index by (j - num_local_rows_B). + const std::size_t num_ghosts = perm.size(); + std::vector orig_ghost_row_ptr(num_ghosts + 1, 0); + for (std::size_t i = 0; i < num_ghosts; ++i) + orig_ghost_row_ptr[i + 1] = recv_row_size[perm[i]]; + std::partial_sum(orig_ghost_row_ptr.begin(), orig_ghost_row_ptr.end(), + orig_ghost_row_ptr.begin()); + + std::vector orig_recv_col_local(recv_col_local.size()); + std::vector orig_recv_vals(recv_vals.size()); + for (std::size_t i = 0; i < num_ghosts; ++i) + { + std::int32_t src_start = ghost_row_ptr[perm[i]]; + std::int32_t src_end = ghost_row_ptr[perm[i] + 1]; + std::int32_t dst_start = orig_ghost_row_ptr[i]; + std::copy(recv_col_local.begin() + src_start, + recv_col_local.begin() + src_end, + orig_recv_col_local.begin() + dst_start); + std::copy(recv_vals.begin() + src_start, recv_vals.begin() + src_end, + orig_recv_vals.begin() + dst_start); + } + + return {new_col_map, orig_ghost_row_ptr, orig_recv_col_local, orig_recv_vals}; +} + +/// @brief Compute the sparsity pattern and values of C = A*B in a single pass. +/// +/// Uses a dense accumulator (one entry per possible output column) to +/// simultaneously detect nonzero columns and accumulate A[i,j]*B[j,k], +/// eliminating the separate value-fill loop and the per-entry lower_bound +/// search that a two-pass approach requires. +/// +/// @param A Left matrix. +/// @param B Right matrix (local rows only). +/// @param new_col_map Extended column IndexMap for C, from fetch_ghost_rows. 
+/// @param ghost_row_ptr CSR row pointer for the ghost rows of B. +/// @param ghost_cols Local column indices (w.r.t. new_col_map) for ghost +/// rows. +/// @param ghost_vals Values for the ghost rows of B. +/// @return (row_ptr, cols, vals) — the CSR structure and values of C. +template +std::tuple, std::vector, std::vector> +matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B, + std::shared_ptr new_col_map, + std::span ghost_row_ptr, + std::span ghost_cols, std::span ghost_vals) +{ + dolfinx::common::Timer tsp("{*}sparsity_and_mult"); + auto col_map_B = B.index_map(1); + const std::int32_t num_local_rows_A = A.index_map(0)->size_local(); + const std::int32_t num_local_rows_B = B.index_map(0)->size_local(); + const std::int32_t num_owned_cols_B = col_map_B->size_local(); + + auto row_ptr_A = A.row_ptr(); + auto offdiag_ptr_A = A.off_diag_offset(); + auto cols_A = A.cols(); + auto vals_A = A.values(); + auto row_ptr_B = B.row_ptr(); + auto cols_B = B.cols(); + auto vals_B = B.values(); + + // Remap B's ghost column indices into new_col_map's index space. + // Owned column indices (0..num_owned_cols_B-1) are identical in both maps. + std::span b_ghost_globals = col_map_B->ghosts(); + std::vector b_ghost_remap(b_ghost_globals.size()); + new_col_map->global_to_local(b_ghost_globals, b_ghost_remap); + + const std::int32_t num_cols_C + = new_col_map->size_local() + + static_cast(new_col_map->ghosts().size()); + + std::vector C_row_ptr = {0}; + C_row_ptr.reserve(num_local_rows_A + 1); + std::vector C_cols; + std::vector C_vals; + + // Dense accumulator: acc[k] holds the running sum for column k. + // in_row[k] marks whether column k was touched in the current row. + // Both are reset via row_cols at the end of each row (O(nnz/row), not + // O(num_cols_C)). 
+ std::vector acc(num_cols_C, T(0)); + std::vector in_row(num_cols_C, false); + std::vector row_cols; + + for (std::int32_t i = 0; i < num_local_rows_A; ++i) + { + row_cols.clear(); + + // Local columns of A → local rows of B. + for (std::int32_t ka = row_ptr_A[i]; ka < offdiag_ptr_A[i]; ++ka) + { + const std::int32_t j = cols_A[ka]; + const T a = vals_A[ka]; + for (std::int32_t kb = row_ptr_B[j]; kb < row_ptr_B[j + 1]; ++kb) + { + const std::int32_t c = cols_B[kb]; + const std::int32_t k + = c < num_owned_cols_B ? c : b_ghost_remap[c - num_owned_cols_B]; + if (!in_row[k]) + { + in_row[k] = true; + row_cols.push_back(k); + } + acc[k] += a * vals_B[kb]; + } + } + + // Ghost columns of A → ghost rows of B. + for (std::int32_t ka = offdiag_ptr_A[i]; ka < row_ptr_A[i + 1]; ++ka) + { + const std::int32_t j = cols_A[ka]; + const T a = vals_A[ka]; + const std::int32_t g = j - num_local_rows_B; + for (std::int32_t kb = ghost_row_ptr[g]; kb < ghost_row_ptr[g + 1]; ++kb) + { + const std::int32_t k = ghost_cols[kb]; + if (!in_row[k]) + { + in_row[k] = true; + row_cols.push_back(k); + } + acc[k] += a * ghost_vals[kb]; + } + } + + // Sort touched indices to satisfy the CSR sorted-column invariant + // required by off_diag_offset and MatrixCSR. + std::sort(row_cols.begin(), row_cols.end()); + + // Flush accumulator to output and reset both acc and in_row. + for (const std::int32_t c : row_cols) + { + C_cols.push_back(c); + C_vals.push_back(acc[c]); + acc[c] = T(0); + in_row[c] = false; + } + C_row_ptr.push_back(static_cast(C_cols.size())); + } + + return {std::move(C_row_ptr), std::move(C_cols), std::move(C_vals)}; +} + +} // namespace impl + +/// @brief Compute C = A*B as a distributed MatrixCSR. +/// +/// @param A Left matrix. +/// @param B Right matrix. +/// @return The product C = A*B as a MatrixCSR with row distribution +/// matching A and column distribution determined by B. 
+template +dolfinx::la::MatrixCSR matmul(const dolfinx::la::MatrixCSR& A, + const dolfinx::la::MatrixCSR& B) +{ + dolfinx::common::Timer t_spgemm("MatrixCSR SpGEMM"); + + // Fetch ghost rows of B needed to multiply against A's ghost columns. + spdlog::info("Fetch remote rows of B in C=A*B"); + auto [new_col_map, ghost_row_ptr, ghost_cols, ghost_vals] + = impl::fetch_ghost_rows(A, B); + + // Single pass: compute sparsity and values simultaneously. + spdlog::info("Compute sparsity and values of C=A*B"); + auto [C_row_ptr_32, C_cols_32, C_vals_vec] = impl::matmul( + A, B, new_col_map, std::span(ghost_row_ptr), + std::span(ghost_cols), + std::span(ghost_vals)); + + const std::int32_t num_local_rows = A.index_map(0)->size_local(); + const std::int32_t num_owned_cols = new_col_map->size_local(); + + // Per-row diagonal block boundary, needed by MatrixCSR constructor. + std::vector off_diag_offsets(num_local_rows); + for (std::int32_t i = 0; i < num_local_rows; ++i) + { + const auto begin = C_cols_32.cbegin() + C_row_ptr_32[i]; + const auto end = C_cols_32.cbegin() + C_row_ptr_32[i + 1]; + off_diag_offsets[i] = static_cast( + std::lower_bound(begin, end, num_owned_cols) - begin); + } + + auto C_row_map = std::make_shared(A.index_map(0)->comm(), + num_local_rows); + std::vector C_row_ptr_64(C_row_ptr_32.begin(), + C_row_ptr_32.end()); + + impl::Sparsity sp{C_row_map, new_col_map, C_cols_32, C_row_ptr_64, + off_diag_offsets}; + dolfinx::la::MatrixCSR C(sp); + std::copy(C_vals_vec.begin(), C_vals_vec.end(), C.values().begin()); + + return C; +} + +} // namespace dolfinx::la diff --git a/python/dolfinx/la/__init__.py b/python/dolfinx/la/__init__.py index 88054df14a..a4b8d44bd0 100644 --- a/python/dolfinx/la/__init__.py +++ b/python/dolfinx/la/__init__.py @@ -168,6 +168,17 @@ def mult(self, x: Vector, y: Vector, transpose: bool = False) -> None: else: self._cpp_object.mult(x._cpp_object, y._cpp_object) + def matmul(self, B): + """Compute ``C = A * B``, where `A` is this 
matrix. + + Args: + B: Input Matrix + C: Output Matrix + """ + if self.index_map(1).size_local != B.index_map(0).size_local: + raise RuntimeError("Invalid matrix sizes for matmult.") + return self._cpp_object.mult(B._cpp_object) + @property def block_size(self) -> list: """Block sizes for the matrix.""" diff --git a/python/dolfinx/wrappers/dolfinx_wrappers/la.h b/python/dolfinx/wrappers/dolfinx_wrappers/la.h index 4ac7a36730..6427a032f7 100644 --- a/python/dolfinx/wrappers/dolfinx_wrappers/la.h +++ b/python/dolfinx/wrappers/dolfinx_wrappers/la.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -145,6 +146,9 @@ void declare_la_objects(nanobind::module_& m, const std::string& type) }) .def("scatter_reverse", &dolfinx::la::MatrixCSR::scatter_rev) .def("mult", &dolfinx::la::MatrixCSR::mult) + .def("mult", [](const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR&B){ + return dolfinx::la::matmul(A, B); + }) .def("multT", &dolfinx::la::MatrixCSR::multT) .def("to_dense", [](const dolfinx::la::MatrixCSR& self) diff --git a/python/test/unit/la/test_matmul.py b/python/test/unit/la/test_matmul.py new file mode 100644 index 0000000000..9d8da204e7 --- /dev/null +++ b/python/test/unit/la/test_matmul.py @@ -0,0 +1,73 @@ +# +# This file is part of DOLFINx (https://www.fenicsproject.org) +# +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Unit tests for MatrixCSR.""" + +from mpi4py import MPI + +import numpy as np +import pytest +from scipy.sparse import csr_matrix + +from dolfinx import cpp as _cpp +from dolfinx import la +from dolfinx.mesh import create_unit_square + + +def mat_gather(A): + # Gather full matrix on all processes for scipy + nr = A.index_map(0).size_local + gatheredvals = np.concatenate(MPI.COMM_WORLD.allgather(A.data[: A.indptr[nr]])) + gatheredptrs = MPI.COMM_WORLD.allgather(A.indptr[: nr + 1]) + cols = A.index_map(1).local_to_global(A.indices[: A.indptr[nr]]) + gatheredcols = 
np.concatenate(MPI.COMM_WORLD.allgather(cols)) + indptr = gatheredptrs[0] + for i in range(1, len(gatheredptrs)): + indptr = np.concatenate((indptr, (gatheredptrs[i][1:] + indptr[-1]))) + return csr_matrix((gatheredvals, gatheredcols, indptr)) + + +@pytest.mark.parametrize( + "dtype", + [ + np.float32, + np.float64, + np.complex64, + np.complex128, + ], +) +def test_matmul(dtype): + mesh = create_unit_square(MPI.COMM_WORLD, 3, 3) + imap = mesh.topology.index_map(0) + sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) + rows = np.arange(0, imap.size_local) + cols = np.arange(0, imap.size_local + imap.num_ghosts) + sp.insert(rows, cols) + sp.finalize() + + # Identity + A = la.matrix_csr(sp, dtype=dtype) + B = la.matrix_csr(sp, dtype=dtype) + rng = np.random.default_rng(12345) + A.data[:] = rng.random(len(A.data)) + A.scatter_reverse() + B.data[:] = rng.random(len(B.data)) + B.scatter_reverse() + + Ascipy = mat_gather(A) + Bscipy = mat_gather(B) + lr0, lr1 = A.index_map(0).local_range + nr = A.index_map(0).size_local + # Check gathered matrix + assert np.allclose(A.to_dense()[:nr, :], Ascipy.todense()[lr0:lr1]) + lrB0, lrB1 = B.index_map(0).local_range + nrB = B.index_map(0).size_local + # Check gathered matrix + assert np.allclose(B.to_dense()[:nrB, :], Bscipy.todense()[lrB0:lrB1]) + + C = la.MatrixCSR(A.matmul(B)) + lrC0, lrC1 = C.index_map(0).local_range + nrC = C.index_map(0).size_local + Cscipy = Ascipy @ Bscipy + assert np.allclose(C.to_dense()[:nrC, :], Cscipy.todense()[lrC0:lrC1]) From 594c2c0374e3a8e9d0e00467de09d4f267a6e500 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 15:51:51 +0100 Subject: [PATCH 08/23] Update documentation and simplify fetch --- cpp/dolfinx/la/MatrixCSR.h | 13 ++++++++++++- cpp/dolfinx/la/matmul_impl.h | 26 +++++++------------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index a1389c2e87..cd38f3c20d 100644 --- 
a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -491,7 +491,18 @@ class MatrixCSR /// @param[in,out] y Vector to accumulate the result into. void mult(Vector& x, Vector& y) const; - void multT(Vector& x, Vector& y) const; + + /// @brief Compute the product `y += A^T x`. + /// + /// The vectors `x` and `y` must have parallel layouts that are + /// compatible with `A`. In detail, `x` must have the same `IndexMap` as + /// the matrix rows, `A.index_map(0)` and `y` must have the same owned + /// indices as the matrix columns in `A.index_map(1)`. Ghost entries will + /// be overwritten, and owned entries of `y` will be updated. + /// + /// @param[in] x Vector to apply `A^T` to. + /// @param[in,out] y Vector to accumulate the result into. + void multT(Vector& x, Vector& y) const; /// @brief Get MPI communicator that matrix is defined on. diff --git a/cpp/dolfinx/la/matmul_impl.h b/cpp/dolfinx/la/matmul_impl.h index 086843d293..d473d18a57 100644 --- a/cpp/dolfinx/la/matmul_impl.h +++ b/cpp/dolfinx/la/matmul_impl.h @@ -72,29 +72,13 @@ fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, auto col_map_A = A.index_map(1); // column IndexMap of A auto row_map_B = B.index_map(0); // row IndexMap of B - // Serial fast-path: with a single rank there are no ghost columns and - // no MPI communication to perform. We must NOT apply this check per-rank - // in a parallel run: some ranks may have no ghost columns while others do, - // and the neighbourhood collectives below must be reached by every rank. 
- { - int comm_size = 1; - MPI_Comm_size(col_map_A->comm(), &comm_size); - if (comm_size == 1) - { - auto col_map_B = B.index_map(1); - auto new_col_map = std::make_shared( - col_map_B->comm(), col_map_B->size_local(), col_map_B->ghosts(), - col_map_B->owners()); - return {new_col_map, std::vector{0}, - std::vector{}, std::vector{}}; - } - } - // Create neighborhood comms for col_map_A std::span src = col_map_A->src(); std::span dest = col_map_A->dest(); MPI_Comm comm = col_map_A->comm(); + // Serial fast-path: with a single rank there are no ghost columns and + // no MPI communication to perform. // No-neighbour parallel fast-path: A has no ghost columns (src empty) and // no rank ghosts our A-owned columns (dest empty), so no remote rows of B // are needed and no rank is waiting for us to participate in a collective. @@ -102,7 +86,9 @@ fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, // synchronisation (same argument as for transpose()). // Use col_map_B — not col_map_A — for new_col_map so that any existing // ghost columns in B are preserved in the product's column map. - if (src.empty() && dest.empty()) + + int comm_size = dolfinx::MPI::size(col_map_A->comm()); + if (comm_size == 1 or (src.empty() && dest.empty())) { auto col_map_B = B.index_map(1); auto new_col_map = std::make_shared( @@ -481,6 +467,8 @@ matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B, /// @param B Right matrix. /// @return The product C = A*B as a MatrixCSR with row distribution /// matching A and column distribution determined by B. +/// The row IndexMap of C has no ghosts, and the column IndexMap of C +/// will generally be a larger superset of the column IndexMap of B. 
template dolfinx::la::MatrixCSR matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B) From 0eca1511bbe54ec03e7c6e57c7a3ee6876055126 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 16:53:26 +0100 Subject: [PATCH 09/23] bug fix --- cpp/dolfinx/la/Vector.h | 29 +---------------------------- cpp/dolfinx/la/superlu_dist.cpp | 2 +- 2 files changed, 2 insertions(+), 29 deletions(-) diff --git a/cpp/dolfinx/la/Vector.h b/cpp/dolfinx/la/Vector.h index e29e6f38b1..afbdb10cd2 100644 --- a/cpp/dolfinx/la/Vector.h +++ b/cpp/dolfinx/la/Vector.h @@ -354,33 +354,6 @@ class Vector _x.begin()); } - /// @brief End scatter (send) of local data values - /// to the owning process and assign/accumulate into the owned data entries - /// (simplified CPU version). - /// - /// Suitable for scatter operations on a CPU with `std::vector` - /// storage. The receive buffer is unpacked internally by a function - /// that is suitable for use on a CPU. - /// - /// @note Collective MPI operation. - template - void scatter_rev_end(BinaryOperation op) - requires requires(Container c) { - { c.data() } -> std::same_as; - } - { - // Inline the kernel overload directly to avoid ambiguity: the lambda - // produced by get_unpack_op satisfies both VectorPackKernel and the - // unconstrained BinaryOperation parameter, making a delegating call to - // scatter_rev_end(get_unpack_op(op)) ambiguous. - auto unpack = get_unpack_op(op); - _scatterer->scatter_end(_request); - unpack(_scatterer->local_indices().begin(), - _scatterer->local_indices().end(), _buffer_local.begin(), - _x.begin()); - } - - /// @brief Scatter (send) of ghost data values to the owning process /// and assign/accumulate into the owned data entries (simplified CPU /// version). 
@@ -399,7 +372,7 @@ class Vector void scatter_rev(BinaryOperation op) { this->scatter_rev_begin(); - this->scatter_rev_end(op); + this->scatter_rev_end(get_unpack_op(op)); } /// Get IndexMap diff --git a/cpp/dolfinx/la/superlu_dist.cpp b/cpp/dolfinx/la/superlu_dist.cpp index eb72f13c3f..8cd3a88dd1 100644 --- a/cpp/dolfinx/la/superlu_dist.cpp +++ b/cpp/dolfinx/la/superlu_dist.cpp @@ -458,7 +458,7 @@ void SuperLUDistSolver::set_option(std::string name, std::string value) } else { - std::runtime_error("Unsupported option name"); + throw std::runtime_error("Unsupported option name"); } } //---------------------------------------------------------------------------- From 254d5616ee2fd2eaff49b1dfc5d7cd9a96549897 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 17:31:20 +0100 Subject: [PATCH 10/23] Update header --- cpp/dolfinx/la/matmul_impl.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/dolfinx/la/matmul_impl.h b/cpp/dolfinx/la/matmul_impl.h index d473d18a57..d7c5f586a8 100644 --- a/cpp/dolfinx/la/matmul_impl.h +++ b/cpp/dolfinx/la/matmul_impl.h @@ -1,5 +1,8 @@ // Copyright (C) 2026 Chris Richardson -// SPDX-License-Identifier: MIT +// +// This file is part of DOLFINx (https://www.fenicsproject.org) +// +// SPDX-License-Identifier: LGPL-3.0-or-later #pragma once From d3d760008f834bc5350f6c96d4570d72a5d39427 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 17:32:05 +0100 Subject: [PATCH 11/23] update copyright --- python/test/unit/la/test_matmul.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/test/unit/la/test_matmul.py b/python/test/unit/la/test_matmul.py index 9d8da204e7..97e748715f 100644 --- a/python/test/unit/la/test_matmul.py +++ b/python/test/unit/la/test_matmul.py @@ -1,3 +1,4 @@ +# Copyright (C) 2026 Chris Richardson # # This file is part of DOLFINx (https://www.fenicsproject.org) # From 463f9e6afd79c15c2cf101c900b729e962782301 Mon Sep 17 00:00:00 2001 From: Chris Richardson 
Date: Fri, 17 Apr 2026 17:35:58 +0100 Subject: [PATCH 12/23] Factor out mat_gather into conftest.py --- python/test/unit/la/conftest.py | 33 +++++++++++++++++++++++ python/test/unit/la/test_matmul.py | 16 +---------- python/test/unit/la/test_matrix_vector.py | 18 ++----------- 3 files changed, 36 insertions(+), 31 deletions(-) create mode 100644 python/test/unit/la/conftest.py diff --git a/python/test/unit/la/conftest.py b/python/test/unit/la/conftest.py new file mode 100644 index 0000000000..c431b44f6f --- /dev/null +++ b/python/test/unit/la/conftest.py @@ -0,0 +1,33 @@ +# Copyright (C) 2026 Chris Richardson +# +# This file is part of DOLFINx (https://www.fenicsproject.org) +# +# SPDX-License-Identifier: LGPL-3.0-or-later + +"""Shared pytest fixtures for dolfinx.la unit tests.""" + +from mpi4py import MPI + +import numpy as np +import pytest +from scipy.sparse import csr_matrix + + +@pytest.fixture +def mat_gather(): + """Return a function that gathers a distributed MatrixCSR onto all + processes as a scipy CSR matrix (in global column indexing). 
+ """ + + def _mat_gather(A): + nr = A.index_map(0).size_local + gatheredvals = np.concatenate(MPI.COMM_WORLD.allgather(A.data[: A.indptr[nr]])) + gatheredptrs = MPI.COMM_WORLD.allgather(A.indptr[: nr + 1]) + cols = A.index_map(1).local_to_global(A.indices[: A.indptr[nr]]) + gatheredcols = np.concatenate(MPI.COMM_WORLD.allgather(cols)) + indptr = gatheredptrs[0] + for i in range(1, len(gatheredptrs)): + indptr = np.concatenate((indptr, (gatheredptrs[i][1:] + indptr[-1]))) + return csr_matrix((gatheredvals, gatheredcols, indptr)) + + return _mat_gather diff --git a/python/test/unit/la/test_matmul.py b/python/test/unit/la/test_matmul.py index 97e748715f..a74073fd70 100644 --- a/python/test/unit/la/test_matmul.py +++ b/python/test/unit/la/test_matmul.py @@ -9,26 +9,12 @@ import numpy as np import pytest -from scipy.sparse import csr_matrix from dolfinx import cpp as _cpp from dolfinx import la from dolfinx.mesh import create_unit_square -def mat_gather(A): - # Gather full matrix on all processes for scipy - nr = A.index_map(0).size_local - gatheredvals = np.concatenate(MPI.COMM_WORLD.allgather(A.data[: A.indptr[nr]])) - gatheredptrs = MPI.COMM_WORLD.allgather(A.indptr[: nr + 1]) - cols = A.index_map(1).local_to_global(A.indices[: A.indptr[nr]]) - gatheredcols = np.concatenate(MPI.COMM_WORLD.allgather(cols)) - indptr = gatheredptrs[0] - for i in range(1, len(gatheredptrs)): - indptr = np.concatenate((indptr, (gatheredptrs[i][1:] + indptr[-1]))) - return csr_matrix((gatheredvals, gatheredcols, indptr)) - - @pytest.mark.parametrize( "dtype", [ @@ -38,7 +24,7 @@ def mat_gather(A): np.complex128, ], ) -def test_matmul(dtype): +def test_matmul(dtype, mat_gather): mesh = create_unit_square(MPI.COMM_WORLD, 3, 3) imap = mesh.topology.index_map(0) sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) diff --git a/python/test/unit/la/test_matrix_vector.py b/python/test/unit/la/test_matrix_vector.py index e064b6b04a..79ad1b3f0e 100644 --- 
a/python/test/unit/la/test_matrix_vector.py +++ b/python/test/unit/la/test_matrix_vector.py @@ -9,7 +9,6 @@ import numpy as np import pytest -from scipy.sparse import csr_matrix from dolfinx import cpp as _cpp from dolfinx import la @@ -17,19 +16,6 @@ from dolfinx.mesh import create_unit_square -def mat_gather(A): - # Gather full matrix on all processes for scipy - nr = A.index_map(0).size_local - gatheredvals = np.concatenate(MPI.COMM_WORLD.allgather(A.data[: A.indptr[nr]])) - gatheredptrs = MPI.COMM_WORLD.allgather(A.indptr[: nr + 1]) - cols = A.index_map(1).local_to_global(A.indices[: A.indptr[nr]]) - gatheredcols = np.concatenate(MPI.COMM_WORLD.allgather(cols)) - indptr = gatheredptrs[0] - for i in range(1, len(gatheredptrs)): - indptr = np.concatenate((indptr, (gatheredptrs[i][1:] + indptr[-1]))) - return csr_matrix((gatheredvals, gatheredcols, indptr)) - - def test_create_matrix_csr(): """Test creation of CSR matrix with specified types.""" mesh = create_unit_square(MPI.COMM_WORLD, 10, 11) @@ -67,7 +53,7 @@ def test_create_matrix_csr(): np.complex128, ], ) -def test_matvec(dtype): +def test_matvec(dtype, mat_gather): mesh = create_unit_square(MPI.COMM_WORLD, 3, 3) imap = mesh.topology.index_map(0) sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) @@ -109,7 +95,7 @@ def test_matvec(dtype): np.complex128, ], ) -def test_matvec_transpose(dtype): +def test_matvec_transpose(dtype, mat_gather): mesh = create_unit_square(MPI.COMM_WORLD, 3, 3) imap = mesh.topology.index_map(0) sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1]) From f905bb6bd4f5aca71f13f57e23d6bc4952284acd Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 17:39:49 +0100 Subject: [PATCH 13/23] docs --- cpp/dolfinx/la/matrix_csr_impl.h | 44 ++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h index 8af8c903fb..6190331279 100644 --- 
a/cpp/dolfinx/la/matrix_csr_impl.h +++ b/cpp/dolfinx/la/matrix_csr_impl.h @@ -260,17 +260,39 @@ void spmv(std::span values, std::span row_begin, } -/// @brief Sparse matrix-vector transpose product implementation. -/// @tparam T -/// @tparam BS1 -/// @param values -/// @param row_begin -/// @param row_end -/// @param indices -/// @param x -/// @param y -/// @param bs0 -/// @param bs1 +/// @brief Sparse matrix-vector transpose product implementation. +/// +/// Computes `y += A^T x` where A is given in CSR format. The transpose +/// is applied implicitly: for each nonzero A(i, indices[j]) the +/// contribution `A(i,j) * x[i]` is scattered into `y[indices[j]]`. +/// +/// @note `y` is accumulated into (not overwritten). Callers should +/// zero `y` before the first call if a fresh result is required. +/// +/// @note The value block layout is column-major within each block: for +/// block entry `j` the element at row-offset `k0` and column-offset `k1` +/// is stored at `values[j * bs1 * bs0 + k1 * bs0 + k0]`. +/// +/// @tparam T Scalar type of the matrix and vector entries. +/// @tparam BS1 Compile-time column block size. Pass `-1` to use the +/// runtime value `bs1` instead. +/// +/// @param[in] values Nonzero values of A, stored block-column-major. +/// Length: `nnz * bs0 * bs1`. +/// @param[in] row_begin Start positions in `values`/`indices` for each +/// row of A. Length: number of rows of A. +/// @param[in] row_end End positions in `values`/`indices` for each +/// row of A. Length: number of rows of A. +/// @param[in] indices Column indices of each nonzero block entry of A. +/// Length: `nnz`. +/// @param[in] x Input vector, indexed by the *rows* of A. +/// Length: `num_rows * bs0`. +/// @param[in,out] y Output vector, indexed by the *columns* of A, +/// accumulated into. +/// Length: `num_cols * bs1`. +/// @param[in] bs0 Row block size (runtime value). +/// @param[in] bs1 Column block size (runtime value, used when +/// `BS1 == -1`). 
template void spmvT(std::span values, std::span row_begin, std::span row_end, From 5bb2cb298601547191cd1d4448925f3a6b35628d Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 17:43:34 +0100 Subject: [PATCH 14/23] wrapping --- python/dolfinx/la/__init__.py | 2 +- python/test/unit/la/test_matmul.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dolfinx/la/__init__.py b/python/dolfinx/la/__init__.py index a4b8d44bd0..d6fcd57897 100644 --- a/python/dolfinx/la/__init__.py +++ b/python/dolfinx/la/__init__.py @@ -177,7 +177,7 @@ def matmul(self, B): """ if self.index_map(1).size_local != B.index_map(0).size_local: raise RuntimeError("Invalid matrix sizes for matmult.") - return self._cpp_object.mult(B._cpp_object) + return MatrixCSR(self._cpp_object.mult(B._cpp_object)) @property def block_size(self) -> list: diff --git a/python/test/unit/la/test_matmul.py b/python/test/unit/la/test_matmul.py index a74073fd70..e6a75058ea 100644 --- a/python/test/unit/la/test_matmul.py +++ b/python/test/unit/la/test_matmul.py @@ -53,7 +53,7 @@ def test_matmul(dtype, mat_gather): # Check gathered matrix assert np.allclose(B.to_dense()[:nrB, :], Bscipy.todense()[lrB0:lrB1]) - C = la.MatrixCSR(A.matmul(B)) + C = A.matmul(B) lrC0, lrC1 = C.index_map(0).local_range nrC = C.index_map(0).size_local Cscipy = Ascipy @ Bscipy From 82bc93af1e81a76ac5d9d6e689e4dde5903bd4ee Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 21:28:12 +0100 Subject: [PATCH 15/23] revert for this PR --- cpp/dolfinx/la/superlu_dist.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/dolfinx/la/superlu_dist.cpp b/cpp/dolfinx/la/superlu_dist.cpp index 8cd3a88dd1..eb72f13c3f 100644 --- a/cpp/dolfinx/la/superlu_dist.cpp +++ b/cpp/dolfinx/la/superlu_dist.cpp @@ -458,7 +458,7 @@ void SuperLUDistSolver::set_option(std::string name, std::string value) } else { - throw std::runtime_error("Unsupported option name"); + 
std::runtime_error("Unsupported option name"); } } //---------------------------------------------------------------------------- From 6f18d46212083cefde7f5da01bafef4633cab92a Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 21:37:47 +0100 Subject: [PATCH 16/23] clang-format --- cpp/dolfinx/la/MatrixCSR.h | 19 ++++++++++--------- cpp/dolfinx/la/matrix_csr_impl.h | 18 ++++++++---------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index f7e51685ab..cf54b04b93 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -903,13 +903,13 @@ void MatrixCSR::mult(la::Vector& x, /// @param y template void MatrixCSR::multT(la::Vector& x, - la::Vector& y) const + la::Vector& y) const { std::int32_t nrowslocal = num_owned_rows(); std::span Arow_ptr(row_ptr().data(), nrowslocal + 1); std::span Acols(cols().data(), Arow_ptr[nrowslocal]); std::span Aoff_diag_offset(off_diag_offset().data(), - nrowslocal); + nrowslocal); std::span Avalues(values().data(), Arow_ptr[nrowslocal]); std::span _x = x.array(); @@ -922,19 +922,20 @@ void MatrixCSR::multT(la::Vector& x, int ncolslocal = index_map(1)->size_local(); std::fill(std::next(_y.begin(), ncolslocal), _y.end(), 0.0); if (_bs[1] == 1) - impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], 1); + impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, + _bs[0], 1); else - impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, _bs[0], _bs[1]); + impl::spmvT(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y, + _bs[0], _bs[1]); y.scatter_rev(std::plus{}); if (_bs[1] == 1) - impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], 1); + impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, + _bs[0], 1); else - impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y, _bs[0], _bs[1]); - + impl::spmvT(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, + _y, 
_bs[0], _bs[1]); } - - } // namespace dolfinx::la diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h index 6190331279..2a6fa5bb1a 100644 --- a/cpp/dolfinx/la/matrix_csr_impl.h +++ b/cpp/dolfinx/la/matrix_csr_impl.h @@ -259,7 +259,6 @@ void spmv(std::span values, std::span row_begin, } } - /// @brief Sparse matrix-vector transpose product implementation. /// /// Computes `y += A^T x` where A is given in CSR format. The transpose @@ -295,9 +294,9 @@ void spmv(std::span values, std::span row_begin, /// `BS1 == -1`). template void spmvT(std::span values, std::span row_begin, - std::span row_end, - std::span indices, std::span x, - std::span y, int bs0, int bs1) + std::span row_end, + std::span indices, std::span x, + std::span y, int bs0, int bs1) { assert(row_begin.size() == row_end.size()); @@ -308,20 +307,20 @@ void spmvT(std::span values, std::span row_begin, const T xval = x[i * bs0 + k0]; for (std::int32_t j = row_begin[i]; j < row_end[i]; j++) { - if constexpr(BS1 == -1) + if constexpr (BS1 == -1) { for (int k1 = 0; k1 < bs1; ++k1) { - y[indices[j] * bs1 + k1] += values[j * bs1 * bs0 + k1 * bs0 + k0] - * xval; + y[indices[j] * bs1 + k1] + += values[j * bs1 * bs0 + k1 * bs0 + k0] * xval; } } else { for (int k1 = 0; k1 < BS1; ++k1) { - y[indices[j] * BS1 + k1] += values[j * BS1 * bs0 + k1 * bs0 + k0] - * xval; + y[indices[j] * BS1 + k1] + += values[j * BS1 * bs0 + k1 * bs0 + k0] * xval; } } } @@ -329,6 +328,5 @@ void spmvT(std::span values, std::span row_begin, } } - } // namespace impl } // namespace dolfinx::la From 17b342b64e02bd6d0b838b78cf53198824b5e738 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 21:39:17 +0100 Subject: [PATCH 17/23] And python wrappers --- python/dolfinx/wrappers/dolfinx_wrappers/la.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/dolfinx/wrappers/dolfinx_wrappers/la.h b/python/dolfinx/wrappers/dolfinx_wrappers/la.h index 6427a032f7..9085b2142d 100644 
--- a/python/dolfinx/wrappers/dolfinx_wrappers/la.h +++ b/python/dolfinx/wrappers/dolfinx_wrappers/la.h @@ -11,8 +11,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -146,9 +146,9 @@ void declare_la_objects(nanobind::module_& m, const std::string& type) }) .def("scatter_reverse", &dolfinx::la::MatrixCSR::scatter_rev) .def("mult", &dolfinx::la::MatrixCSR::mult) - .def("mult", [](const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR&B){ - return dolfinx::la::matmul(A, B); - }) + .def("mult", [](const dolfinx::la::MatrixCSR& A, + const dolfinx::la::MatrixCSR& B) + { return dolfinx::la::matmul(A, B); }) .def("multT", &dolfinx::la::MatrixCSR::multT) .def("to_dense", [](const dolfinx::la::MatrixCSR& self) From fcf3e3a90e89e04f2738d8a7091290c3109b1090 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 23:02:33 +0100 Subject: [PATCH 18/23] documentation --- cpp/dolfinx/la/CMakeLists.txt | 2 +- cpp/dolfinx/la/MatrixCSR.h | 41 ++++-- cpp/dolfinx/la/{matmul_impl.h => matmul.h} | 124 ++++++++++++------ python/dolfinx/wrappers/dolfinx_wrappers/la.h | 2 +- 4 files changed, 114 insertions(+), 55 deletions(-) rename cpp/dolfinx/la/{matmul_impl.h => matmul.h} (82%) diff --git a/cpp/dolfinx/la/CMakeLists.txt b/cpp/dolfinx/la/CMakeLists.txt index 48c97917f3..965e7cfe30 100644 --- a/cpp/dolfinx/la/CMakeLists.txt +++ b/cpp/dolfinx/la/CMakeLists.txt @@ -2,7 +2,7 @@ set(HEADERS_la ${CMAKE_CURRENT_SOURCE_DIR}/dolfinx_la.h ${CMAKE_CURRENT_SOURCE_DIR}/MatrixCSR.h ${CMAKE_CURRENT_SOURCE_DIR}/matrix_csr_impl.h - ${CMAKE_CURRENT_SOURCE_DIR}/matmul_impl.h + ${CMAKE_CURRENT_SOURCE_DIR}/matmul.h ${CMAKE_CURRENT_SOURCE_DIR}/SparsityPattern.h ${CMAKE_CURRENT_SOURCE_DIR}/superlu_dist.h ${CMAKE_CURRENT_SOURCE_DIR}/Vector.h diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index cf54b04b93..2f81cee27b 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -493,14 +493,36 @@ class MatrixCSR /// 
@brief Compute the product `y += A^T x`. /// - /// The vectors `x` and `y` must have parallel layouts that are - /// compatible with `A`. In detail, `x` must have the same `IndexMap` as - /// the matrix rows, `A.index_map(0)` and `y` must have the same owned - /// indices as the matrix columns in `A.index_map(1)`. Ghost entries will - /// be overwritten, and owned entries of `y` will be updated. + /// Performs the distributed sparse matrix–vector product with the + /// *transpose* of the matrix. The computation is split into two phases: /// - /// @param[in] x Vector to apply `A^T` to. - /// @param[in,out] y Vector to accumulate the result into. + /// 1. **Off-diagonal phase** — contributions from the off-diagonal block + /// (ghost columns of A, i.e. columns owned by remote ranks) are + /// accumulated into the ghost region of `y`. The ghost entries of + /// `y` are zeroed before this phase so that stale values do not + /// pollute the result. A reverse scatter (`scatter_rev`) then + /// reduces those ghost contributions back to the owning ranks. + /// + /// 2. **Diagonal phase** — contributions from the diagonal block + /// (locally owned columns of A) are accumulated into the owned + /// entries of `y`. + /// + /// **Layout requirements:** + /// - `x` must share the same `IndexMap` as the matrix *rows*, + /// `A.index_map(0)`. All ghost entries of `x` must be up-to-date + /// before calling (the implementation reads `x.array()` directly + /// without a forward scatter). + /// - `y` must share the same *owned* indices as the matrix *columns*, + /// `A.index_map(1)`. Only owned entries of `y` are meaningful after + /// the call; ghost entries are used as scratch and left in an + /// unspecified state. + /// + /// @note `y` is accumulated *into* (not overwritten) on the owned + /// entries. Zero `y` before the first call if a fresh result is + /// required. + /// + /// @param[in] x Vector to apply `A^T` to; must cover the row space of A. 
+ /// @param[in,out] y Vector accumulated into; covers the column space of A. void multT(Vector& x, Vector& y) const; /// @brief Get MPI communicator that matrix is defined on. @@ -898,9 +920,8 @@ void MatrixCSR::mult(la::Vector& x, } } -/// @brief Compute transpose y += A^T x -/// @param x -/// @param y +/// @brief Out-of-line implementation of MatrixCSR::multT. +/// @see MatrixCSR::multT for the full specification. template void MatrixCSR::multT(la::Vector& x, la::Vector& y) const diff --git a/cpp/dolfinx/la/matmul_impl.h b/cpp/dolfinx/la/matmul.h similarity index 82% rename from cpp/dolfinx/la/matmul_impl.h rename to cpp/dolfinx/la/matmul.h index d7c5f586a8..45a4f63299 100644 --- a/cpp/dolfinx/la/matmul_impl.h +++ b/cpp/dolfinx/la/matmul.h @@ -35,42 +35,78 @@ namespace dolfinx::la namespace impl { -// Lightweight struct satisfying SparsityImplementation for MatrixCSR -// constructor. +/// @brief Lightweight sparsity descriptor satisfying the +/// `SparsityImplementation` concept required by the `MatrixCSR` +/// constructor. +/// +/// Holds non-owning views into externally managed CSR arrays together with +/// the row and column `IndexMap`s that describe the parallel distribution. +/// It is intended as a short-lived helper: it is constructed from the +/// output of `impl::matmul` and passed to the +/// `MatrixCSR` constructor. +/// +/// @note All spans must remain valid for the lifetime of this object. struct Sparsity { + /// @brief Row `IndexMap` — describes the parallel distribution of rows. std::shared_ptr _row_map; + + /// @brief Column `IndexMap` — describes the parallel distribution of + /// columns, including any ghost columns needed for the + /// off-diagonal block. std::shared_ptr _col_map; + + /// @brief CSR column indices, length `nnz`. Values in + /// `[0, size_local(col) + num_ghosts(col))`, sorted within each row. std::span _cols; + + /// @brief CSR row pointers, length `num_rows + 1`. 
`_offsets[i]` is the + /// start of row `i` in `_cols`; `_offsets[num_rows]` equals `nnz`. std::span _offsets; + + /// @brief Per-row diagonal block size, length `num_rows`. + /// `_off_diag[i]` is the number of entries in row `i` whose column + /// index is strictly less than `_col_map->size_local()`, i.e. the + /// count of entries in the diagonal (owned-column) block. Entries + /// at positions `[_off_diag[i], row_end)` belong to the + /// off-diagonal block. std::span _off_diag; + /// @brief Return the row (`dim == 0`) or column (`dim == 1`) `IndexMap`. std::shared_ptr index_map(int dim) const { return dim == 0 ? _row_map : _col_map; } + /// @brief Return the block size. Always 1 — this struct does not yet support + /// block-structured sparsity patterns. int block_size(int) const { return 1; } + /// @brief Return the CSR graph as `(column_indices, row_pointers)`. std::pair, std::span> graph() const { return {_cols, _offsets}; } + /// @brief Return the per-row diagonal block sizes (see `_off_diag`). std::span off_diagonal_offsets() const { return _off_diag; } }; +/// @brief Fetch the rows of Matrix B which are referenced by the ghost +/// columns of Matrix A. +/// @param A MatrixCSR +/// @param B MatrixCSR +/// @returns Tuple containing [new index map, rowptr, cols, values] for the received rows template std::tuple, std::vector, std::vector, std::vector> fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B) { - dolfinx::common::Timer tsp1("{*}fetch"); // The ghost columns of A are global row indices into B. 
auto col_map_A = A.index_map(1); // column IndexMap of A auto row_map_B = B.index_map(0); // row IndexMap of B @@ -101,9 +137,14 @@ fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, std::vector{}, std::vector{}}; } - std::map src_rank_to_nbr; - for (std::size_t i = 0; i < src.size(); ++i) - src_rank_to_nbr.insert({src[i], i}); + // src is guaranteed sorted (Scatterer asserts this), so use binary search + // rather than a heap-allocated map. + auto rank_to_nbr = [&src](int r) -> int + { + auto it = std::lower_bound(src.begin(), src.end(), r); + assert(it != src.end() && *it == r); + return static_cast(std::distance(src.begin(), it)); + }; MPI_Comm neigh_comm_fwd; MPI_Comm neigh_comm_rev; MPI_Dist_graph_create_adjacent(comm, dest.size(), dest.data(), MPI_UNWEIGHTED, @@ -123,7 +164,7 @@ fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, std::vector recv_count(dest.size(), 0); for (int gh : ghost_owners) - ++send_count[src_rank_to_nbr[gh]]; + ++send_count[rank_to_nbr(gh)]; MPI_Neighbor_alltoall(send_count.data(), 1, MPI_INT, recv_count.data(), 1, MPI_INT, neigh_comm_fwd); @@ -141,7 +182,7 @@ fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, std::vector required_globals(required_globals_map.size()); for (std::size_t i = 0; i < ghost_owners.size(); ++i) { - int pos = src_rank_to_nbr[ghost_owners[i]]; + int pos = rank_to_nbr(ghost_owners[i]); perm[i] = send_disp[pos]; // position before increment required_globals[send_disp[pos]++] = required_globals_map[i]; } @@ -358,19 +399,22 @@ fetch_ghost_rows(const dolfinx::la::MatrixCSR& A, /// @param ghost_cols Local column indices (w.r.t. new_col_map) for ghost /// rows. /// @param ghost_vals Values for the ghost rows of B. -/// @return (row_ptr, cols, vals) — the CSR structure and values of C. +/// @return Tuple (row_ptr, off_diag_offsets, cols, vals). off_diag_offsets[i] +/// is the number of diagonal-block entries in row i, computed during +/// the same sort step that establishes column order. 
template -std::tuple, std::vector, std::vector> +std::tuple, std::vector, + std::vector, std::vector> matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B, std::shared_ptr new_col_map, std::span ghost_row_ptr, std::span ghost_cols, std::span ghost_vals) { - dolfinx::common::Timer tsp("{*}sparsity_and_mult"); auto col_map_B = B.index_map(1); const std::int32_t num_local_rows_A = A.index_map(0)->size_local(); const std::int32_t num_local_rows_B = B.index_map(0)->size_local(); const std::int32_t num_owned_cols_B = col_map_B->size_local(); + const std::int32_t num_owned_cols_C = new_col_map->size_local(); auto row_ptr_A = A.row_ptr(); auto offdiag_ptr_A = A.off_diag_offset(); @@ -390,10 +434,12 @@ matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B, = new_col_map->size_local() + static_cast(new_col_map->ghosts().size()); - std::vector C_row_ptr = {0}; - C_row_ptr.reserve(num_local_rows_A + 1); - std::vector C_cols; - std::vector C_vals; + std::vector row_ptr_C = {0}; + row_ptr_C.reserve(num_local_rows_A + 1); + std::vector off_diag_offsets_C; + off_diag_offsets_C.reserve(num_local_rows_A); + std::vector cols_C; + std::vector vals_C; // Dense accumulator: acc[k] holds the running sum for column k. // in_row[k] marks whether column k was touched in the current row. @@ -448,18 +494,25 @@ matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B, // required by off_diag_offset and MatrixCSR. std::sort(row_cols.begin(), row_cols.end()); + // Diagonal block boundary: count entries in the owned column range. + // row_cols is sorted, so a single lower_bound suffices. + off_diag_offsets_C.push_back(static_cast( + std::lower_bound(row_cols.begin(), row_cols.end(), num_owned_cols_C) + - row_cols.begin())); + // Flush accumulator to output and reset both acc and in_row. 
for (const std::int32_t c : row_cols) { - C_cols.push_back(c); - C_vals.push_back(acc[c]); + cols_C.push_back(c); + vals_C.push_back(acc[c]); acc[c] = T(0); in_row[c] = false; } - C_row_ptr.push_back(static_cast(C_cols.size())); + row_ptr_C.push_back(static_cast(cols_C.size())); } - return {std::move(C_row_ptr), std::move(C_cols), std::move(C_vals)}; + return {std::move(row_ptr_C), std::move(off_diag_offsets_C), + std::move(cols_C), std::move(vals_C)}; } } // namespace impl @@ -468,6 +521,7 @@ matmul(const dolfinx::la::MatrixCSR& A, const dolfinx::la::MatrixCSR& B, /// /// @param A Left matrix. /// @param B Right matrix. +/// @note Currently only supports block-size 1 matrices. /// @return The product C = A*B as a MatrixCSR with row distribution /// matching A and column distribution determined by B. /// The row IndexMap of C has no ghosts, and the column IndexMap of C @@ -479,37 +533,21 @@ dolfinx::la::MatrixCSR matmul(const dolfinx::la::MatrixCSR& A, dolfinx::common::Timer t_spgemm("MatrixCSR SpGEMM"); // Fetch ghost rows of B needed to multiply against A's ghost columns. - spdlog::info("Fetch remote rows of B in C=A*B"); + spdlog::debug("Fetch remote rows of B in C=A*B"); auto [new_col_map, ghost_row_ptr, ghost_cols, ghost_vals] = impl::fetch_ghost_rows(A, B); - // Single pass: compute sparsity and values simultaneously. - spdlog::info("Compute sparsity and values of C=A*B"); - auto [C_row_ptr_32, C_cols_32, C_vals_vec] = impl::matmul( + // Single pass: compute sparsity, off-diagonal boundaries, and values. + spdlog::debug("Compute sparsity and values of C=A*B"); + auto [C_row_ptr, C_off_diag_offsets, C_cols, C_vals_vec] = impl::matmul( A, B, new_col_map, std::span(ghost_row_ptr), std::span(ghost_cols), std::span(ghost_vals)); - const std::int32_t num_local_rows = A.index_map(0)->size_local(); - const std::int32_t num_owned_cols = new_col_map->size_local(); - - // Per-row diagonal block boundary, needed by MatrixCSR constructor. 
- std::vector off_diag_offsets(num_local_rows); - for (std::int32_t i = 0; i < num_local_rows; ++i) - { - const auto begin = C_cols_32.cbegin() + C_row_ptr_32[i]; - const auto end = C_cols_32.cbegin() + C_row_ptr_32[i + 1]; - off_diag_offsets[i] = static_cast( - std::lower_bound(begin, end, num_owned_cols) - begin); - } - - auto C_row_map = std::make_shared(A.index_map(0)->comm(), - num_local_rows); - std::vector C_row_ptr_64(C_row_ptr_32.begin(), - C_row_ptr_32.end()); - - impl::Sparsity sp{C_row_map, new_col_map, C_cols_32, C_row_ptr_64, - off_diag_offsets}; + auto C_row_map = std::make_shared( + A.index_map(0)->comm(), A.index_map(0)->size_local()); + impl::Sparsity sp{C_row_map, new_col_map, C_cols, C_row_ptr, + C_off_diag_offsets}; dolfinx::la::MatrixCSR C(sp); std::copy(C_vals_vec.begin(), C_vals_vec.end(), C.values().begin()); diff --git a/python/dolfinx/wrappers/dolfinx_wrappers/la.h b/python/dolfinx/wrappers/dolfinx_wrappers/la.h index 9085b2142d..fd670548c6 100644 --- a/python/dolfinx/wrappers/dolfinx_wrappers/la.h +++ b/python/dolfinx/wrappers/dolfinx_wrappers/la.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include From 8aead8505c1ee6bfd5685f555180b4af8f35b1d0 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Fri, 17 Apr 2026 23:03:36 +0100 Subject: [PATCH 19/23] clang-format --- cpp/dolfinx/la/matmul.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/dolfinx/la/matmul.h b/cpp/dolfinx/la/matmul.h index 45a4f63299..b5f929ff5a 100644 --- a/cpp/dolfinx/la/matmul.h +++ b/cpp/dolfinx/la/matmul.h @@ -100,7 +100,8 @@ struct Sparsity /// columns of Matrix A. 
/// @param A MatrixCSR /// @param B MatrixCSR -/// @returns Tuple containing [new index map, rowptr, cols, values] for the received rows +/// @returns Tuple containing [new index map, rowptr, cols, values] for the +/// received rows template std::tuple, std::vector, std::vector, std::vector> From 8e67100dccafd53cee9de244870d18ca534998a9 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Sat, 18 Apr 2026 06:42:29 +0100 Subject: [PATCH 20/23] not sure why superlu_dist docs are causing problem... --- cpp/dolfinx/la/superlu_dist.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/dolfinx/la/superlu_dist.h b/cpp/dolfinx/la/superlu_dist.h index a7736bcb0c..86aad636ae 100644 --- a/cpp/dolfinx/la/superlu_dist.h +++ b/cpp/dolfinx/la/superlu_dist.h @@ -42,6 +42,7 @@ class SuperLUDistStructs // the solver class to select the typed set based on T. namespace impl { +/// \cond template constexpr bool always_false_v = false; @@ -55,34 +56,29 @@ struct map template <> struct map { - /// \cond using ScalePermstruct_t = SuperLUDistStructs::dScalePermstruct_t; using LUstruct_t = SuperLUDistStructs::dLUstruct_t; using SOLVEstruct_t = SuperLUDistStructs::dSOLVEstruct_t; - /// \endcond }; /// Map float type to float 'typed' structs template <> struct map { - /// \cond using ScalePermstruct_t = SuperLUDistStructs::sScalePermstruct_t; using LUstruct_t = SuperLUDistStructs::sLUstruct_t; using SOLVEstruct_t = SuperLUDistStructs::sSOLVEstruct_t; - /// \endcond }; /// Map std::complex type to doublecomplex 'typed' structs template <> struct map> { - /// \cond using ScalePermstruct_t = SuperLUDistStructs::zScalePermstruct_t; using LUstruct_t = SuperLUDistStructs::zLUstruct_t; using SOLVEstruct_t = SuperLUDistStructs::zSOLVEstruct_t; - /// \endcond }; +/// \endcond } // namespace impl From ba1d0a3d9306b826e0f0c64d93d150cfbad35a86 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Sun, 19 Apr 2026 14:07:23 +0100 Subject: [PATCH 21/23] int type --- 
cpp/dolfinx/la/matrix_csr_impl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h index 2a6fa5bb1a..6fb1c97628 100644 --- a/cpp/dolfinx/la/matrix_csr_impl.h +++ b/cpp/dolfinx/la/matrix_csr_impl.h @@ -234,7 +234,7 @@ void spmv(std::span values, std::span row_begin, for (std::size_t i = 0; i < row_begin.size(); i++) { T vi{0}; - for (std::int32_t j = row_begin[i]; j < row_end[i]; j++) + for (std::int64_t j = row_begin[i]; j < row_end[i]; j++) { if constexpr (BS1 == -1) { @@ -305,7 +305,7 @@ void spmvT(std::span values, std::span row_begin, for (std::size_t i = 0; i < row_begin.size(); i++) { const T xval = x[i * bs0 + k0]; - for (std::int32_t j = row_begin[i]; j < row_end[i]; j++) + for (std::int64_t j = row_begin[i]; j < row_end[i]; j++) { if constexpr (BS1 == -1) { From 37f66efabeda9c58407dc375cc0214103f571e3d Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Mon, 20 Apr 2026 10:06:54 +0100 Subject: [PATCH 22/23] Add block size --- cpp/dolfinx/la/matmul.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/dolfinx/la/matmul.h b/cpp/dolfinx/la/matmul.h index b5f929ff5a..6521c7c16c 100644 --- a/cpp/dolfinx/la/matmul.h +++ b/cpp/dolfinx/la/matmul.h @@ -72,15 +72,17 @@ struct Sparsity /// off-diagonal block. std::span _off_diag; + /// @brief block size in each direction. + int _bs[2]; + /// @brief Return the row (`dim == 0`) or column (`dim == 1`) `IndexMap`. std::shared_ptr index_map(int dim) const { return dim == 0 ? _row_map : _col_map; } - /// @brief Return the block size. Always 1 — this struct does not yet support - /// block-structured sparsity patterns. - int block_size(int) const { return 1; } + /// @brief Return the block size. + int block_size(int i) const { return _bs[i]; } /// @brief Return the CSR graph as `(column_indices, row_pointers)`. 
std::pair, std::span> @@ -548,7 +550,7 @@ dolfinx::la::MatrixCSR matmul(const dolfinx::la::MatrixCSR& A, auto C_row_map = std::make_shared( A.index_map(0)->comm(), A.index_map(0)->size_local()); impl::Sparsity sp{C_row_map, new_col_map, C_cols, C_row_ptr, - C_off_diag_offsets}; + C_off_diag_offsets, {1, 1}}; dolfinx::la::MatrixCSR C(sp); std::copy(C_vals_vec.begin(), C_vals_vec.end(), C.values().begin()); From 703c6eb76d1a8ab260c9ce4e8a33903404ed1cc0 Mon Sep 17 00:00:00 2001 From: Chris Richardson Date: Tue, 21 Apr 2026 11:47:02 +0100 Subject: [PATCH 23/23] clang-format --- cpp/dolfinx/la/matmul.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/dolfinx/la/matmul.h b/cpp/dolfinx/la/matmul.h index 6521c7c16c..b220ccc0d5 100644 --- a/cpp/dolfinx/la/matmul.h +++ b/cpp/dolfinx/la/matmul.h @@ -549,8 +549,8 @@ dolfinx::la::MatrixCSR matmul(const dolfinx::la::MatrixCSR& A, auto C_row_map = std::make_shared( A.index_map(0)->comm(), A.index_map(0)->size_local()); - impl::Sparsity sp{C_row_map, new_col_map, C_cols, C_row_ptr, - C_off_diag_offsets, {1, 1}}; + impl::Sparsity sp{C_row_map, new_col_map, C_cols, + C_row_ptr, C_off_diag_offsets, {1, 1}}; dolfinx::la::MatrixCSR C(sp); std::copy(C_vals_vec.begin(), C_vals_vec.end(), C.values().begin());