Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
4813082
Matrix multiplication does not change MatrixCSR
chrisrichardson Apr 16, 2026
0b0b958
Add transpose implementation
chrisrichardson Apr 16, 2026
7c06575
fix up test a bit
chrisrichardson Apr 16, 2026
3fdebef
Update tests
chrisrichardson Apr 17, 2026
1a541ad
Use non-blocking MPI for y update
chrisrichardson Apr 17, 2026
109843c
revert and update test
chrisrichardson Apr 17, 2026
f822b92
Add matmul implementation
chrisrichardson Apr 17, 2026
594c2c0
Update documentation and simplify fetch
chrisrichardson Apr 17, 2026
0eca151
bug fix
chrisrichardson Apr 17, 2026
3ad0e15
merge
chrisrichardson Apr 17, 2026
254d561
Update header
chrisrichardson Apr 17, 2026
d3d7600
update copyright
chrisrichardson Apr 17, 2026
463f9e6
Factor out mat_gather into conftest.py
chrisrichardson Apr 17, 2026
f905bb6
docs
chrisrichardson Apr 17, 2026
5bb2cb2
wrapping
chrisrichardson Apr 17, 2026
82bc93a
revert for this PR
chrisrichardson Apr 17, 2026
6f18d46
clang-format
chrisrichardson Apr 17, 2026
17b342b
And python wrappers
chrisrichardson Apr 17, 2026
fcf3e3a
documentation
chrisrichardson Apr 17, 2026
8aead85
clang-format
chrisrichardson Apr 17, 2026
8e67100
not sure why superlu_dist docs are causing problem...
chrisrichardson Apr 18, 2026
ba1d0a3
int type
chrisrichardson Apr 19, 2026
37f66ef
Add block size
chrisrichardson Apr 20, 2026
89f008f
Merge branch 'main' into chris/matvec-transpose
chrisrichardson Apr 20, 2026
703c6eb
clang-format
chrisrichardson Apr 21, 2026
8334d43
Merge branch 'main' into chris/matvec-transpose
chrisrichardson Apr 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/dolfinx/la/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ set(HEADERS_la
${CMAKE_CURRENT_SOURCE_DIR}/dolfinx_la.h
${CMAKE_CURRENT_SOURCE_DIR}/MatrixCSR.h
${CMAKE_CURRENT_SOURCE_DIR}/matrix_csr_impl.h
${CMAKE_CURRENT_SOURCE_DIR}/matmul.h
${CMAKE_CURRENT_SOURCE_DIR}/SparsityPattern.h
${CMAKE_CURRENT_SOURCE_DIR}/superlu_dist.h
${CMAKE_CURRENT_SOURCE_DIR}/Vector.h
Expand Down
73 changes: 73 additions & 0 deletions cpp/dolfinx/la/MatrixCSR.h
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,40 @@ class MatrixCSR
/// @param[in,out] y Vector to accumulate the result into.
void mult(Vector<value_type>& x, Vector<value_type>& y) const;

/// @brief Compute the product `y += A^T x`.
///
/// Performs the distributed sparse matrix–vector product with the
/// *transpose* of the matrix. The computation is split into two phases:
///
/// 1. **Off-diagonal phase** — contributions from the off-diagonal block
///    (ghost columns of A, i.e. columns owned by remote ranks) are
///    accumulated into the ghost region of `y`. The ghost entries of
///    `y` are zeroed before this phase so that stale values do not
///    pollute the result. A reverse scatter (`scatter_rev`) then
///    reduces those ghost contributions back to the owning ranks.
///
/// 2. **Diagonal phase** — contributions from the diagonal block
///    (locally owned columns of A) are accumulated into the owned
///    entries of `y`.
///
/// **Layout requirements:**
/// - `x` must share the same `IndexMap` as the matrix *rows*,
///   `A.index_map(0)`. All ghost entries of `x` must be up-to-date
///   before calling (the implementation reads `x.array()` directly
///   without a forward scatter).
/// - `y` must share the same *owned* indices as the matrix *columns*,
///   `A.index_map(1)`. Only owned entries of `y` are meaningful after
///   the call; ghost entries are used as scratch and left in an
///   unspecified state.
///
/// @note `y` is accumulated *into* (not overwritten) on the owned
/// entries. Zero `y` before the first call if a fresh result is
/// required.
///
/// @note(review) `x` is taken by non-const reference although the
/// implementation only reads `x.array()` — confirm whether it can be
/// `const` (matching would also require changing `mult`).
///
/// @param[in] x Vector to apply `A^T` to; must cover the row space of A.
/// @param[in,out] y Vector accumulated into; covers the column space of A.
void multT(Vector<value_type>& x, Vector<value_type>& y) const;

/// @brief Get MPI communicator that matrix is defined on.
/// @return The matrix's MPI communicator.
MPI_Comm comm() const
{
  return _comm.comm();
}

Expand Down Expand Up @@ -886,4 +920,43 @@ void MatrixCSR<Scalar, V, W, X>::mult(la::Vector<Scalar>& x,
}
}

/// @brief Out-of-line implementation of MatrixCSR::multT.
/// @see MatrixCSR::multT for the full specification.
template <typename Scalar, typename V, typename W, typename X>
void MatrixCSR<Scalar, V, W, X>::multT(la::Vector<Scalar>& x,
                                       la::Vector<Scalar>& y) const
{
  const std::int32_t nrowslocal = num_owned_rows();

  // Views over the owned part of the CSR structure.
  // NOTE(review): these spans assume the row-pointer/offset containers
  // hold std::int64_t — confirm against the W template parameter.
  std::span<const std::int64_t> Arow_ptr(row_ptr().data(), nrowslocal + 1);
  const std::int64_t nnzlocal = Arow_ptr[nrowslocal];
  std::span<const std::int32_t> Acols(cols().data(), nnzlocal);
  std::span<const std::int64_t> Aoff_diag_offset(off_diag_offset().data(),
                                                 nrowslocal);
  std::span<const Scalar> Avalues(values().data(), nnzlocal);

  std::span<const Scalar> _x = x.array();
  std::span<Scalar> _y = y.array();

  // Per-row [begin, end) bounds; for row i the diagonal-block entries
  // occupy [Arow_begin[i], Aoff_diag_offset[i]) and the off-diagonal
  // entries occupy [Aoff_diag_offset[i], Arow_end[i]).
  std::span<const std::int64_t> Arow_begin(Arow_ptr.data(), nrowslocal);
  std::span<const std::int64_t> Arow_end(Arow_ptr.data() + 1, nrowslocal);

  // Phase 1: off-diagonal block. Zero the ghost region of y so stale
  // values are not accumulated, compute the ghost-column contributions,
  // then reduce them back onto the owning ranks.
  // Use Scalar{0} (not the double literal 0.0) so the fill is correct
  // for any scalar type, including ones without a conversion from
  // double.
  const std::int32_t ncolslocal = index_map(1)->size_local();
  std::fill(std::next(_y.begin(), ncolslocal), _y.end(), Scalar{0});
  if (_bs[1] == 1)
    impl::spmvT<Scalar, 1>(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y,
                           _bs[0], 1);
  else
    impl::spmvT<Scalar, -1>(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y,
                            _bs[0], _bs[1]);

  y.scatter_rev(std::plus<Scalar>{});

  // Phase 2: diagonal block — accumulate contributions from locally
  // owned columns into the owned entries of y.
  if (_bs[1] == 1)
    impl::spmvT<Scalar, 1>(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y,
                           _bs[0], 1);
  else
    impl::spmvT<Scalar, -1>(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x,
                            _y, _bs[0], _bs[1]);
}

} // namespace dolfinx::la
Loading
Loading