Skip to content

Commit 25927cc

Browse files
committed
more itemizations in assay-matrix ingestor
1 parent 601039e commit 25927cc

File tree

1 file changed

+52
-3
lines changed

1 file changed

+52
-3
lines changed

apis/python/src/tiledbsoma/assay_matrix.py

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,14 @@ def __init__(
5353
* For reading from an already-populated SOMA, we wish to avoid cache-coherency issues.
5454
"""
5555
super().__init__(uri=uri, name=name, parent=parent)
56+
s0 = self.timing_start("__init__", "total")
5657

5758
self.row_dim_name = row_dim_name
5859
self.col_dim_name = col_dim_name
5960
self.attr_name = "value"
6061
self.row_dataframe = row_dataframe
6162
self.col_dataframe = col_dataframe
63+
self.timing_end(s0)
6264

6365
# ----------------------------------------------------------------
6466
def shape(self) -> Tuple[int, int]:
@@ -69,12 +71,15 @@ def shape(self) -> Tuple[int, int]:
6971
7072
Note: currently implemented via data scan --- will be optimized for TileDB core 2.10.
7173
"""
74+
s1 = self.timing_start("shape", "total")
7275
with self._open():
7376
# These TileDB arrays are string-dimensioned sparse arrays so there is no '.shape'.
7477
# Instead we compute it ourselves. See also:
7578
num_rows = self.row_dataframe.shape()[0]
7679
num_cols = self.col_dataframe.shape()[0]
77-
return (num_rows, num_cols)
80+
retval = (num_rows, num_cols)
81+
self.timing_end(s1)
82+
return retval
7883

7984
# ----------------------------------------------------------------
8085
def dim_select(
@@ -89,7 +94,13 @@ def dim_select(
8994
Either or both of the ID lists may be ``None``, meaning, do not subselect along
9095
that dimension. If both ID lists are ``None``, the entire matrix is returned.
9196
"""
97+
s0 = self.timing_start("dim_select", "total")
98+
99+
s1 = self.timing_start("dim_select", "open")
92100
with tiledb.open(self.uri, ctx=self._ctx) as A:
101+
self.timing_end(s1)
102+
103+
s2 = self.timing_start("dim_select", "tiledb_query")
93104
query = A.query(return_arrow=return_arrow)
94105
if obs_ids is None:
95106
if var_ids is None:
@@ -101,8 +112,14 @@ def dim_select(
101112
df = query.df[obs_ids, :]
102113
else:
103114
df = query.df[obs_ids, var_ids]
115+
self.timing_end(s2)
116+
117+
s3 = self.timing_start("dim_select", "set_index")
104118
if not return_arrow:
105119
df.set_index([self.row_dim_name, self.col_dim_name], inplace=True)
120+
self.timing_end(s3)
121+
122+
self.timing_end(s0)
106123
return df
107124

108125
# ----------------------------------------------------------------
@@ -126,15 +143,21 @@ def csr(
126143
"""
127144
Like ``.df()`` but returns results in ``scipy.sparse.csr_matrix`` format.
128145
"""
129-
return self._csr_or_csc("csr", obs_ids, var_ids)
146+
s0 = self.timing_start("csr", "total")
147+
retval = self._csr_or_csc("csr", obs_ids, var_ids)
148+
self.timing_end(s0)
149+
return retval
130150

131151
def csc(
132152
self, obs_ids: Optional[Ids] = None, var_ids: Optional[Ids] = None
133153
) -> sp.csc_matrix:
134154
"""
135155
Like ``.df()`` but returns results in ``scipy.sparse.csc_matrix`` format.
136156
"""
137-
return self._csr_or_csc("csc", obs_ids, var_ids)
157+
s0 = self.timing_start("csc", "total")
158+
retval = self._csr_or_csc("csc", obs_ids, var_ids)
159+
self.timing_end(s0)
160+
return retval
138161

139162
def _csr_or_csc(
140163
self,
@@ -168,6 +191,7 @@ def from_matrix_and_dim_values(
168191
``scipy.sparse.csr_matrix``, ``scipy.sparse.csc_matrix``, ``numpy.ndarray``, etc.
169192
For ingest from ``AnnData``, these should be ``ann.obs_names`` and ``ann.var_names``.
170193
"""
194+
s0 = self.timing_start("from_matrix_and_dim_values", "total")
171195

172196
s = util.get_start_stamp()
173197
log_io(
@@ -206,11 +230,14 @@ def from_matrix_and_dim_values(
206230
util.format_elapsed(s, f"{self._indent}FINISH WRITING {self.uri}"),
207231
)
208232

233+
self.timing_end(s0)
234+
209235
# ----------------------------------------------------------------
210236
def _create_empty_array(self, matrix_dtype: np.dtype) -> None:
211237
"""
212238
Create a TileDB 2D sparse array with string dimensions and a single attribute.
213239
"""
240+
s0 = self.timing_start("_create_empty_array", "total")
214241

215242
dom = tiledb.Domain(
216243
tiledb.Dim(
@@ -248,6 +275,7 @@ def _create_empty_array(self, matrix_dtype: np.dtype) -> None:
248275
)
249276

250277
tiledb.Array.create(self.uri, sch, ctx=self._ctx)
278+
self.timing_end(s0)
251279

252280
# ----------------------------------------------------------------
253281
def ingest_data_whole(
@@ -264,6 +292,7 @@ def ingest_data_whole(
264292
:param row_names: List of row names.
265293
:param col_names: List of column names.
266294
"""
295+
s0 = self.timing_start("ingest_data_whole", "total")
267296

268297
assert len(row_names) == matrix.shape[0]
269298
assert len(col_names) == matrix.shape[1]
@@ -274,6 +303,7 @@ def ingest_data_whole(
274303

275304
with tiledb.open(self.uri, mode="w", ctx=self._ctx) as A:
276305
A[d0, d1] = mat_coo.data
306+
self.timing_end(s0)
277307

278308
# ----------------------------------------------------------------
279309
# Example: suppose this 4x3 is to be written in two chunks of two rows each
@@ -321,7 +351,9 @@ def ingest_data_rows_chunked(
321351
:param row_names: List of row names.
322352
:param col_names: List of column names.
323353
"""
354+
s0 = self.timing_start("ingest_data_rows_chunked", "total")
324355

356+
s1 = self.timing_start("ingest_data_rows_chunked", "sortprep")
325357
assert len(row_names) == matrix.shape[0]
326358
assert len(col_names) == matrix.shape[1]
327359

@@ -341,13 +373,20 @@ def ingest_data_rows_chunked(
341373
f"{self._indent}START ingest_data_rows_chunked",
342374
)
343375

376+
self.timing_end(s1)
377+
344378
eta_tracker = util.ETATracker()
379+
s2 = self.timing_start("ingest_data_rows_chunked", "open")
345380
with tiledb.open(self.uri, mode="w", ctx=self._ctx) as A:
381+
self.timing_end(s2)
382+
346383
nrow = len(sorted_row_names)
347384

348385
i = 0
349386
while i < nrow:
350387
t1 = time.time()
388+
389+
s3 = self.timing_start("ingest_data_rows_chunked", "chunkprep")
351390
# Find a number of CSR rows which will result in a desired nnz for the chunk.
352391
chunk_size = util._find_csr_chunk_size(
353392
matrix, permutation, i, self._soma_options.goal_chunk_nnz
@@ -360,6 +399,7 @@ def ingest_data_rows_chunked(
360399
# Write the chunk-COO to TileDB.
361400
d0 = sorted_row_names[chunk_coo.row + i]
362401
d1 = col_names[chunk_coo.col]
402+
self.timing_end(s3)
363403

364404
if len(d0) == 0:
365405
i = i2
@@ -385,7 +425,9 @@ def ingest_data_rows_chunked(
385425
)
386426

387427
# Write a TileDB fragment
428+
s4 = self.timing_start("ingest_data_rows_chunked", "tiledb-write")
388429
A[d0, d1] = chunk_coo.data
430+
self.timing_end(s4)
389431

390432
t2 = time.time()
391433
chunk_seconds = t2 - t1
@@ -408,6 +450,7 @@ def ingest_data_rows_chunked(
408450
f"{self._indent}FINISH __ingest_coo_data_string_dims_rows_chunked",
409451
),
410452
)
453+
self.timing_end(s0)
411454

412455
# This method is very similar to ingest_data_rows_chunked. The code is largely repeated,
413456
# and this is intentional. The algorithm here is non-trivial (among the most non-trivial
@@ -427,6 +470,7 @@ def ingest_data_cols_chunked(
427470
:param row_names: List of row names.
428471
:param col_names: List of column names.
429472
"""
473+
s0 = self.timing_start("ingest_data_cols_chunked", "total")
430474

431475
assert len(row_names) == matrix.shape[0]
432476
assert len(col_names) == matrix.shape[1]
@@ -514,6 +558,7 @@ def ingest_data_cols_chunked(
514558
f"{self._indent}FINISH __ingest_coo_data_string_dims_rows_chunked",
515559
),
516560
)
561+
self.timing_end(s0)
517562

518563
# This method is very similar to ingest_data_rows_chunked. The code is largely repeated,
519564
# and this is intentional. The algorithm here is non-trivial (among the most non-trivial
@@ -533,6 +578,7 @@ def ingest_data_dense_rows_chunked(
533578
:param row_names: List of row names.
534579
:param col_names: List of column names.
535580
"""
581+
s0 = self.timing_start("ingest_data_dense_rows_chunked", "total")
536582

537583
assert len(row_names) == matrix.shape[0]
538584
assert len(col_names) == matrix.shape[1]
@@ -622,6 +668,7 @@ def ingest_data_dense_rows_chunked(
622668
f"{self._indent}FINISH __ingest_coo_data_string_dims_dense_rows_chunked",
623669
),
624670
)
671+
self.timing_end(s0)
625672

626673
# ----------------------------------------------------------------
627674
def to_csr_matrix(self, row_labels: Labels, col_labels: Labels) -> sp.csr_matrix:
@@ -633,6 +680,7 @@ def to_csr_matrix(self, row_labels: Labels, col_labels: Labels) -> sp.csr_matrix
633680
be in the same order as they were in any anndata object which was used to create the
634681
TileDB storage.
635682
"""
683+
s0 = self.timing_start("to_csr_matrix", "total")
636684

637685
s = util.get_start_stamp()
638686
log_io(None, f"{self._indent}START read {self.uri}")
@@ -644,4 +692,5 @@ def to_csr_matrix(self, row_labels: Labels, col_labels: Labels) -> sp.csr_matrix
644692
util.format_elapsed(s, f"{self._indent}FINISH read {self.uri}"),
645693
)
646694

695+
self.timing_end(s0)
647696
return csr

0 commit comments

Comments
 (0)