Skip to content

Commit ad10f3e

Browse files
committed
more itemizations in assay-matrix ingestor
1 parent 6733c13 commit ad10f3e

File tree

1 file changed

+52
-3
lines changed

1 file changed

+52
-3
lines changed

apis/python/src/tiledbsc/assay_matrix.py

Lines changed: 52 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -53,12 +53,14 @@ def __init__(
5353
* For reading from an already-populated SOMA, we wish to avoid cache-coherency issues.
5454
"""
5555
super().__init__(uri=uri, name=name, parent=parent)
56+
s0 = self.timing_start("__init__", "total")
5657

5758
self.row_dim_name = row_dim_name
5859
self.col_dim_name = col_dim_name
5960
self.attr_name = "value"
6061
self.row_dataframe = row_dataframe
6162
self.col_dataframe = col_dataframe
63+
self.timing_end(s0)
6264

6365
# ----------------------------------------------------------------
6466
def shape(self) -> Tuple[int, int]:
@@ -69,12 +71,15 @@ def shape(self) -> Tuple[int, int]:
6971
7072
Note: currently implemented via data scan -- will be optimized for TileDB core 2.10.
7173
"""
74+
s1 = self.timing_start("shape", "total")
7275
with self._open():
7376
# These TileDB arrays are string-dimensioned sparse arrays so there is no '.shape'.
7477
# Instead we compute it ourselves. See also:
7578
num_rows = self.row_dataframe.shape()[0]
7679
num_cols = self.col_dataframe.shape()[0]
77-
return (num_rows, num_cols)
80+
retval = (num_rows, num_cols)
81+
self.timing_end(s1)
82+
return retval
7883

7984
# ----------------------------------------------------------------
8085
def dim_select(
@@ -89,7 +94,13 @@ def dim_select(
8994
Either or both of the ID lists may be `None`, meaning, do not subselect along
9095
that dimension. If both ID lists are `None`, the entire matrix is returned.
9196
"""
97+
s0 = self.timing_start("dim_select", "open")
98+
99+
s1 = self.timing_start("dim_select", "open")
92100
with tiledb.open(self.uri, ctx=self._ctx) as A:
101+
self.timing_end(s1)
102+
103+
s2 = self.timing_start("dim_select", "tiledb_query")
93104
query = A.query(return_arrow=return_arrow)
94105
if obs_ids is None:
95106
if var_ids is None:
@@ -101,8 +112,14 @@ def dim_select(
101112
df = query.df[obs_ids, :]
102113
else:
103114
df = query.df[obs_ids, var_ids]
115+
self.timing_end(s2)
116+
117+
s3 = self.timing_start("dim_select", "set_index")
104118
if not return_arrow:
105119
df.set_index([self.row_dim_name, self.col_dim_name], inplace=True)
120+
self.timing_end(s3)
121+
122+
self.timing_end(s0)
106123
return df
107124

108125
# ----------------------------------------------------------------
@@ -126,15 +143,21 @@ def csr(
126143
"""
127144
Like `.df()` but returns results in `scipy.sparse.csr_matrix` format.
128145
"""
129-
return self._csr_or_csc("csr", obs_ids, var_ids)
146+
s0 = self.timing_start("csr", "total")
147+
retval = self._csr_or_csc("csr", obs_ids, var_ids)
148+
self.timing_end(s0)
149+
return retval
130150

131151
def csc(
132152
self, obs_ids: Optional[Ids] = None, var_ids: Optional[Ids] = None
133153
) -> sp.csc_matrix:
134154
"""
135155
Like `.df()` but returns results in `scipy.sparse.csc_matrix` format.
136156
"""
137-
return self._csr_or_csc("csc", obs_ids, var_ids)
157+
s0 = self.timing_start("csc", "total")
158+
retval = self._csr_or_csc("csc", obs_ids, var_ids)
159+
self.timing_end(s0)
160+
return retval
138161

139162
def _csr_or_csc(
140163
self,
@@ -168,6 +191,7 @@ def from_matrix_and_dim_values(
168191
`scipy.sparse.csr_matrix`, `scipy.sparse.csc_matrix`, `numpy.ndarray`, etc.
169192
For ingest from `AnnData`, these should be `ann.obs_names` and `ann.var_names`.
170193
"""
194+
s0 = self.timing_start("from_matrix_and_dim_values", "total")
171195

172196
s = util.get_start_stamp()
173197
log_io(
@@ -206,11 +230,14 @@ def from_matrix_and_dim_values(
206230
util.format_elapsed(s, f"{self._indent}FINISH WRITING {self.uri}"),
207231
)
208232

233+
self.timing_end(s0)
234+
209235
# ----------------------------------------------------------------
210236
def _create_empty_array(self, matrix_dtype: np.dtype) -> None:
211237
"""
212238
Create a TileDB 2D sparse array with string dimensions and a single attribute.
213239
"""
240+
s0 = self.timing_start("_create_empty_array", "total")
214241

215242
level = self._soma_options.string_dim_zstd_level
216243
dom = tiledb.Domain(
@@ -253,6 +280,7 @@ def _create_empty_array(self, matrix_dtype: np.dtype) -> None:
253280
)
254281

255282
tiledb.Array.create(self.uri, sch, ctx=self._ctx)
283+
self.timing_end(s0)
256284

257285
# ----------------------------------------------------------------
258286
def ingest_data_whole(
@@ -269,6 +297,7 @@ def ingest_data_whole(
269297
:param row_names: List of row names.
270298
:param col_names: List of column names.
271299
"""
300+
s0 = self.timing_start("ingest_data_whole", "total")
272301

273302
assert len(row_names) == matrix.shape[0]
274303
assert len(col_names) == matrix.shape[1]
@@ -279,6 +308,7 @@ def ingest_data_whole(
279308

280309
with tiledb.open(self.uri, mode="w", ctx=self._ctx) as A:
281310
A[d0, d1] = mat_coo.data
311+
self.timing_end(s0)
282312

283313
# ----------------------------------------------------------------
284314
# Example: suppose this 4x3 is to be written in two chunks of two rows each
@@ -326,7 +356,9 @@ def ingest_data_rows_chunked(
326356
:param row_names: List of row names.
327357
:param col_names: List of column names.
328358
"""
359+
s0 = self.timing_start("ingest_data_rows_chunked", "total")
329360

361+
s1 = self.timing_start("ingest_data_rows_chunked", "sortprep")
330362
assert len(row_names) == matrix.shape[0]
331363
assert len(col_names) == matrix.shape[1]
332364

@@ -346,13 +378,20 @@ def ingest_data_rows_chunked(
346378
f"{self._indent}START ingest_data_rows_chunked",
347379
)
348380

381+
self.timing_end(s1)
382+
349383
eta_tracker = util.ETATracker()
384+
s2 = self.timing_start("ingest_data_rows_chunked", "open")
350385
with tiledb.open(self.uri, mode="w", ctx=self._ctx) as A:
386+
self.timing_end(s2)
387+
351388
nrow = len(sorted_row_names)
352389

353390
i = 0
354391
while i < nrow:
355392
t1 = time.time()
393+
394+
s3 = self.timing_start("ingest_data_rows_chunked", "chunkprep")
356395
# Find a number of CSR rows which will result in a desired nnz for the chunk.
357396
chunk_size = util._find_csr_chunk_size(
358397
matrix, permutation, i, self._soma_options.goal_chunk_nnz
@@ -365,6 +404,7 @@ def ingest_data_rows_chunked(
365404
# Write the chunk-COO to TileDB.
366405
d0 = sorted_row_names[chunk_coo.row + i]
367406
d1 = col_names[chunk_coo.col]
407+
self.timing_end(s3)
368408

369409
if len(d0) == 0:
370410
i = i2
@@ -390,7 +430,9 @@ def ingest_data_rows_chunked(
390430
)
391431

392432
# Write a TileDB fragment
433+
s4 = self.timing_start("ingest_data_rows_chunked", "tiledb-write")
393434
A[d0, d1] = chunk_coo.data
435+
self.timing_end(s4)
394436

395437
t2 = time.time()
396438
chunk_seconds = t2 - t1
@@ -413,6 +455,7 @@ def ingest_data_rows_chunked(
413455
f"{self._indent}FINISH __ingest_coo_data_string_dims_rows_chunked",
414456
),
415457
)
458+
self.timing_end(s0)
416459

417460
# This method is very similar to ingest_data_rows_chunked. The code is largely repeated,
418461
# and this is intentional. The algorithm here is non-trivial (among the most non-trivial
@@ -432,6 +475,7 @@ def ingest_data_cols_chunked(
432475
:param row_names: List of row names.
433476
:param col_names: List of column names.
434477
"""
478+
s0 = self.timing_start("ingest_data_cols_chunked", "total")
435479

436480
assert len(row_names) == matrix.shape[0]
437481
assert len(col_names) == matrix.shape[1]
@@ -519,6 +563,7 @@ def ingest_data_cols_chunked(
519563
f"{self._indent}FINISH __ingest_coo_data_string_dims_rows_chunked",
520564
),
521565
)
566+
self.timing_end(s0)
522567

523568
# This method is very similar to ingest_data_rows_chunked. The code is largely repeated,
524569
# and this is intentional. The algorithm here is non-trivial (among the most non-trivial
@@ -538,6 +583,7 @@ def ingest_data_dense_rows_chunked(
538583
:param row_names: List of row names.
539584
:param col_names: List of column names.
540585
"""
586+
s0 = self.timing_start("ingest_data_dense_rows_chunked", "total")
541587

542588
assert len(row_names) == matrix.shape[0]
543589
assert len(col_names) == matrix.shape[1]
@@ -627,6 +673,7 @@ def ingest_data_dense_rows_chunked(
627673
f"{self._indent}FINISH __ingest_coo_data_string_dims_dense_rows_chunked",
628674
),
629675
)
676+
self.timing_end(s0)
630677

631678
# ----------------------------------------------------------------
632679
def to_csr_matrix(self, row_labels: Labels, col_labels: Labels) -> sp.csr_matrix:
@@ -638,6 +685,7 @@ def to_csr_matrix(self, row_labels: Labels, col_labels: Labels) -> sp.csr_matrix
638685
be in the same order as they were in any anndata object which was used to create the
639686
TileDB storage.
640687
"""
688+
s0 = self.timing_start("to_csr_matrix", "total")
641689

642690
s = util.get_start_stamp()
643691
log_io(None, f"{self._indent}START read {self.uri}")
@@ -649,4 +697,5 @@ def to_csr_matrix(self, row_labels: Labels, col_labels: Labels) -> sp.csr_matrix
649697
util.format_elapsed(s, f"{self._indent}FINISH read {self.uri}"),
650698
)
651699

700+
self.timing_end(s0)
652701
return csr

0 commit comments

Comments
 (0)