Skip to content

Commit 8193c21

Browse files
committed
remove util_arrow.concat_tables
1 parent a8f9e6d commit 8193c21

File tree

5 files changed

+7
-174
lines changed

5 files changed

+7
-174
lines changed

apis/python/src/tiledbsoma/soma_dataframe.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ def read_all(
245245
"""
246246
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
247247
"""
248-
return util_arrow.concat_tables(
248+
return pa.concat_tables(
249249
self.read(
250250
ids=ids,
251251
value_filter=value_filter,
@@ -271,13 +271,13 @@ def _get_is_sparse(self) -> bool:
271271

272272
return self._cached_is_sparse
273273

274-
def write(self, values: pa.Table) -> None:
274+
def write(self, values: pa.RecordBatch) -> None:
275275
"""
276-
Write an Arrow.Table to the persistent object.
276+
Write an Arrow.RecordBatch to the persistent object.
277277
278-
:param values: An Arrow.Table containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``.
278+
:param values: An Arrow.RecordBatch containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``.
279279
280-
The ``values`` Arrow Table must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written.
280+
The ``values`` Arrow RecordBatch must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written.
281281
"""
282282
self._shape = None # cache-invalidate
283283

apis/python/src/tiledbsoma/soma_dense_nd_array.py

Lines changed: 0 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -168,21 +168,6 @@ def read_tensor(
168168
)
169169

170170
def read_numpy(
171-
if row_ids is None:
172-
if col_ids is None:
173-
iterator = query.df[:, :]
174-
else:
175-
iterator = query.df[:, col_ids]
176-
else:
177-
if col_ids is None:
178-
iterator = query.df[row_ids, :]
179-
else:
180-
iterator = query.df[row_ids, col_ids]
181-
182-
for table in iterator:
183-
yield table
184-
185-
def read_as_pandas(
186171
self,
187172
coords: SOMADenseNdCoordinates,
188173
*,
@@ -193,77 +178,11 @@ def read_as_pandas(
193178
"""
194179
return cast(
195180
np.ndarray, self.read_tensor(coords, result_order=result_order).to_numpy()
196-
with self._tiledb_open() as A:
197-
query = A.query(return_incomplete=True)
198-
199-
if row_ids is None:
200-
if col_ids is None:
201-
iterator = query.df[:, :]
202-
else:
203-
iterator = query.df[:, col_ids]
204-
else:
205-
if col_ids is None:
206-
iterator = query.df[row_ids, :]
207-
else:
208-
iterator = query.df[row_ids, col_ids]
209-
210-
for df in iterator:
211-
# Make this opt-in only. For large arrays, this df.set_index is time-consuming
212-
# so we should not do it without direction.
213-
if set_index:
214-
df.set_index(self._tiledb_dim_names(), inplace=True)
215-
yield df
216-
217-
def read_all(
218-
self,
219-
*,
220-
# TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)``
221-
# ids: Optional[Union[Sequence[int], Slice]] = None,
222-
row_ids: Optional[Sequence[int]] = None,
223-
col_ids: Optional[Sequence[int]] = None,
224-
result_order: Optional[str] = None,
225-
# TODO: batch_size
226-
# TODO: partition,
227-
# TODO: batch_format,
228-
# TODO: platform_config,
229-
) -> pa.Table:
230-
"""
231-
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
232-
"""
233-
return util_arrow.concat_tables(
234-
self.read(
235-
row_ids=row_ids,
236-
col_ids=col_ids,
237-
result_order=result_order,
238-
)
239181
)
240182

241183
def write_tensor(
242184
self,
243185
coords: SOMADenseNdCoordinates,
244-
*,
245-
row_ids: Optional[Sequence[int]] = None,
246-
col_ids: Optional[Sequence[int]] = None,
247-
set_index: Optional[bool] = False,
248-
) -> pa.Table:
249-
"""
250-
This is a convenience method around ``read_as_pandas``. It iterates the return value from ``read_as_pandas`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
251-
"""
252-
dataframes = []
253-
generator = self.read_as_pandas(
254-
row_ids=row_ids,
255-
col_ids=col_ids,
256-
set_index=set_index,
257-
)
258-
for dataframe in generator:
259-
dataframes.append(dataframe)
260-
return pd.concat(dataframes)
261-
262-
def write(
263-
self,
264-
# TODO: rework callsites with regard to the very latest spec rev
265-
# coords: Union[tuple, tuple[slice], NTuple, List[int]],
266-
coords: Any,
267186
values: pa.Tensor,
268187
) -> None:
269188
"""

apis/python/src/tiledbsoma/soma_indexed_dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ def read_all(
281281
"""
282282
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
283283
"""
284-
return util_arrow.concat_tables(
284+
return pa.concat_tables(
285285
self.read(ids=ids, value_filter=value_filter, column_names=column_names)
286286
)
287287

apis/python/src/tiledbsoma/soma_sparse_nd_array.py

Lines changed: 0 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -158,17 +158,6 @@ def read_sparse_tensor(
158158
*,
159159
format: Literal["coo", "csr", "csc"] = "coo",
160160
) -> Iterator[Union[pa.SparseCOOTensor, pa.SparseCSCMatrix, pa.SparseCSRMatrix]]:
161-
# TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)``
162-
# row_ids: Optional[Union[Sequence[int], Slice]] = None,
163-
# col_ids: Optional[Union[Sequence[int], Slice]] = None,
164-
row_ids: Optional[Sequence[int]] = None,
165-
col_ids: Optional[Sequence[int]] = None,
166-
result_order: Optional[str] = None,
167-
# TODO: batch_size
168-
# TODO: partition,
169-
# TODO: batch_format,
170-
# TODO: platform_config,
171-
) -> Iterator[pa.Table]:
172161
"""
173162
Read a user-defined slice of the SparseNdArray and return as an Arrow sparse tensor.
174163
@@ -226,16 +215,6 @@ def read_sparse_tensor(
226215
yield pa.SparseCSCMatrix.from_scipy(scipy_coo.tocsc())
227216

228217
def read_table(self, coords: SOMASparseNdCoordinates) -> Iterator[pa.Table]:
229-
for table in iterator:
230-
yield table
231-
232-
def read_as_pandas(
233-
self,
234-
*,
235-
row_ids: Optional[Sequence[int]] = None,
236-
col_ids: Optional[Sequence[int]] = None,
237-
set_index: Optional[bool] = False,
238-
) -> pd.DataFrame:
239218
"""
240219
Read a user-defined slice of the sparse array and return in COO format
241220
as an Arrow Table
@@ -244,53 +223,6 @@ def read_as_pandas(
244223
query = A.query(
245224
return_arrow=True,
246225
return_incomplete=True,
247-
dim_names = None
248-
if set_index:
249-
dim_names = self._tiledb_dim_names()
250-
251-
with self._tiledb_open() as A:
252-
query = A.query(return_incomplete=True)
253-
254-
if row_ids is None:
255-
if col_ids is None:
256-
iterator = query.df[:, :]
257-
else:
258-
iterator = query.df[:, col_ids]
259-
else:
260-
if col_ids is None:
261-
iterator = query.df[row_ids, :]
262-
else:
263-
iterator = query.df[row_ids, col_ids]
264-
265-
for df in iterator:
266-
# Make this opt-in only. For large arrays, this df.set_index is time-consuming
267-
# so we should not do it without direction.
268-
if set_index:
269-
df.set_index(dim_names, inplace=True)
270-
yield df
271-
272-
def read_all(
273-
self,
274-
*,
275-
# TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)``
276-
# row_ids: Optional[Union[Sequence[int], Slice]] = None,
277-
# col_ids: Optional[Union[Sequence[int], Slice]] = None,
278-
row_ids: Optional[Sequence[int]] = None,
279-
col_ids: Optional[Sequence[int]] = None,
280-
result_order: Optional[str] = None,
281-
# TODO: batch_size
282-
# TODO: partition,
283-
# TODO: batch_format,
284-
# TODO: platform_config,
285-
) -> pa.Table:
286-
"""
287-
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
288-
"""
289-
return util_arrow.concat_tables(
290-
self.read(
291-
row_ids=row_ids,
292-
col_ids=col_ids,
293-
result_order=result_order,
294226
)
295227
for arrow_tbl in query.df[coords]:
296228
yield arrow_tbl
@@ -306,15 +238,8 @@ def read_as_pandas(self, coords: SOMASparseNdCoordinates) -> Iterator[pd.DataFra
306238
def read_as_pandas_all(
307239
self, coords: Optional[SOMASparseNdCoordinates] = None
308240
) -> pd.DataFrame:
309-
self,
310-
*,
311-
row_ids: Optional[Sequence[int]] = None,
312-
col_ids: Optional[Sequence[int]] = None,
313-
set_index: Optional[bool] = False,
314-
) -> pa.Table:
315241
"""
316242
Return the sparse array as a single Pandas DataFrame containing COO data.
317-
This is a convenience method around ``read_as_pandas``. It iterates the return value from ``read_as_pandas`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
318243
"""
319244
if coords is None:
320245
coords = (slice(None),) * self.ndims

apis/python/src/tiledbsoma/util_arrow.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Iterator, Optional, Union
1+
from typing import Optional, Union
22

33
import numpy as np
44
import pyarrow as pa
@@ -138,14 +138,3 @@ def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table:
138138
else:
139139
new_fields.append(old_field)
140140
return pa.Table.from_arrays(new_fields, names=names)
141-
142-
143-
def concat_tables(table_generator: Iterator[Any]) -> pa.Table:
144-
"""
145-
Iterates a generator of ``pyarrow.Table`` (e.g. ``SOMADataFrame.read``) and returns a concatenation of all the table-pieces found. The nominal use is to simply unit-test cases.
146-
"""
147-
tables = []
148-
for table in table_generator:
149-
tables.append(table)
150-
assert len(tables) > 0
151-
return pa.concat_tables(tables)

0 commit comments

Comments
 (0)