Skip to content

Commit 8193c21

Browse files
committed
remove util_arrow.concat_tables
1 parent a8f9e6d commit 8193c21

File tree

5 files changed

+7
-174
lines changed

5 files changed

+7
-174
lines changed

apis/python/src/tiledbsoma/soma_dataframe.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ def read_all(
245245
"""
246246
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
247247
"""
248-
return util_arrow.concat_tables(
248+
return pa.concat_tables(
249249
self.read(
250250
ids=ids,
251251
value_filter=value_filter,
@@ -271,13 +271,13 @@ def _get_is_sparse(self) -> bool:
271271

272272
return self._cached_is_sparse
273273

274-
def write(self, values: pa.Table) -> None:
274+
def write(self, values: pa.RecordBatch) -> None:
275275
"""
276-
Write an Arrow.Table to the persistent object.
276+
Write an Arrow.RecordBatch to the persistent object.
277277
278-
:param values: An Arrow.Table containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``.
278+
:param values: An Arrow.RecordBatch containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``.
279279
280-
The ``values`` Arrow Table must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written.
280+
The ``values`` Arrow RecordBatch must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written.
281281
"""
282282
self._shape = None # cache-invalidate
283283

apis/python/src/tiledbsoma/soma_dense_nd_array.py

Lines changed: 0 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -168,21 +168,6 @@ def read_tensor(
168168
)
169169

170170
def read_numpy(
171-
if row_ids is None:
172-
if col_ids is None:
173-
iterator = query.df[:, :]
174-
else:
175-
iterator = query.df[:, col_ids]
176-
else:
177-
if col_ids is None:
178-
iterator = query.df[row_ids, :]
179-
else:
180-
iterator = query.df[row_ids, col_ids]
181-
182-
for table in iterator:
183-
yield table
184-
185-
def read_as_pandas(
186171
self,
187172
coords: SOMADenseNdCoordinates,
188173
*,
@@ -193,77 +178,11 @@ def read_as_pandas(
193178
"""
194179
return cast(
195180
np.ndarray, self.read_tensor(coords, result_order=result_order).to_numpy()
196-
with self._tiledb_open() as A:
197-
query = A.query(return_incomplete=True)
198-
199-
if row_ids is None:
200-
if col_ids is None:
201-
iterator = query.df[:, :]
202-
else:
203-
iterator = query.df[:, col_ids]
204-
else:
205-
if col_ids is None:
206-
iterator = query.df[row_ids, :]
207-
else:
208-
iterator = query.df[row_ids, col_ids]
209-
210-
for df in iterator:
211-
# Make this opt-in only. For large arrays, this df.set_index is time-consuming
212-
# so we should not do it without direction.
213-
if set_index:
214-
df.set_index(self._tiledb_dim_names(), inplace=True)
215-
yield df
216-
217-
def read_all(
218-
self,
219-
*,
220-
# TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)``
221-
# ids: Optional[Union[Sequence[int], Slice]] = None,
222-
row_ids: Optional[Sequence[int]] = None,
223-
col_ids: Optional[Sequence[int]] = None,
224-
result_order: Optional[str] = None,
225-
# TODO: batch_size
226-
# TODO: partition,
227-
# TODO: batch_format,
228-
# TODO: platform_config,
229-
) -> pa.Table:
230-
"""
231-
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
232-
"""
233-
return util_arrow.concat_tables(
234-
self.read(
235-
row_ids=row_ids,
236-
col_ids=col_ids,
237-
result_order=result_order,
238-
)
239181
)
240182

241183
def write_tensor(
242184
self,
243185
coords: SOMADenseNdCoordinates,
244-
*,
245-
row_ids: Optional[Sequence[int]] = None,
246-
col_ids: Optional[Sequence[int]] = None,
247-
set_index: Optional[bool] = False,
248-
) -> pa.Table:
249-
"""
250-
This is a convenience method around ``read_as_pandas``. It iterates the return value from ``read_as_pandas`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
251-
"""
252-
dataframes = []
253-
generator = self.read_as_pandas(
254-
row_ids=row_ids,
255-
col_ids=col_ids,
256-
set_index=set_index,
257-
)
258-
for dataframe in generator:
259-
dataframes.append(dataframe)
260-
return pd.concat(dataframes)
261-
262-
def write(
263-
self,
264-
# TODO: rework callsites with regard to the very latest spec rev
265-
# coords: Union[tuple, tuple[slice], NTuple, List[int]],
266-
coords: Any,
267186
values: pa.Tensor,
268187
) -> None:
269188
"""

apis/python/src/tiledbsoma/soma_indexed_dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ def read_all(
281281
"""
282282
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
283283
"""
284-
return util_arrow.concat_tables(
284+
return pa.concat_tables(
285285
self.read(ids=ids, value_filter=value_filter, column_names=column_names)
286286
)
287287

apis/python/src/tiledbsoma/soma_sparse_nd_array.py

Lines changed: 0 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -158,17 +158,6 @@ def read_sparse_tensor(
158158
*,
159159
format: Literal["coo", "csr", "csc"] = "coo",
160160
) -> Iterator[Union[pa.SparseCOOTensor, pa.SparseCSCMatrix, pa.SparseCSRMatrix]]:
161-
# TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)``
162-
# row_ids: Optional[Union[Sequence[int], Slice]] = None,
163-
# col_ids: Optional[Union[Sequence[int], Slice]] = None,
164-
row_ids: Optional[Sequence[int]] = None,
165-
col_ids: Optional[Sequence[int]] = None,
166-
result_order: Optional[str] = None,
167-
# TODO: batch_size
168-
# TODO: partition,
169-
# TODO: batch_format,
170-
# TODO: platform_config,
171-
) -> Iterator[pa.Table]:
172161
"""
173162
Read a user-defined slice of the SparseNdArray and return as an Arrow sparse tensor.
174163
@@ -226,16 +215,6 @@ def read_sparse_tensor(
226215
yield pa.SparseCSCMatrix.from_scipy(scipy_coo.tocsc())
227216

228217
def read_table(self, coords: SOMASparseNdCoordinates) -> Iterator[pa.Table]:
229-
for table in iterator:
230-
yield table
231-
232-
def read_as_pandas(
233-
self,
234-
*,
235-
row_ids: Optional[Sequence[int]] = None,
236-
col_ids: Optional[Sequence[int]] = None,
237-
set_index: Optional[bool] = False,
238-
) -> pd.DataFrame:
239218
"""
240219
Read a user-defined slice of the sparse array and return in COO format
241220
as an Arrow Table
@@ -244,53 +223,6 @@ def read_as_pandas(
244223
query = A.query(
245224
return_arrow=True,
246225
return_incomplete=True,
247-
dim_names = None
248-
if set_index:
249-
dim_names = self._tiledb_dim_names()
250-
251-
with self._tiledb_open() as A:
252-
query = A.query(return_incomplete=True)
253-
254-
if row_ids is None:
255-
if col_ids is None:
256-
iterator = query.df[:, :]
257-
else:
258-
iterator = query.df[:, col_ids]
259-
else:
260-
if col_ids is None:
261-
iterator = query.df[row_ids, :]
262-
else:
263-
iterator = query.df[row_ids, col_ids]
264-
265-
for df in iterator:
266-
# Make this opt-in only. For large arrays, this df.set_index is time-consuming
267-
# so we should not do it without direction.
268-
if set_index:
269-
df.set_index(dim_names, inplace=True)
270-
yield df
271-
272-
def read_all(
273-
self,
274-
*,
275-
# TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)``
276-
# row_ids: Optional[Union[Sequence[int], Slice]] = None,
277-
# col_ids: Optional[Union[Sequence[int], Slice]] = None,
278-
row_ids: Optional[Sequence[int]] = None,
279-
col_ids: Optional[Sequence[int]] = None,
280-
result_order: Optional[str] = None,
281-
# TODO: batch_size
282-
# TODO: partition,
283-
# TODO: batch_format,
284-
# TODO: platform_config,
285-
) -> pa.Table:
286-
"""
287-
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
288-
"""
289-
return util_arrow.concat_tables(
290-
self.read(
291-
row_ids=row_ids,
292-
col_ids=col_ids,
293-
result_order=result_order,
294226
)
295227
for arrow_tbl in query.df[coords]:
296228
yield arrow_tbl
@@ -306,15 +238,8 @@ def read_as_pandas(self, coords: SOMASparseNdCoordinates) -> Iterator[pd.DataFra
306238
def read_as_pandas_all(
307239
self, coords: Optional[SOMASparseNdCoordinates] = None
308240
) -> pd.DataFrame:
309-
self,
310-
*,
311-
row_ids: Optional[Sequence[int]] = None,
312-
col_ids: Optional[Sequence[int]] = None,
313-
set_index: Optional[bool] = False,
314-
) -> pa.Table:
315241
"""
316242
Return the sparse array as a single Pandas DataFrame containing COO data.
317-
This is a convenience method around ``read_as_pandas``. It iterates the return value from ``read_as_pandas`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
318243
"""
319244
if coords is None:
320245
coords = (slice(None),) * self.ndims

apis/python/src/tiledbsoma/util_arrow.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Iterator, Optional, Union
1+
from typing import Optional, Union
22

33
import numpy as np
44
import pyarrow as pa
@@ -138,14 +138,3 @@ def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table:
138138
else:
139139
new_fields.append(old_field)
140140
return pa.Table.from_arrays(new_fields, names=names)
141-
142-
143-
def concat_tables(table_generator: Iterator[Any]) -> pa.Table:
144-
"""
145-
Iterates a generator of ``pyarrow.Table`` (e.g. ``SOMADataFrame.read``) and returns a concatenation of all the table-pieces found. The nominal use is to simply unit-test cases.
146-
"""
147-
tables = []
148-
for table in table_generator:
149-
tables.append(table)
150-
assert len(tables) > 0
151-
return pa.concat_tables(tables)

0 commit comments

Comments
 (0)