Skip to content

Commit c8345a5

Browse files
committed
Conform to spec by reading as pyarrow.Table not pyarrow.RecordBatch
1 parent 3db3f38 commit c8345a5

File tree

9 files changed

+268
-159
lines changed

9 files changed

+268
-159
lines changed

apis/python/src/tiledbsoma/soma_dataframe.py

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -169,9 +169,9 @@ def read(
169169
# TODO: batch_size
170170
# TODO: partition,
171171
# TODO: platform_config,
172-
) -> Iterator[pa.RecordBatch]:
172+
) -> Iterator[pa.Table]:
173173
"""
174-
Read a user-defined subset of data, addressed by the dataframe indexing column, optionally filtered, and return results as one or more ``Arrow.RecordBatch``.
174+
Read a user-defined subset of data, addressed by the dataframe indexing column, optionally filtered, and return results as one or more ``Arrow.Table``.
175175
176176
:param ids: Which rows to read. Defaults to ``None``, meaning no constraint -- all rows.
177177
@@ -217,18 +217,16 @@ def read(
217217
else:
218218
iterator = query.df[ids]
219219

220-
for df in iterator:
221-
batches = df.to_batches()
222-
for batch in batches:
223-
# XXX COMMENT MORE
224-
# This is the 'decode on read' part of our logic; in dim_select we have the
225-
# 'encode on write' part.
226-
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
227-
#
228-
# Also: don't materialize these on read
229-
# TODO: get the arrow syntax for drop
230-
# df.drop(ROWID, axis=1)
231-
yield util_arrow.ascii_to_unicode_pyarrow_readback(batch)
220+
for table in iterator:
221+
# XXX COMMENT MORE
222+
# This is the 'decode on read' part of our logic; in dim_select we have the
223+
# 'encode on write' part.
224+
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
225+
#
226+
# Also: don't materialize these on read
227+
# TODO: get the arrow syntax for drop
228+
# df.drop(ROWID, axis=1)
229+
yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
232230

233231
def read_all(
234232
self,
@@ -243,11 +241,11 @@ def read_all(
243241
# TODO: partition,
244242
# TODO: result_order,
245243
# TODO: platform_config,
246-
) -> pa.RecordBatch:
244+
) -> pa.Table:
247245
"""
248-
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the record batches found. Its nominal use is to simply unit-test cases.
246+
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
249247
"""
250-
return util_arrow.concat_batches(
248+
return util_arrow.concat_tables(
251249
self.read(
252250
ids=ids,
253251
value_filter=value_filter,
@@ -273,13 +271,13 @@ def _get_is_sparse(self) -> bool:
273271

274272
return self._cached_is_sparse
275273

276-
def write(self, values: pa.RecordBatch) -> None:
274+
def write(self, values: pa.Table) -> None:
277275
"""
278-
Write an Arrow.RecordBatch to the persistent object.
276+
Write an Arrow.Table to the persistent object.
279277
280-
:param values: An Arrow.RecordBatch containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``.
278+
:param values: An Arrow.Table containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``.
281279
282-
The ``values`` Arrow RecordBatch must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written.
280+
The ``values`` Arrow Table must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written.
283281
"""
284282
self._shape = None # cache-invalidate
285283

apis/python/src/tiledbsoma/soma_dense_nd_array.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,21 @@ def read_tensor(
168168
)
169169

170170
def read_numpy(
171+
if row_ids is None:
172+
if col_ids is None:
173+
iterator = query.df[:, :]
174+
else:
175+
iterator = query.df[:, col_ids]
176+
else:
177+
if col_ids is None:
178+
iterator = query.df[row_ids, :]
179+
else:
180+
iterator = query.df[row_ids, col_ids]
181+
182+
for table in iterator:
183+
yield table
184+
185+
def read_as_pandas(
171186
self,
172187
coords: SOMADenseNdCoordinates,
173188
*,
@@ -178,11 +193,77 @@ def read_numpy(
178193
"""
179194
return cast(
180195
np.ndarray, self.read_tensor(coords, result_order=result_order).to_numpy()
196+
with self._tiledb_open() as A:
197+
query = A.query(return_incomplete=True)
198+
199+
if row_ids is None:
200+
if col_ids is None:
201+
iterator = query.df[:, :]
202+
else:
203+
iterator = query.df[:, col_ids]
204+
else:
205+
if col_ids is None:
206+
iterator = query.df[row_ids, :]
207+
else:
208+
iterator = query.df[row_ids, col_ids]
209+
210+
for df in iterator:
211+
# Make this opt-in only. For large arrays, this df.set_index is time-consuming
212+
# so we should not do it without direction.
213+
if set_index:
214+
df.set_index(self._tiledb_dim_names(), inplace=True)
215+
yield df
216+
217+
def read_all(
218+
self,
219+
*,
220+
# TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)``
221+
# ids: Optional[Union[Sequence[int], Slice]] = None,
222+
row_ids: Optional[Sequence[int]] = None,
223+
col_ids: Optional[Sequence[int]] = None,
224+
result_order: Optional[str] = None,
225+
# TODO: batch_size
226+
# TODO: partition,
227+
# TODO: batch_format,
228+
# TODO: platform_config,
229+
) -> pa.Table:
230+
"""
231+
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
232+
"""
233+
return util_arrow.concat_tables(
234+
self.read(
235+
row_ids=row_ids,
236+
col_ids=col_ids,
237+
result_order=result_order,
238+
)
181239
)
182240

183241
def write_tensor(
184242
self,
185243
coords: SOMADenseNdCoordinates,
244+
*,
245+
row_ids: Optional[Sequence[int]] = None,
246+
col_ids: Optional[Sequence[int]] = None,
247+
set_index: Optional[bool] = False,
248+
) -> pa.Table:
249+
"""
250+
This is a convenience method around ``read_as_pandas``. It iterates the return value from ``read_as_pandas`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
251+
"""
252+
dataframes = []
253+
generator = self.read_as_pandas(
254+
row_ids=row_ids,
255+
col_ids=col_ids,
256+
set_index=set_index,
257+
)
258+
for dataframe in generator:
259+
dataframes.append(dataframe)
260+
return pd.concat(dataframes)
261+
262+
def write(
263+
self,
264+
# TODO: rework callsites with regard to the very latest spec rev
265+
# coords: Union[tuple, tuple[slice], NTuple, List[int]],
266+
coords: Any,
186267
values: pa.Tensor,
187268
) -> None:
188269
"""

apis/python/src/tiledbsoma/soma_indexed_dataframe.py

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -209,9 +209,9 @@ def read(
209209
column_names: Optional[Sequence[str]] = None,
210210
result_order: Optional[SOMAResultOrder] = None,
211211
# TODO: more arguments
212-
) -> Iterator[pa.RecordBatch]:
212+
) -> Iterator[pa.Table]:
213213
"""
214-
Read a user-defined subset of data, addressed by the dataframe indexing columns, optionally filtered, and return results as one or more Arrow.RecordBatch.
214+
Read a user-defined subset of data, addressed by the dataframe indexing columns, optionally filtered, and return results as one or more Arrow.Table.
215215
216216
:param ids: for each index dimension, which rows to read. Defaults to ``None``, meaning no constraint -- all IDs.
217217
@@ -258,14 +258,12 @@ def read(
258258
else:
259259
iterator = query.df[ids]
260260

261-
for df in iterator:
262-
batches = df.to_batches()
263-
for batch in batches:
264-
# XXX COMMENT MORE
265-
# This is the 'decode on read' part of our logic; in dim_select we have the
266-
# 'encode on write' part.
267-
# Context: # https://github.com/single-cell-data/TileDB-SOMA/issues/99.
268-
yield util_arrow.ascii_to_unicode_pyarrow_readback(batch)
261+
for table in iterator:
262+
# XXX COMMENT MORE
263+
# This is the 'decode on read' part of our logic; in dim_select we have the
264+
# 'encode on write' part.
265+
# Context: # https://github.com/single-cell-data/TileDB-SOMA/issues/99.
266+
yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
269267

270268
def read_all(
271269
self,
@@ -279,19 +277,19 @@ def read_all(
279277
# TODO: batch_size
280278
# TODO: partition,
281279
# TODO: platform_config,
282-
) -> pa.RecordBatch:
280+
) -> pa.Table:
283281
"""
284-
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the record batches found. Its nominal use is to simply unit-test cases.
282+
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
285283
"""
286-
return util_arrow.concat_batches(
284+
return util_arrow.concat_tables(
287285
self.read(ids=ids, value_filter=value_filter, column_names=column_names)
288286
)
289287

290-
def write(self, values: pa.RecordBatch) -> None:
288+
def write(self, values: pa.Table) -> None:
291289
"""
292-
Write an Arrow.RecordBatch to the persistent object. As duplicate index values are not allowed, index values already present in the object are overwritten and new index values are added.
290+
Write an Arrow.Table to the persistent object. As duplicate index values are not allowed, index values already present in the object are overwritten and new index values are added.
293291
294-
:param values: An Arrow.RecordBatch containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMAIndexedDataFrame``.
292+
:param values: An Arrow.Table containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMAIndexedDataFrame``.
295293
"""
296294
self._shape = None # cache-invalidate
297295

apis/python/src/tiledbsoma/soma_sparse_nd_array.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,17 @@ def read_sparse_tensor(
158158
*,
159159
format: Literal["coo", "csr", "csc"] = "coo",
160160
) -> Iterator[Union[pa.SparseCOOTensor, pa.SparseCSCMatrix, pa.SparseCSRMatrix]]:
161+
# TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)``
162+
# row_ids: Optional[Union[Sequence[int], Slice]] = None,
163+
# col_ids: Optional[Union[Sequence[int], Slice]] = None,
164+
row_ids: Optional[Sequence[int]] = None,
165+
col_ids: Optional[Sequence[int]] = None,
166+
result_order: Optional[str] = None,
167+
# TODO: batch_size
168+
# TODO: partition,
169+
# TODO: batch_format,
170+
# TODO: platform_config,
171+
) -> Iterator[pa.Table]:
161172
"""
162173
Read a user-defined slice of the SparseNdArray and return as an Arrow sparse tensor.
163174
@@ -215,6 +226,16 @@ def read_sparse_tensor(
215226
yield pa.SparseCSCMatrix.from_scipy(scipy_coo.tocsc())
216227

217228
def read_table(self, coords: SOMASparseNdCoordinates) -> Iterator[pa.Table]:
229+
for table in iterator:
230+
yield table
231+
232+
def read_as_pandas(
233+
self,
234+
*,
235+
row_ids: Optional[Sequence[int]] = None,
236+
col_ids: Optional[Sequence[int]] = None,
237+
set_index: Optional[bool] = False,
238+
) -> pd.DataFrame:
218239
"""
219240
Read a user-defined slice of the sparse array and return in COO format
220241
as an Arrow Table
@@ -223,6 +244,53 @@ def read_table(self, coords: SOMASparseNdCoordinates) -> Iterator[pa.Table]:
223244
query = A.query(
224245
return_arrow=True,
225246
return_incomplete=True,
247+
dim_names = None
248+
if set_index:
249+
dim_names = self._tiledb_dim_names()
250+
251+
with self._tiledb_open() as A:
252+
query = A.query(return_incomplete=True)
253+
254+
if row_ids is None:
255+
if col_ids is None:
256+
iterator = query.df[:, :]
257+
else:
258+
iterator = query.df[:, col_ids]
259+
else:
260+
if col_ids is None:
261+
iterator = query.df[row_ids, :]
262+
else:
263+
iterator = query.df[row_ids, col_ids]
264+
265+
for df in iterator:
266+
# Make this opt-in only. For large arrays, this df.set_index is time-consuming
267+
# so we should not do it without direction.
268+
if set_index:
269+
df.set_index(dim_names, inplace=True)
270+
yield df
271+
272+
def read_all(
273+
self,
274+
*,
275+
# TODO: find the right syntax to get the typechecker to accept args like ``ids=slice(0,10)``
276+
# row_ids: Optional[Union[Sequence[int], Slice]] = None,
277+
# col_ids: Optional[Union[Sequence[int], Slice]] = None,
278+
row_ids: Optional[Sequence[int]] = None,
279+
col_ids: Optional[Sequence[int]] = None,
280+
result_order: Optional[str] = None,
281+
# TODO: batch_size
282+
# TODO: partition,
283+
# TODO: batch_format,
284+
# TODO: platform_config,
285+
) -> pa.Table:
286+
"""
287+
This is a convenience method around ``read``. It iterates the return value from ``read`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
288+
"""
289+
return util_arrow.concat_tables(
290+
self.read(
291+
row_ids=row_ids,
292+
col_ids=col_ids,
293+
result_order=result_order,
226294
)
227295
for arrow_tbl in query.df[coords]:
228296
yield arrow_tbl
@@ -238,8 +306,15 @@ def read_as_pandas(self, coords: SOMASparseNdCoordinates) -> Iterator[pd.DataFra
238306
def read_as_pandas_all(
239307
self, coords: Optional[SOMASparseNdCoordinates] = None
240308
) -> pd.DataFrame:
309+
self,
310+
*,
311+
row_ids: Optional[Sequence[int]] = None,
312+
col_ids: Optional[Sequence[int]] = None,
313+
set_index: Optional[bool] = False,
314+
) -> pa.Table:
241315
"""
242316
Return the sparse array as a single Pandas DataFrame containing COO data.
317+
This is a convenience method around ``read_as_pandas``. It iterates the return value from ``read_as_pandas`` and returns a concatenation of all the table-pieces found. Its nominal use is to simply unit-test cases.
243318
"""
244319
if coords is None:
245320
coords = (slice(None),) * self.ndims

apis/python/src/tiledbsoma/util_arrow.py

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -121,36 +121,31 @@ def get_arrow_schema_from_tiledb_uri(
121121
return pa.schema(arrow_schema_dict)
122122

123123

124-
def ascii_to_unicode_pyarrow_readback(record_batch: pa.RecordBatch) -> pa.RecordBatch:
124+
def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table:
125125
"""
126126
Implements the 'decode on read' part of our ASCII/Unicode logic
127127
"""
128128
# TODO: COMMENT/LINK HEAVILY
129-
names = [ofield.name for ofield in record_batch.schema]
129+
names = [ofield.name for ofield in table.schema]
130130
new_fields = []
131131
for name in names:
132-
old_field = record_batch[name]
133-
if isinstance(old_field, pa.LargeBinaryArray):
132+
old_field = table[name]
133+
if len(old_field) > 0 and isinstance(old_field[0], pa.LargeBinaryScalar):
134134
nfield = pa.array(
135135
[element.as_py().decode("utf-8") for element in old_field]
136136
)
137137
new_fields.append(nfield)
138138
else:
139139
new_fields.append(old_field)
140-
return pa.RecordBatch.from_arrays(new_fields, names=names)
140+
return pa.Table.from_arrays(new_fields, names=names)
141141

142142

143-
def concat_batches(batch_generator: Iterator[pa.RecordBatch]) -> pa.RecordBatch:
143+
def concat_tables(table_generator: Iterator[Any]) -> pa.Table:
144144
"""
145-
Iterates a generator of ``pyarrow.RecordBatch`` (e.g. ``SOMADataFrame.read``) and returns a concatenation of all the record batches found. The nominal use is to simply unit-test cases.
145+
Iterates a generator of ``pyarrow.Table`` (e.g. ``SOMADataFrame.read``) and returns a concatenation of all the table-pieces found. The nominal use is to simply unit-test cases.
146146
"""
147-
batches = []
148-
for batch in batch_generator:
149-
batches.append(batch)
150-
assert len(batches) > 0
151-
names = [field.name for field in batches[0].schema]
152-
arrays = []
153-
for name in names:
154-
array = pa.concat_arrays([batch[name] for batch in batches])
155-
arrays.append(array)
156-
return pa.RecordBatch.from_arrays(arrays, names=names)
147+
tables = []
148+
for table in table_generator:
149+
tables.append(table)
150+
assert len(tables) > 0
151+
return pa.concat_tables(tables)

apis/python/tests/test_soma_collection.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def create_and_populate_dataframe(dataframe: soma.SOMADataFrame) -> None:
2525
pydict["foo"] = [10, 20, 30, 40, 50]
2626
pydict["bar"] = [4.1, 5.2, 6.3, 7.4, 8.5]
2727
pydict["baz"] = ["apple", "ball", "cat", "dog", "egg"]
28-
rb = pa.RecordBatch.from_pydict(pydict)
28+
rb = pa.Table.from_pydict(pydict)
2929
dataframe.write(rb)
3030

3131

0 commit comments

Comments
 (0)