Use true ASCII attributes in dataframes

johnkerl · johnkerl · commit d236a9bd7cf3 · 2022-10-04T11:24:24.000-04:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -20,7 +20,6 @@ jobs:
           - runs-on: ubuntu-22.04
             cc: gcc-11
             cxx: g++-11
-          # Pending https://github.com/actions/runner-images/issues/6350
           - runs-on: macos-11
             cc: gcc-11
             cxx: g++-11
diff --git a/.github/workflows/cpp-ci.yml b/.github/workflows/cpp-ci.yml
@@ -18,6 +18,7 @@ jobs:
             cc: gcc-11
             cxx: g++-11
           # Pending https://github.com/actions/runner-images/issues/6350
+          # - runs-on: macos-12
           - runs-on: macos-11
             cc: gcc-11
             cxx: g++-11
diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py
@@ -5,7 +5,7 @@
 import pyarrow as pa
 import tiledb
 
-from . import util, util_arrow, util_pandas, util_tiledb
+from . import util, util_arrow, util_tiledb
 from .logging import log_io
 from .soma_collection import SOMACollectionBase
 from .tiledb_array import TileDBArray
@@ -218,15 +218,10 @@ def read(
                 iterator = query.df[ids]
 
             for table in iterator:
-                # XXX COMMENT MORE
-                # This is the 'decode on read' part of our logic; in dim_select we have the
-                # 'encode on write' part.
-                # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
-                #
-                # Also: don't materialize these on read
+                # Don't materialize the ROWID column on read
                 # TODO: get the arrow syntax for drop
                 # df.drop(ROWID, axis=1)
-                yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
+                yield table
 
     def read_all(
         self,
@@ -360,11 +355,6 @@ def read_as_pandas(
 
             for df in iterator:
 
-                # This is the 'decode on read' part of our logic; in dim_select we have the 'encode on
-                # write' part.
-                # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
-                df = util_pandas.ascii_to_unicode_pandas_readback(df)
-
                 if id_column_name is not None:
                     df.reset_index(inplace=True)
                     df.set_index(id_column_name, inplace=True)
@@ -445,39 +435,13 @@ def write_from_pandas(
 
         dataframe.set_index(ROWID, inplace=True)
 
-        # ISSUE:
-        #
-        # TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB
-        # QueryCondition API. While this needs to be addressed -- global collaborators will want to
-        # write annotation-dataframe values in Unicode -- until then, to make obs/var data possible
-        # to query, we need to store these as ASCII.
-        #
-        # This is (besides collation) a storage-level issue not a presentation-level issue: At write
-        # time, this works — "α,β,γ" stores as "\xce\xb1,\xce\xb2,\xce\xb3"; at read time: since
-        # SOMA is an API: utf8-decode those strings when a query is done & give the user back
-        # "α,β,γ".
-        #
-        # CONTEXT:
-        # https://github.com/single-cell-data/TileDB-SOMA/issues/99
-        # https://github.com/single-cell-data/TileDB-SOMA/pull/101
-        # https://github.com/single-cell-data/TileDB-SOMA/issues/106
-        # https://github.com/single-cell-data/TileDB-SOMA/pull/117
-        #
-        # IMPLEMENTATION:
-        # Python types -- float, string, what have you -- appear as dtype('O') which is not useful.
-        # Also, ``tiledb.from_pandas`` has ``column_types`` but that _forces_ things to string to a
-        # particular if they shouldn't be.
-        #
-        # Instead, we use ``dataframe.convert_dtypes`` to get a little jump on what ``tiledb.from_pandas``
-        # is going to be doing anyway, namely, type-inferring to see what is going to be a string.
-        #
-        # TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this.
+        # Force ASCII storage if string, in order to make obs/var columns queryable.
+        # TODO: when UTF-8 attributes are fully supported we can remove this.
         column_types = {}
         for column_name in dataframe.keys():
             dfc = dataframe[column_name]
             if len(dfc) > 0 and type(dfc[0]) == str:
-                # Force ASCII storage if string, in order to make obs/var columns queryable.
-                column_types[column_name] = np.dtype("S")
+                column_types[column_name] = "ascii"
 
         tiledb.from_pandas(
             uri=self.uri,
diff --git a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py
@@ -259,11 +259,7 @@ def read(
                 iterator = query.df[ids]
 
             for table in iterator:
-                # XXX COMMENT MORE
-                # This is the 'decode on read' part of our logic; in dim_select we have the
-                # 'encode on write' part.
-                # Context: # https://github.com/single-cell-data/TileDB-SOMA/issues/99.
-                yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
+                yield table
 
     def read_all(
         self,
diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py
@@ -23,9 +23,7 @@
     #
     # IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
     #
-    pa.string(): np.dtype(
-        "S"
-    ),  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
+    pa.string(): "ascii",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
     pa.binary(): np.dtype("S"),
     pa.timestamp("s"): "datetime64[s]",
     pa.timestamp("ms"): "datetime64[ms]",
@@ -39,7 +37,7 @@
 }
 
 
-def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
+def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
     """
     Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
     Building block for Arrow-to-TileDB schema translation.
@@ -61,7 +59,10 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
         arrow_type = ARROW_TO_TDB[t]
         if isinstance(arrow_type, Exception):
             raise arrow_type
-        return np.dtype(arrow_type)
+        if arrow_type == "ascii":
+            return arrow_type
+        else:
+            return np.dtype(arrow_type)
 
     if not pa.types.is_primitive(t):
         raise TypeError(f"Type {str(t)} - unsupported type")
@@ -83,11 +84,11 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
         raise TypeError("Unsupported Arrow type") from exc
 
 
-def get_arrow_type_from_tiledb_dtype(tiledb_dtype: np.dtype) -> pa.DataType:
+def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType:
     """
     TODO: COMMENT
     """
-    if tiledb_dtype.name == "bytes":
+    if tiledb_dtype == "ascii" or tiledb_dtype.name == "bytes":
         # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
         return pa.string()
     else:
@@ -119,22 +120,3 @@ def get_arrow_schema_from_tiledb_uri(
             arrow_schema_dict[name] = get_arrow_type_from_tiledb_dtype(attr.dtype)
 
     return pa.schema(arrow_schema_dict)
-
-
-def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table:
-    """
-    Implements the 'decode on read' part of our ASCII/Unicode logic
-    """
-    # TODO: COMMENT/LINK HEAVILY
-    names = [ofield.name for ofield in table.schema]
-    new_fields = []
-    for name in names:
-        old_field = table[name]
-        if len(old_field) > 0 and isinstance(old_field[0], pa.LargeBinaryScalar):
-            nfield = pa.array(
-                [element.as_py().decode("utf-8") for element in old_field]
-            )
-            new_fields.append(nfield)
-        else:
-            new_fields.append(old_field)
-    return pa.Table.from_arrays(new_fields, names=names)
diff --git a/apis/python/src/tiledbsoma/util_pandas.py b/apis/python/src/tiledbsoma/util_pandas.py
diff --git a/apis/python/tests/test_type_system.py b/apis/python/tests/test_type_system.py
@@ -65,7 +65,7 @@ def test_supported_types_supported(arrow_type):
         pytest.xfail("Awaiting UTF-8 support - see issue #338")
 
     tdb_dtype = tiledb_type_from_arrow_type(arrow_type)
-    assert isinstance(tdb_dtype, np.dtype)
+    assert isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii"
     rt_arrow_type = get_arrow_type_from_tiledb_dtype(tdb_dtype)
     assert isinstance(rt_arrow_type, pa.DataType)
     assert arrow_type == rt_arrow_type