Use true ASCII attributes in dataframes (#359)

johnkerl · web-flow · commit b58971637c2a · 2022-10-11T14:56:38.000-04:00
* remove util_arrow.concat_tables

* read/write Table

* read/write Table

* read/write Table

* Use true ASCII attributes in dataframes

* iterate on string vs large_string

* Fix ascii/binary issues in TileDB-&gt;Arrow and TileDB-&gt;Pandas-&gt;Arrows test cases

* more

* fix unit tests

Co-authored-by: John Kerl &lt;john.kerl@tiledb.com&gt;
diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py
@@ -5,7 +5,7 @@
 import pyarrow as pa
 import tiledb
 
-from . import util, util_arrow, util_pandas, util_tiledb
+from . import util, util_arrow, util_tiledb
 from .logging import log_io
 from .soma_collection import SOMACollectionBase
 from .tiledb_array import TileDBArray
@@ -205,11 +205,7 @@ def read(
                 iterator = query.df[ids]
 
             for table in iterator:
-                # XXX COMMENT MORE
-                # This is the 'decode on read' part of our logic; in dim_select we have the
-                # 'encode on write' part.
-                # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
-                yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
+                yield table
 
     def read_all(
         self,
@@ -275,7 +271,7 @@ def write(self, values: pa.Table) -> None:
             if name != ROWID:
                 attr_cols_map[name] = np.asarray(
                     values.column(name).to_pandas(
-                        types_mapper=util_arrow.tiledb_type_from_arrow_type,
+                        types_mapper=util_arrow.tiledb_type_from_arrow_type_for_write,
                     )
                 )
 
@@ -343,11 +339,6 @@ def read_as_pandas(
 
             for df in iterator:
 
-                # This is the 'decode on read' part of our logic; in dim_select we have the 'encode on
-                # write' part.
-                # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
-                df = util_pandas.ascii_to_unicode_pandas_readback(df)
-
                 if id_column_name is not None:
                     df.reset_index(inplace=True)
                     df.set_index(id_column_name, inplace=True)
@@ -428,39 +419,15 @@ def write_from_pandas(
 
         dataframe.set_index(ROWID, inplace=True)
 
-        # ISSUE:
-        #
-        # TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB
-        # QueryCondition API. While this needs to be addressed -- global collaborators will want to
-        # write annotation-dataframe values in Unicode -- until then, to make obs/var data possible
-        # to query, we need to store these as ASCII.
-        #
-        # This is (besides collation) a storage-level issue not a presentation-level issue: At write
-        # time, this works — "α,β,γ" stores as "\xce\xb1,\xce\xb2,\xce\xb3"; at read time: since
-        # SOMA is an API: utf8-decode those strings when a query is done & give the user back
-        # "α,β,γ".
-        #
-        # CONTEXT:
-        # https://github.com/single-cell-data/TileDB-SOMA/issues/99
-        # https://github.com/single-cell-data/TileDB-SOMA/pull/101
-        # https://github.com/single-cell-data/TileDB-SOMA/issues/106
-        # https://github.com/single-cell-data/TileDB-SOMA/pull/117
-        #
-        # IMPLEMENTATION:
-        # Python types -- float, string, what have you -- appear as dtype('O') which is not useful.
-        # Also, ``tiledb.from_pandas`` has ``column_types`` but that _forces_ things to string to a
-        # particular if they shouldn't be.
-        #
-        # Instead, we use ``dataframe.convert_dtypes`` to get a little jump on what ``tiledb.from_pandas``
-        # is going to be doing anyway, namely, type-inferring to see what is going to be a string.
-        #
-        # TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this.
+        # Force ASCII storage if string, in order to make obs/var columns queryable.
+        # TODO: when UTF-8 attributes are fully supported we can remove this.
         column_types = {}
         for column_name in dataframe.keys():
             dfc = dataframe[column_name]
             if len(dfc) > 0 and type(dfc[0]) == str:
-                # Force ASCII storage if string, in order to make obs/var columns queryable.
-                column_types[column_name] = np.dtype("S")
+                column_types[column_name] = "ascii"
+            if len(dfc) > 0 and type(dfc[0]) == bytes:
+                column_types[column_name] = "bytes"
 
         tiledb.from_pandas(
             uri=self.uri,
diff --git a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py
@@ -250,11 +250,7 @@ def read(
                 iterator = query.df[ids]
 
             for table in iterator:
-                # XXX COMMENT MORE
-                # This is the 'decode on read' part of our logic; in dim_select we have the
-                # 'encode on write' part.
-                # Context: # https://github.com/single-cell-data/TileDB-SOMA/issues/99.
-                yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
+                yield table
 
     def read_all(
         self,
diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py
@@ -9,24 +9,28 @@
 of representing full type semantics, and correctly performing a
 round trip conversion (eg, T == to_arrow(to_tiledb(T)))
 
-Most primitive types are simple - eg, uint8. Of particular challenge
+Most primitive types are simple -- e.g., uint8. Of particular challenge
 are datetime/timestamps as TileDB has no distinction between a "datetime" and
 a "timedelta". The best Arrow match is TimestampType, as long as that
 TimestampType instance does NOT have a timezone set.
 
 Because of our round-trip requirement, all other Arrow temporal types
 are unsupported (even though they are just int64 under the covers).
+
+We auto-promote Arrow's string and binary to large_string and large_binary,
+respectively, as this is what TileDB stores -- a sequence of bytes preceded
+by a 64-bit (not 32-bit) length int.
 """
 ARROW_TO_TDB = {
     # Dict of types unsupported by to_pandas_dtype, which require overrides.
     # If the value is an instance of Exception, it will be raised.
     #
     # IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
     #
-    pa.string(): np.dtype(
-        "S"
-    ),  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
-    pa.binary(): np.dtype("S"),
+    pa.string(): "ascii",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
+    pa.large_string(): "ascii",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
+    pa.binary(): "bytes",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
+    pa.large_binary(): "bytes",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
     pa.timestamp("s"): "datetime64[s]",
     pa.timestamp("ms"): "datetime64[ms]",
     pa.timestamp("us"): "datetime64[us]",
@@ -39,7 +43,21 @@
 }
 
 
-def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
+def tiledb_type_from_arrow_type_for_write(t: pa.DataType) -> Union[type, np.dtype, str]:
+    """
+    Same as ``tiledb_type_from_arrow_type`` except that this is used for writing to a TileDB array.
+    The syntax of TileDB-Py is such that when we want to create a schema with an ASCII column,
+    we use the string ``"ascii"`` in place of a dtype. But when we want to write data, we need to
+    use a dtype of ``np.str``, which is now deprecated in favor of simply ``str``.
+    """
+    retval = tiledb_type_from_arrow_type(t)
+    if retval == "ascii":
+        return str
+    else:
+        return retval
+
+
+def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
     """
     Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
     Building block for Arrow-to-TileDB schema translation.
@@ -61,6 +79,10 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
         arrow_type = ARROW_TO_TDB[t]
         if isinstance(arrow_type, Exception):
             raise arrow_type
+        if arrow_type == "ascii":
+            return arrow_type
+        if arrow_type == "bytes":
+            return arrow_type  # np.int8()
         return np.dtype(arrow_type)
 
     if not pa.types.is_primitive(t):
@@ -83,15 +105,16 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
         raise TypeError("Unsupported Arrow type") from exc
 
 
-def get_arrow_type_from_tiledb_dtype(tiledb_dtype: np.dtype) -> pa.DataType:
+def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType:
     """
     TODO: COMMENT
     """
-    if tiledb_dtype.name == "bytes":
+    if tiledb_dtype == "bytes":
+        return pa.large_binary()
+    if isinstance(tiledb_dtype, str) and tiledb_dtype == "ascii":
         # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
-        return pa.string()
-    else:
-        return pa.from_numpy_dtype(tiledb_dtype)
+        return pa.large_string()
+    return pa.from_numpy_dtype(tiledb_dtype)
 
 
 def get_arrow_schema_from_tiledb_uri(
@@ -119,26 +142,3 @@ def get_arrow_schema_from_tiledb_uri(
             arrow_schema_dict[name] = get_arrow_type_from_tiledb_dtype(attr.dtype)
 
     return pa.schema(arrow_schema_dict)
-
-
-def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table:
-    """
-    Implements the 'decode on read' part of our ASCII/Unicode logic
-    """
-    # TODO: COMMENT/LINK HEAVILY
-    names = [ofield.name for ofield in table.schema]
-    new_fields = []
-    for name in names:
-        old_field = table[name]
-        # Preferred syntax:
-        # if len(old_field) > 0 and pa.types.is_large_binary(old_field[0]):
-        # but:
-        # AttributeError: 'pyarrow.lib.UInt64Scalar' object has no attribute 'id'
-        if len(old_field) > 0 and isinstance(old_field[0], pa.LargeBinaryScalar):
-            nfield = pa.array(
-                [element.as_py().decode("utf-8") for element in old_field]
-            )
-            new_fields.append(nfield)
-        else:
-            new_fields.append(old_field)
-    return pa.Table.from_arrays(new_fields, names=names)
diff --git a/apis/python/src/tiledbsoma/util_pandas.py b/apis/python/src/tiledbsoma/util_pandas.py
diff --git a/apis/python/tests/test_soma_collection.py b/apis/python/tests/test_soma_collection.py
@@ -15,7 +15,7 @@ def create_and_populate_dataframe(dataframe: soma.SOMADataFrame) -> None:
         [
             ("foo", pa.int32()),
             ("bar", pa.float64()),
-            ("baz", pa.string()),
+            ("baz", pa.large_string()),
         ]
     )
 
@@ -108,7 +108,7 @@ def soma_object(request, tmp_path):
 
     elif class_name == "SOMADataFrame":
         so = soma.SOMADataFrame(uri=uri)
-        so.create(pa.schema([("A", pa.int32()), ("B", pa.string())]))
+        so.create(pa.schema([("A", pa.int32()), ("B", pa.large_string())]))
 
     elif class_name == "SOMAIndexedDataFrame":
         so = soma.SOMAIndexedDataFrame(uri=uri)
diff --git a/apis/python/tests/test_soma_dataframe.py b/apis/python/tests/test_soma_dataframe.py
@@ -13,7 +13,7 @@ def test_soma_dataframe_non_indexed(tmp_path):
         [
             ("foo", pa.int32()),
             ("bar", pa.float64()),
-            ("baz", pa.string()),
+            ("baz", pa.large_string()),
         ]
     )
     sdf.create(schema=asch)
@@ -120,7 +120,7 @@ def simple_soma_data_frame(tmp_path):
             ("soma_rowid", pa.uint64()),
             ("A", pa.int64()),
             ("B", pa.float64()),
-            ("C", pa.string()),
+            ("C", pa.large_string()),
         ]
     )
     sdf = t.SOMADataFrame(uri=tmp_path.as_posix())
@@ -174,37 +174,57 @@ def test_SOMADataFrame_read_column_names(simple_soma_data_frame, ids, col_names)
     schema, sdf, n_data = simple_soma_data_frame
     assert sdf.exists()
 
-    def _check_tbl(tbl, col_names, ids):
+    def _check_tbl(tbl, col_names, ids, *, demote):
         assert tbl.num_columns == (
             len(schema.names) if col_names is None else len(col_names)
         )
         assert tbl.num_rows == (n_data if ids is None else len(ids))
-        assert tbl.schema == pa.schema(
-            [
-                schema.field(f)
-                for f in (col_names if col_names is not None else schema.names)
-            ]
-        )
 
+        if demote:
+            assert tbl.schema == pa.schema(
+                [
+                    pa.field(schema.field(f).name, pa.string())
+                    if schema.field(f).type == pa.large_string()
+                    else schema.field(f)
+                    for f in (col_names if col_names is not None else schema.names)
+                ]
+            )
+        else:
+            assert tbl.schema == pa.schema(
+                [
+                    schema.field(f)
+                    for f in (col_names if col_names is not None else schema.names)
+                ]
+            )
+
+    # TileDB ASCII -> Arrow large_string
     _check_tbl(
         sdf.read_all(ids=ids, column_names=col_names),
         col_names,
         ids,
+        demote=False,
     )
+
     _check_tbl(
         sdf.read_all(column_names=col_names),
         col_names,
         None,
+        demote=False,
     )
+
+    # TileDB ASCII -> Pandas string -> Arrow string (not large_string)
     _check_tbl(
         pa.Table.from_pandas(
             pd.concat(sdf.read_as_pandas(ids=ids, column_names=col_names))
         ),
         col_names,
         ids,
+        demote=True,
     )
+
     _check_tbl(
         pa.Table.from_pandas(sdf.read_as_pandas_all(column_names=col_names)),
         col_names,
         None,
+        demote=True,
     )
diff --git a/apis/python/tests/test_soma_experiment_basic.py b/apis/python/tests/test_soma_experiment_basic.py
@@ -14,7 +14,7 @@ def create_and_populate_obs(obs: soma.SOMADataFrame) -> soma.SOMADataFrame:
         [
             ("foo", pa.int32()),
             ("bar", pa.float64()),
-            ("baz", pa.string()),
+            ("baz", pa.large_string()),
         ]
     )
 
@@ -37,7 +37,7 @@ def create_and_populate_var(var: soma.SOMADataFrame) -> soma.SOMADataFrame:
 
     var_arrow_schema = pa.schema(
         [
-            ("quux", pa.string()),
+            ("quux", pa.large_string()),
             ("xyzzy", pa.float64()),
         ]
     )
diff --git a/apis/python/tests/test_soma_indexed_dataframe.py b/apis/python/tests/test_soma_indexed_dataframe.py
@@ -21,6 +21,14 @@ def _schema():
 def test_soma_indexed_dataframe(tmp_path, arrow_schema):
     sdf = t.SOMAIndexedDataFrame(uri=tmp_path.as_posix())
 
+    asch = pa.schema(
+        [
+            ("foo", pa.int32()),
+            ("bar", pa.float64()),
+            ("baz", pa.large_string()),
+        ]
+    )
+
     # Create
     asch = arrow_schema()
     sdf.create(schema=asch, index_column_names=["foo"])
@@ -72,7 +80,7 @@ def simple_soma_indexed_data_frame(tmp_path):
             ("index", pa.uint64()),
             ("A", pa.int64()),
             ("B", pa.float64()),
-            ("C", pa.string()),
+            ("C", pa.large_string()),
         ]
     )
     index_column_names = ["index"]
diff --git a/apis/python/tests/test_soma_metadata.py b/apis/python/tests/test_soma_metadata.py
@@ -33,7 +33,7 @@ def soma_object(request, tmp_path):
 
     elif class_name == "SOMADataFrame":
         so = soma.SOMADataFrame(uri=uri)
-        so.create(pa.schema([("A", pa.int32()), ("B", pa.string())]))
+        so.create(pa.schema([("A", pa.int32()), ("B", pa.large_string())]))
 
     elif class_name == "SOMAIndexedDataFrame":
         so = soma.SOMAIndexedDataFrame(uri=uri)
diff --git a/apis/python/tests/test_type_system.py b/apis/python/tests/test_type_system.py
diff --git a/libtiledbsoma/test/test_query_condition.py b/libtiledbsoma/test/test_query_condition.py

Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@ def create_and_populate_dataframe(dataframe: soma.SOMADataFrame) -> None:`
`15`	`15`	`[`
`16`	`16`	`("foo", pa.int32()),`
`17`	`17`	`("bar", pa.float64()),`
`18`		`- ("baz", pa.string()),`
	`18`	`+ ("baz", pa.large_string()),`
`19`	`19`	`]`
`20`	`20`	`)`
`21`	`21`
`@@ -108,7 +108,7 @@ def soma_object(request, tmp_path):`
`108`	`108`
`109`	`109`	`elif class_name == "SOMADataFrame":`
`110`	`110`	`so = soma.SOMADataFrame(uri=uri)`
`111`		`- so.create(pa.schema([("A", pa.int32()), ("B", pa.string())]))`
	`111`	`+ so.create(pa.schema([("A", pa.int32()), ("B", pa.large_string())]))`
`112`	`112`
`113`	`113`	`elif class_name == "SOMAIndexedDataFrame":`
`114`	`114`	`so = soma.SOMAIndexedDataFrame(uri=uri)`
Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@ def create_and_populate_obs(obs: soma.SOMADataFrame) -> soma.SOMADataFrame:`
`14`	`14`	`[`
`15`	`15`	`("foo", pa.int32()),`
`16`	`16`	`("bar", pa.float64()),`
`17`		`- ("baz", pa.string()),`
	`17`	`+ ("baz", pa.large_string()),`
`18`	`18`	`]`
`19`	`19`	`)`
`20`	`20`
`@@ -37,7 +37,7 @@ def create_and_populate_var(var: soma.SOMADataFrame) -> soma.SOMADataFrame:`
`37`	`37`
`38`	`38`	`var_arrow_schema = pa.schema(`
`39`	`39`	`[`
`40`		`- ("quux", pa.string()),`
	`40`	`+ ("quux", pa.large_string()),`
`41`	`41`	`("xyzzy", pa.float64()),`
`42`	`42`	`]`
`43`	`43`	`)`