From 938d70d49aa3969a4fee15e7b3ce79f248368e08 Mon Sep 17 00:00:00 2001
From: John Kerl <kerl.john.r@gmail.com>
Date: Mon, 3 Oct 2022 15:48:52 -0400
Subject: [PATCH 1/9] remove util_arrow.concat_tables

---
 apis/python/src/tiledbsoma/soma_dataframe.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py
index 830f2b2a30..25b611c094 100644
--- a/apis/python/src/tiledbsoma/soma_dataframe.py
+++ b/apis/python/src/tiledbsoma/soma_dataframe.py
@@ -254,13 +254,13 @@ def _get_is_sparse(self) -> bool:
 
         return self._cached_is_sparse
 
-    def write(self, values: pa.Table) -> None:
+    def write(self, values: pa.RecordBatch) -> None:
         """
-        Write an Arrow.Table to the persistent object.
+        Write an Arrow.RecordBatch to the persistent object.
 
-        :param values: An Arrow.Table containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``.
+        :param values: An Arrow.RecordBatch containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``.
 
-        The ``values`` Arrow Table must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written.
+        The ``values`` Arrow RecordBatch must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written.
         """
         self._shape = None  # cache-invalidate
 

From 0979f0c493d01dcac7f138bd8793e61ebca4e31c Mon Sep 17 00:00:00 2001
From: John Kerl <kerl.john.r@gmail.com>
Date: Tue, 4 Oct 2022 11:19:43 -0400
Subject: [PATCH 2/9] read/write Table

---
 apis/python/src/tiledbsoma/soma_dataframe.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py
index 25b611c094..830f2b2a30 100644
--- a/apis/python/src/tiledbsoma/soma_dataframe.py
+++ b/apis/python/src/tiledbsoma/soma_dataframe.py
@@ -254,13 +254,13 @@ def _get_is_sparse(self) -> bool:
 
         return self._cached_is_sparse
 
-    def write(self, values: pa.RecordBatch) -> None:
+    def write(self, values: pa.Table) -> None:
         """
-        Write an Arrow.RecordBatch to the persistent object.
+        Write an Arrow.Table to the persistent object.
 
-        :param values: An Arrow.RecordBatch containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``.
+        :param values: An Arrow.Table containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``.
 
-        The ``values`` Arrow RecordBatch must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written.
+        The ``values`` Arrow Table must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written.
         """
         self._shape = None  # cache-invalidate
 

From f208fe07d1bd4178d16bfa4bc35dec901165647c Mon Sep 17 00:00:00 2001
From: John Kerl <kerl.john.r@gmail.com>
Date: Tue, 4 Oct 2022 10:43:32 -0400
Subject: [PATCH 3/9] read/write Table

---
 .github/workflows/ci.yml     | 2 ++
 .github/workflows/cpp-ci.yml | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3d9e23d64a..c2a0bed359 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,6 +20,8 @@ jobs:
           - runs-on: ubuntu-22.04
             cc: gcc-11
             cxx: g++-11
+          - runs-on: macos-11
+          # Pending https://github.com/actions/runner-images/issues/6350
           - runs-on: macos-11
             cc: gcc-11
             cxx: g++-11
diff --git a/.github/workflows/cpp-ci.yml b/.github/workflows/cpp-ci.yml
index 0b0345e0b1..e22f826532 100644
--- a/.github/workflows/cpp-ci.yml
+++ b/.github/workflows/cpp-ci.yml
@@ -18,7 +18,6 @@ jobs:
             cc: gcc-11
             cxx: g++-11
           # Pending https://github.com/actions/runner-images/issues/6350
-          # - runs-on: macos-12
           - runs-on: macos-11
             cc: gcc-11
             cxx: g++-11

From 821a64c24d582b42a8cd0c28afd54d26906fc335 Mon Sep 17 00:00:00 2001
From: John Kerl <kerl.john.r@gmail.com>
Date: Tue, 4 Oct 2022 10:43:32 -0400
Subject: [PATCH 4/9] read/write Table

---
 .github/workflows/ci.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c2a0bed359..8b0cb87f1d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,7 +20,6 @@ jobs:
           - runs-on: ubuntu-22.04
             cc: gcc-11
             cxx: g++-11
-          - runs-on: macos-11
           # Pending https://github.com/actions/runner-images/issues/6350
           - runs-on: macos-11
             cc: gcc-11

From 124e2753ce2853f6446672e147b933cb7a4a0563 Mon Sep 17 00:00:00 2001
From: John Kerl <john.kerl@tiledb.com>
Date: Tue, 4 Oct 2022 11:24:24 -0400
Subject: [PATCH 5/9] Use true ASCII attributes in dataframes

---
 .github/workflows/ci.yml                      |  1 -
 .github/workflows/cpp-ci.yml                  |  1 +
 apis/python/src/tiledbsoma/soma_dataframe.py  | 45 +++----------------
 .../src/tiledbsoma/soma_indexed_dataframe.py  |  6 +--
 apis/python/src/tiledbsoma/util_arrow.py      | 38 ++++------------
 apis/python/src/tiledbsoma/util_pandas.py     | 13 ------
 apis/python/tests/test_type_system.py         |  2 +-
 7 files changed, 16 insertions(+), 90 deletions(-)
 delete mode 100644 apis/python/src/tiledbsoma/util_pandas.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8b0cb87f1d..3d9e23d64a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,7 +20,6 @@ jobs:
           - runs-on: ubuntu-22.04
             cc: gcc-11
             cxx: g++-11
-          # Pending https://github.com/actions/runner-images/issues/6350
           - runs-on: macos-11
             cc: gcc-11
             cxx: g++-11
diff --git a/.github/workflows/cpp-ci.yml b/.github/workflows/cpp-ci.yml
index e22f826532..0b0345e0b1 100644
--- a/.github/workflows/cpp-ci.yml
+++ b/.github/workflows/cpp-ci.yml
@@ -18,6 +18,7 @@ jobs:
             cc: gcc-11
             cxx: g++-11
           # Pending https://github.com/actions/runner-images/issues/6350
+          # - runs-on: macos-12
           - runs-on: macos-11
             cc: gcc-11
             cxx: g++-11
diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py
index 830f2b2a30..e3793056ea 100644
--- a/apis/python/src/tiledbsoma/soma_dataframe.py
+++ b/apis/python/src/tiledbsoma/soma_dataframe.py
@@ -5,7 +5,7 @@
 import pyarrow as pa
 import tiledb
 
-from . import util, util_arrow, util_pandas, util_tiledb
+from . import util, util_arrow, util_tiledb
 from .logging import log_io
 from .soma_collection import SOMACollectionBase
 from .tiledb_array import TileDBArray
@@ -205,11 +205,7 @@ def read(
                 iterator = query.df[ids]
 
             for table in iterator:
-                # XXX COMMENT MORE
-                # This is the 'decode on read' part of our logic; in dim_select we have the
-                # 'encode on write' part.
-                # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
-                yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
+                yield table
 
     def read_all(
         self,
@@ -343,11 +339,6 @@ def read_as_pandas(
 
             for df in iterator:
 
-                # This is the 'decode on read' part of our logic; in dim_select we have the 'encode on
-                # write' part.
-                # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
-                df = util_pandas.ascii_to_unicode_pandas_readback(df)
-
                 if id_column_name is not None:
                     df.reset_index(inplace=True)
                     df.set_index(id_column_name, inplace=True)
@@ -428,39 +419,13 @@ def write_from_pandas(
 
         dataframe.set_index(ROWID, inplace=True)
 
-        # ISSUE:
-        #
-        # TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB
-        # QueryCondition API. While this needs to be addressed -- global collaborators will want to
-        # write annotation-dataframe values in Unicode -- until then, to make obs/var data possible
-        # to query, we need to store these as ASCII.
-        #
-        # This is (besides collation) a storage-level issue not a presentation-level issue: At write
-        # time, this works — "α,β,γ" stores as "\xce\xb1,\xce\xb2,\xce\xb3"; at read time: since
-        # SOMA is an API: utf8-decode those strings when a query is done & give the user back
-        # "α,β,γ".
-        #
-        # CONTEXT:
-        # https://github.com/single-cell-data/TileDB-SOMA/issues/99
-        # https://github.com/single-cell-data/TileDB-SOMA/pull/101
-        # https://github.com/single-cell-data/TileDB-SOMA/issues/106
-        # https://github.com/single-cell-data/TileDB-SOMA/pull/117
-        #
-        # IMPLEMENTATION:
-        # Python types -- float, string, what have you -- appear as dtype('O') which is not useful.
-        # Also, ``tiledb.from_pandas`` has ``column_types`` but that _forces_ things to string to a
-        # particular if they shouldn't be.
-        #
-        # Instead, we use ``dataframe.convert_dtypes`` to get a little jump on what ``tiledb.from_pandas``
-        # is going to be doing anyway, namely, type-inferring to see what is going to be a string.
-        #
-        # TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this.
+        # Force ASCII storage if string, in order to make obs/var columns queryable.
+        # TODO: when UTF-8 attributes are fully supported we can remove this.
         column_types = {}
         for column_name in dataframe.keys():
             dfc = dataframe[column_name]
             if len(dfc) > 0 and type(dfc[0]) == str:
-                # Force ASCII storage if string, in order to make obs/var columns queryable.
-                column_types[column_name] = np.dtype("S")
+                column_types[column_name] = "ascii"
 
         tiledb.from_pandas(
             uri=self.uri,
diff --git a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py
index 1bb13f4f37..5e7fc84dd1 100644
--- a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py
+++ b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py
@@ -250,11 +250,7 @@ def read(
                 iterator = query.df[ids]
 
             for table in iterator:
-                # XXX COMMENT MORE
-                # This is the 'decode on read' part of our logic; in dim_select we have the
-                # 'encode on write' part.
-                # Context: # https://github.com/single-cell-data/TileDB-SOMA/issues/99.
-                yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
+                yield table
 
     def read_all(
         self,
diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py
index cafba6013a..8ceb7d4627 100644
--- a/apis/python/src/tiledbsoma/util_arrow.py
+++ b/apis/python/src/tiledbsoma/util_arrow.py
@@ -23,9 +23,7 @@
     #
     # IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
     #
-    pa.string(): np.dtype(
-        "S"
-    ),  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
+    pa.string(): "ascii",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
     pa.binary(): np.dtype("S"),
     pa.timestamp("s"): "datetime64[s]",
     pa.timestamp("ms"): "datetime64[ms]",
@@ -39,7 +37,7 @@
 }
 
 
-def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
+def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
     """
     Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
     Building block for Arrow-to-TileDB schema translation.
@@ -61,7 +59,10 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
         arrow_type = ARROW_TO_TDB[t]
         if isinstance(arrow_type, Exception):
             raise arrow_type
-        return np.dtype(arrow_type)
+        if arrow_type == "ascii":
+            return arrow_type
+        else:
+            return np.dtype(arrow_type)
 
     if not pa.types.is_primitive(t):
         raise TypeError(f"Type {str(t)} - unsupported type")
@@ -83,11 +84,11 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
         raise TypeError("Unsupported Arrow type") from exc
 
 
-def get_arrow_type_from_tiledb_dtype(tiledb_dtype: np.dtype) -> pa.DataType:
+def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType:
     """
     TODO: COMMENT
     """
-    if tiledb_dtype.name == "bytes":
+    if tiledb_dtype == "ascii" or tiledb_dtype.name == "bytes":
         # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
         return pa.string()
     else:
@@ -119,26 +120,3 @@ def get_arrow_schema_from_tiledb_uri(
             arrow_schema_dict[name] = get_arrow_type_from_tiledb_dtype(attr.dtype)
 
     return pa.schema(arrow_schema_dict)
-
-
-def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table:
-    """
-    Implements the 'decode on read' part of our ASCII/Unicode logic
-    """
-    # TODO: COMMENT/LINK HEAVILY
-    names = [ofield.name for ofield in table.schema]
-    new_fields = []
-    for name in names:
-        old_field = table[name]
-        # Preferred syntax:
-        # if len(old_field) > 0 and pa.types.is_large_binary(old_field[0]):
-        # but:
-        # AttributeError: 'pyarrow.lib.UInt64Scalar' object has no attribute 'id'
-        if len(old_field) > 0 and isinstance(old_field[0], pa.LargeBinaryScalar):
-            nfield = pa.array(
-                [element.as_py().decode("utf-8") for element in old_field]
-            )
-            new_fields.append(nfield)
-        else:
-            new_fields.append(old_field)
-    return pa.Table.from_arrays(new_fields, names=names)
diff --git a/apis/python/src/tiledbsoma/util_pandas.py b/apis/python/src/tiledbsoma/util_pandas.py
deleted file mode 100644
index b5caf01255..0000000000
--- a/apis/python/src/tiledbsoma/util_pandas.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import pandas as pd
-
-
-def ascii_to_unicode_pandas_readback(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Implements the 'decode on read' part of our ASCII/Unicode logic.
-    """
-    # TODO: COMMENT/LINK HEAVILY
-    for k in df:
-        dfk = df[k]
-        if len(dfk) > 0 and type(dfk.iat[0]) == bytes:
-            df[k] = dfk.map(lambda e: e.decode())
-    return df
diff --git a/apis/python/tests/test_type_system.py b/apis/python/tests/test_type_system.py
index 74af8fb5bb..4e0d515be4 100644
--- a/apis/python/tests/test_type_system.py
+++ b/apis/python/tests/test_type_system.py
@@ -65,7 +65,7 @@ def test_supported_types_supported(arrow_type):
         pytest.xfail("Awaiting UTF-8 support - see issue #338")
 
     tdb_dtype = tiledb_type_from_arrow_type(arrow_type)
-    assert isinstance(tdb_dtype, np.dtype)
+    assert isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii"
     rt_arrow_type = get_arrow_type_from_tiledb_dtype(tdb_dtype)
     assert isinstance(rt_arrow_type, pa.DataType)
     assert arrow_type == rt_arrow_type

From 324654b2cf9d2b186ab79ee69fcc1ae1ecb0bf22 Mon Sep 17 00:00:00 2001
From: John Kerl <kerl.john.r@gmail.com>
Date: Wed, 5 Oct 2022 15:12:17 -0400
Subject: [PATCH 6/9] iterate on string vs large_string

---
 apis/python/src/tiledbsoma/util_arrow.py         |  4 +++-
 apis/python/tests/test_soma_collection.py        |  4 ++--
 apis/python/tests/test_soma_dataframe.py         |  4 ++--
 apis/python/tests/test_soma_experiment_basic.py  |  4 ++--
 apis/python/tests/test_soma_indexed_dataframe.py | 10 +++++++++-
 apis/python/tests/test_soma_metadata.py          |  2 +-
 apis/python/tests/test_type_system.py            |  2 ++
 7 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py
index 8ceb7d4627..d48420de6e 100644
--- a/apis/python/src/tiledbsoma/util_arrow.py
+++ b/apis/python/src/tiledbsoma/util_arrow.py
@@ -24,7 +24,9 @@
     # IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
     #
     pa.string(): "ascii",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
+    pa.large_string(): "ascii",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
     pa.binary(): np.dtype("S"),
+    pa.large_binary(): np.dtype("S"),
     pa.timestamp("s"): "datetime64[s]",
     pa.timestamp("ms"): "datetime64[ms]",
     pa.timestamp("us"): "datetime64[us]",
@@ -90,7 +92,7 @@ def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.D
     """
     if tiledb_dtype == "ascii" or tiledb_dtype.name == "bytes":
         # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
-        return pa.string()
+        return pa.large_string()
     else:
         return pa.from_numpy_dtype(tiledb_dtype)
 
diff --git a/apis/python/tests/test_soma_collection.py b/apis/python/tests/test_soma_collection.py
index b022e5d5b3..6cd92bf502 100644
--- a/apis/python/tests/test_soma_collection.py
+++ b/apis/python/tests/test_soma_collection.py
@@ -15,7 +15,7 @@ def create_and_populate_dataframe(dataframe: soma.SOMADataFrame) -> None:
         [
             ("foo", pa.int32()),
             ("bar", pa.float64()),
-            ("baz", pa.string()),
+            ("baz", pa.large_string()),
         ]
     )
 
@@ -108,7 +108,7 @@ def soma_object(request, tmp_path):
 
     elif class_name == "SOMADataFrame":
         so = soma.SOMADataFrame(uri=uri)
-        so.create(pa.schema([("A", pa.int32()), ("B", pa.string())]))
+        so.create(pa.schema([("A", pa.int32()), ("B", pa.large_string())]))
 
     elif class_name == "SOMAIndexedDataFrame":
         so = soma.SOMAIndexedDataFrame(uri=uri)
diff --git a/apis/python/tests/test_soma_dataframe.py b/apis/python/tests/test_soma_dataframe.py
index 1ea1706e9a..b09b3a71a8 100644
--- a/apis/python/tests/test_soma_dataframe.py
+++ b/apis/python/tests/test_soma_dataframe.py
@@ -13,7 +13,7 @@ def test_soma_dataframe_non_indexed(tmp_path):
         [
             ("foo", pa.int32()),
             ("bar", pa.float64()),
-            ("baz", pa.string()),
+            ("baz", pa.large_string()),
         ]
     )
     sdf.create(schema=asch)
@@ -120,7 +120,7 @@ def simple_soma_data_frame(tmp_path):
             ("soma_rowid", pa.uint64()),
             ("A", pa.int64()),
             ("B", pa.float64()),
-            ("C", pa.string()),
+            ("C", pa.large_string()),
         ]
     )
     sdf = t.SOMADataFrame(uri=tmp_path.as_posix())
diff --git a/apis/python/tests/test_soma_experiment_basic.py b/apis/python/tests/test_soma_experiment_basic.py
index 622f257242..e5ee78d903 100644
--- a/apis/python/tests/test_soma_experiment_basic.py
+++ b/apis/python/tests/test_soma_experiment_basic.py
@@ -14,7 +14,7 @@ def create_and_populate_obs(obs: soma.SOMADataFrame) -> soma.SOMADataFrame:
         [
             ("foo", pa.int32()),
             ("bar", pa.float64()),
-            ("baz", pa.string()),
+            ("baz", pa.large_string()),
         ]
     )
 
@@ -37,7 +37,7 @@ def create_and_populate_var(var: soma.SOMADataFrame) -> soma.SOMADataFrame:
 
     var_arrow_schema = pa.schema(
         [
-            ("quux", pa.string()),
+            ("quux", pa.large_string()),
             ("xyzzy", pa.float64()),
         ]
     )
diff --git a/apis/python/tests/test_soma_indexed_dataframe.py b/apis/python/tests/test_soma_indexed_dataframe.py
index c9766f925e..1ad87e8d48 100644
--- a/apis/python/tests/test_soma_indexed_dataframe.py
+++ b/apis/python/tests/test_soma_indexed_dataframe.py
@@ -21,6 +21,14 @@ def _schema():
 def test_soma_indexed_dataframe(tmp_path, arrow_schema):
     sdf = t.SOMAIndexedDataFrame(uri=tmp_path.as_posix())
 
+    asch = pa.schema(
+        [
+            ("foo", pa.int32()),
+            ("bar", pa.float64()),
+            ("baz", pa.large_string()),
+        ]
+    )
+
     # Create
     asch = arrow_schema()
     sdf.create(schema=asch, index_column_names=["foo"])
@@ -72,7 +80,7 @@ def simple_soma_indexed_data_frame(tmp_path):
             ("index", pa.uint64()),
             ("A", pa.int64()),
             ("B", pa.float64()),
-            ("C", pa.string()),
+            ("C", pa.large_string()),
         ]
     )
     index_column_names = ["index"]
diff --git a/apis/python/tests/test_soma_metadata.py b/apis/python/tests/test_soma_metadata.py
index da8b1174b1..757aacf498 100644
--- a/apis/python/tests/test_soma_metadata.py
+++ b/apis/python/tests/test_soma_metadata.py
@@ -33,7 +33,7 @@ def soma_object(request, tmp_path):
 
     elif class_name == "SOMADataFrame":
         so = soma.SOMADataFrame(uri=uri)
-        so.create(pa.schema([("A", pa.int32()), ("B", pa.string())]))
+        so.create(pa.schema([("A", pa.int32()), ("B", pa.large_string())]))
 
     elif class_name == "SOMAIndexedDataFrame":
         so = soma.SOMAIndexedDataFrame(uri=uri)
diff --git a/apis/python/tests/test_type_system.py b/apis/python/tests/test_type_system.py
index 4e0d515be4..1e18df0c67 100644
--- a/apis/python/tests/test_type_system.py
+++ b/apis/python/tests/test_type_system.py
@@ -25,7 +25,9 @@
     pa.timestamp("us"),
     pa.timestamp("ns"),
     pa.string(),
+    pa.large_string(),
     pa.binary(),
+    pa.large_binary(),
 ]
 
 

From 8f2c6de7449bbc7aad812452693ee6a5d44531fd Mon Sep 17 00:00:00 2001
From: John Kerl <kerl.john.r@gmail.com>
Date: Mon, 10 Oct 2022 20:17:58 -0400
Subject: [PATCH 7/9] Fix ascii/binary issues in TileDB->Arrow and
 TileDB->Pandas->Arrows test cases

---
 apis/python/src/tiledbsoma/soma_dataframe.py |  2 +
 apis/python/src/tiledbsoma/util_arrow.py     | 22 ++++++---
 apis/python/tests/test_soma_dataframe.py     | 34 ++++++++++---
 apis/python/tests/test_type_system.py        | 52 +++++++++++++++-----
 4 files changed, 83 insertions(+), 27 deletions(-)

diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py
index e3793056ea..fb10998d08 100644
--- a/apis/python/src/tiledbsoma/soma_dataframe.py
+++ b/apis/python/src/tiledbsoma/soma_dataframe.py
@@ -426,6 +426,8 @@ def write_from_pandas(
             dfc = dataframe[column_name]
             if len(dfc) > 0 and type(dfc[0]) == str:
                 column_types[column_name] = "ascii"
+            if len(dfc) > 0 and type(dfc[0]) == bytes:
+                column_types[column_name] = "bytes"
 
         tiledb.from_pandas(
             uri=self.uri,
diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py
index d48420de6e..e01cc70a88 100644
--- a/apis/python/src/tiledbsoma/util_arrow.py
+++ b/apis/python/src/tiledbsoma/util_arrow.py
@@ -9,13 +9,17 @@
 of representing full type semantics, and correctly performing a
 round trip conversion (eg, T == to_arrow(to_tiledb(T)))
 
-Most primitive types are simple - eg, uint8. Of particular challenge
+Most primitive types are simple -- e.g., uint8. Of particular challenge
 are datetime/timestamps as TileDB has no distinction between a "datetime" and
 a "timedelta". The best Arrow match is TimestampType, as long as that
 TimestampType instance does NOT have a timezone set.
 
 Because of our round-trip requirement, all other Arrow temporal types
 are unsupported (even though they are just int64 under the covers).
+
+We auto-promote Arrow's string and binary to large_string and large_binary,
+respectively, as this is what TileDB stores -- a sequence of bytes preceded
+by a 64-bit (not 32-bit) length int.
 """
 ARROW_TO_TDB = {
     # Dict of types unsupported by to_pandas_dtype, which require overrides.
@@ -25,8 +29,8 @@
     #
     pa.string(): "ascii",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
     pa.large_string(): "ascii",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
-    pa.binary(): np.dtype("S"),
-    pa.large_binary(): np.dtype("S"),
+    pa.binary(): "bytes",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
+    pa.large_binary(): "bytes",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
     pa.timestamp("s"): "datetime64[s]",
     pa.timestamp("ms"): "datetime64[ms]",
     pa.timestamp("us"): "datetime64[us]",
@@ -63,8 +67,9 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
             raise arrow_type
         if arrow_type == "ascii":
             return arrow_type
-        else:
-            return np.dtype(arrow_type)
+        if arrow_type == "bytes":
+            return arrow_type  # np.int8()
+        return np.dtype(arrow_type)
 
     if not pa.types.is_primitive(t):
         raise TypeError(f"Type {str(t)} - unsupported type")
@@ -90,11 +95,12 @@ def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.D
     """
     TODO: COMMENT
     """
-    if tiledb_dtype == "ascii" or tiledb_dtype.name == "bytes":
+    if tiledb_dtype == "bytes":
+        return pa.large_binary()
+    if isinstance(tiledb_dtype, str) and tiledb_dtype == "ascii":
         # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
         return pa.large_string()
-    else:
-        return pa.from_numpy_dtype(tiledb_dtype)
+    return pa.from_numpy_dtype(tiledb_dtype)
 
 
 def get_arrow_schema_from_tiledb_uri(
diff --git a/apis/python/tests/test_soma_dataframe.py b/apis/python/tests/test_soma_dataframe.py
index b09b3a71a8..fee48d3f7f 100644
--- a/apis/python/tests/test_soma_dataframe.py
+++ b/apis/python/tests/test_soma_dataframe.py
@@ -174,37 +174,57 @@ def test_SOMADataFrame_read_column_names(simple_soma_data_frame, ids, col_names)
     schema, sdf, n_data = simple_soma_data_frame
     assert sdf.exists()
 
-    def _check_tbl(tbl, col_names, ids):
+    def _check_tbl(tbl, col_names, ids, *, demote):
         assert tbl.num_columns == (
             len(schema.names) if col_names is None else len(col_names)
         )
         assert tbl.num_rows == (n_data if ids is None else len(ids))
-        assert tbl.schema == pa.schema(
-            [
-                schema.field(f)
-                for f in (col_names if col_names is not None else schema.names)
-            ]
-        )
 
+        if demote:
+            assert tbl.schema == pa.schema(
+                [
+                    pa.field(schema.field(f).name, pa.string())
+                    if schema.field(f).type == pa.large_string()
+                    else schema.field(f)
+                    for f in (col_names if col_names is not None else schema.names)
+                ]
+            )
+        else:
+            assert tbl.schema == pa.schema(
+                [
+                    schema.field(f)
+                    for f in (col_names if col_names is not None else schema.names)
+                ]
+            )
+
+    # TileDB ASCII -> Arrow large_string
     _check_tbl(
         sdf.read_all(ids=ids, column_names=col_names),
         col_names,
         ids,
+        demote=False,
     )
+
     _check_tbl(
         sdf.read_all(column_names=col_names),
         col_names,
         None,
+        demote=False,
     )
+
+    # TileDB ASCII -> Pandas string -> Arrow string (not large_string)
     _check_tbl(
         pa.Table.from_pandas(
             pd.concat(sdf.read_as_pandas(ids=ids, column_names=col_names))
         ),
         col_names,
         ids,
+        demote=True,
     )
+
     _check_tbl(
         pa.Table.from_pandas(sdf.read_as_pandas_all(column_names=col_names)),
         col_names,
         None,
+        demote=True,
     )
diff --git a/apis/python/tests/test_type_system.py b/apis/python/tests/test_type_system.py
index 1e18df0c67..62988001ff 100644
--- a/apis/python/tests/test_type_system.py
+++ b/apis/python/tests/test_type_system.py
@@ -24,12 +24,20 @@
     pa.timestamp("ms"),
     pa.timestamp("us"),
     pa.timestamp("ns"),
-    pa.string(),
+    # We use Arrow's large_string for ASCII and, ultimately, for Unicode as well
+    # https://github.com/single-cell-data/TileDB-SOMA/issues/99
+    # https://github.com/single-cell-data/TileDB-SOMA/pull/359
+    # https://github.com/single-cell-data/TileDB-SOMA/issues/274
     pa.large_string(),
-    pa.binary(),
     pa.large_binary(),
 ]
 
+"""Arrow types we expect to auto-promote"""
+PROMOTED_ARROW_TYPES = [
+    (pa.string(), pa.large_string()),
+    # XXX (pa.binary(), pa.large_binary()),
+]
+
 
 """Arrow types we expect to fail"""
 UNSUPPORTED_ARROW_TYPES = [
@@ -46,10 +54,13 @@
     pa.duration("us"),
     pa.duration("ns"),
     pa.month_day_nano_interval(),
+    # We use Arrow's large_string for ASCII and, ultimately, for Unicode as well
+    # https://github.com/single-cell-data/TileDB-SOMA/issues/99
+    # https://github.com/single-cell-data/TileDB-SOMA/pull/359
+    # https://github.com/single-cell-data/TileDB-SOMA/issues/274
+    pa.string(),
     pa.binary(),
     pa.binary(10),
-    pa.large_binary(),
-    pa.large_string(),
     pa.decimal128(1),
     pa.decimal128(38),
     pa.list_(pa.int8()),
@@ -61,20 +72,37 @@
 
 
 @pytest.mark.parametrize("arrow_type", SUPPORTED_ARROW_TYPES)
-def test_supported_types_supported(arrow_type):
+def test_arrow_types_supported(arrow_type):
     """Verify round-trip conversion of types"""
-    if pa.types.is_binary(arrow_type):
-        pytest.xfail("Awaiting UTF-8 support - see issue #338")
+    # if pa.types.is_binary(arrow_type):
+    # pytest.xfail("Awaiting UTF-8 support - see issue #274")
 
     tdb_dtype = tiledb_type_from_arrow_type(arrow_type)
-    assert isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii"
-    rt_arrow_type = get_arrow_type_from_tiledb_dtype(tdb_dtype)
-    assert isinstance(rt_arrow_type, pa.DataType)
-    assert arrow_type == rt_arrow_type
+    assert (
+        isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii" or tdb_dtype == "bytes"
+    )
+    arrow_rt_type = get_arrow_type_from_tiledb_dtype(tdb_dtype)
+    assert isinstance(arrow_rt_type, pa.DataType)
+    assert arrow_type == arrow_rt_type
+
+
+@pytest.mark.parametrize("arrow_from_to_pair", PROMOTED_ARROW_TYPES)
+def test_arrow_types_promoted(arrow_from_to_pair):
+    """Verify round-trip conversion of types"""
+    arrow_from_type = arrow_from_to_pair[0]
+    arrow_to_type = arrow_from_to_pair[1]
+
+    tdb_dtype = tiledb_type_from_arrow_type(arrow_from_type)
+    assert (
+        isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii" or tdb_dtype == "bytes"
+    )
+    arrow_rt_type = get_arrow_type_from_tiledb_dtype(tdb_dtype)
+    assert isinstance(arrow_rt_type, pa.DataType)
+    assert arrow_to_type == arrow_rt_type
 
 
 @pytest.mark.parametrize("arrow_type", UNSUPPORTED_ARROW_TYPES)
-def test_supported_types_unsupported(arrow_type):
+def test_arrow_types_unsupported(arrow_type):
     """Verify correct error for unsupported types"""
     with pytest.raises(TypeError):
         tiledb_type_from_arrow_type(arrow_type, match=r".*unsupported type.*")

From 07327904ff7d1aa54d3c209c1315ce4090c6a442 Mon Sep 17 00:00:00 2001
From: John Kerl <john.kerl@tiledb.com>
Date: Tue, 11 Oct 2022 17:55:39 +0000
Subject: [PATCH 8/9] more

---
 libtiledbsoma/test/test_query_condition.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libtiledbsoma/test/test_query_condition.py b/libtiledbsoma/test/test_query_condition.py
index 57f4788085..1e3f8e09eb 100755
--- a/libtiledbsoma/test/test_query_condition.py
+++ b/libtiledbsoma/test/test_query_condition.py
@@ -54,7 +54,7 @@ def test_query_condition_int():
 
 def test_query_condition_string():
     uri = os.path.join(SOMA_URI, "obs")
-    condition = "louvain == b'NK cells'"
+    condition = 'louvain == "NK cells"'
 
     pandas = pandas_query(uri, condition)
 
@@ -90,7 +90,7 @@ def test_query_condition_and():
 
 def test_query_condition_and_or():
     uri = os.path.join(SOMA_URI, "obs")
-    condition = "(percent_mito > 0.02 and n_genes > 700) or (percent_mito < 0.015 and louvain == b'B cells')"
+    condition = "(percent_mito > 0.02 and n_genes > 700) or (percent_mito < 0.015 and louvain == 'B cells')"
 
     pandas = pandas_query(uri, condition)
 

From 0905781b1c017d38f4818153448010b359ac1db5 Mon Sep 17 00:00:00 2001
From: John Kerl <john.kerl@tiledb.com>
Date: Tue, 11 Oct 2022 18:42:59 +0000
Subject: [PATCH 9/9] fix unit tests

---
 apis/python/src/tiledbsoma/soma_dataframe.py |  2 +-
 apis/python/src/tiledbsoma/util_arrow.py     | 14 ++++++++++++++
 libtiledbsoma/test/test_query_condition.py   |  2 +-
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py
index fb10998d08..e046fbcf47 100644
--- a/apis/python/src/tiledbsoma/soma_dataframe.py
+++ b/apis/python/src/tiledbsoma/soma_dataframe.py
@@ -271,7 +271,7 @@ def write(self, values: pa.Table) -> None:
             if name != ROWID:
                 attr_cols_map[name] = np.asarray(
                     values.column(name).to_pandas(
-                        types_mapper=util_arrow.tiledb_type_from_arrow_type,
+                        types_mapper=util_arrow.tiledb_type_from_arrow_type_for_write,
                     )
                 )
 
diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py
index e01cc70a88..a67de8118c 100644
--- a/apis/python/src/tiledbsoma/util_arrow.py
+++ b/apis/python/src/tiledbsoma/util_arrow.py
@@ -43,6 +43,20 @@
 }
 
 
+def tiledb_type_from_arrow_type_for_write(t: pa.DataType) -> Union[type, np.dtype, str]:
+    """
+    Same as ``tiledb_type_from_arrow_type`` except that this is used for writing to a TileDB array.
+    The syntax of TileDB-Py is such that when we want to create a schema with an ASCII column,
+    we use the string ``"ascii"`` in place of a dtype. But when we want to write data, we need to
+    use a dtype of ``np.str``, which is now deprecated in favor of simply ``str``.
+    """
+    retval = tiledb_type_from_arrow_type(t)
+    if retval == "ascii":
+        return str
+    else:
+        return retval
+
+
 def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
     """
     Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
diff --git a/libtiledbsoma/test/test_query_condition.py b/libtiledbsoma/test/test_query_condition.py
index 1e3f8e09eb..77b6977d49 100755
--- a/libtiledbsoma/test/test_query_condition.py
+++ b/libtiledbsoma/test/test_query_condition.py
@@ -90,7 +90,7 @@ def test_query_condition_and():
 
 def test_query_condition_and_or():
     uri = os.path.join(SOMA_URI, "obs")
-    condition = "(percent_mito > 0.02 and n_genes > 700) or (percent_mito < 0.015 and louvain == 'B cells')"
+    condition = '(percent_mito > 0.02 and n_genes > 700) or (percent_mito < 0.015 and louvain == "B cells")'
 
     pandas = pandas_query(uri, condition)