From 938d70d49aa3969a4fee15e7b3ce79f248368e08 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 3 Oct 2022 15:48:52 -0400 Subject: [PATCH 1/9] remove util_arrow.concat_tables --- apis/python/src/tiledbsoma/soma_dataframe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py index 830f2b2a30..25b611c094 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_dataframe.py @@ -254,13 +254,13 @@ def _get_is_sparse(self) -> bool: return self._cached_is_sparse - def write(self, values: pa.Table) -> None: + def write(self, values: pa.RecordBatch) -> None: """ - Write an Arrow.Table to the persistent object. + Write an Arrow.RecordBatch to the persistent object. - :param values: An Arrow.Table containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``. + :param values: An Arrow.RecordBatch containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``. - The ``values`` Arrow Table must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written. + The ``values`` Arrow RecordBatch must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written. """ self._shape = None # cache-invalidate From 0979f0c493d01dcac7f138bd8793e61ebca4e31c Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 4 Oct 2022 11:19:43 -0400 Subject: [PATCH 2/9] read/write Table --- apis/python/src/tiledbsoma/soma_dataframe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py index 25b611c094..830f2b2a30 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_dataframe.py @@ -254,13 +254,13 @@ def _get_is_sparse(self) -> bool: return self._cached_is_sparse - def write(self, values: pa.RecordBatch) -> None: + def write(self, values: pa.Table) -> None: """ - Write an Arrow.RecordBatch to the persistent object. + Write an Arrow.Table to the persistent object. - :param values: An Arrow.RecordBatch containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``. + :param values: An Arrow.Table containing all columns, including the index columns. The schema for the values must match the schema for the ``SOMADataFrame``. - The ``values`` Arrow RecordBatch must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written. + The ``values`` Arrow Table must contain a ``soma_rowid`` (uint64) column, indicating which rows are being written. """ self._shape = None # cache-invalidate From f208fe07d1bd4178d16bfa4bc35dec901165647c Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 4 Oct 2022 10:43:32 -0400 Subject: [PATCH 3/9] read/write Table --- .github/workflows/ci.yml | 2 ++ .github/workflows/cpp-ci.yml | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3d9e23d64a..c2a0bed359 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,6 +20,8 @@ jobs: - runs-on: ubuntu-22.04 cc: gcc-11 cxx: g++-11 + - runs-on: macos-11 + # Pending https://github.com/actions/runner-images/issues/6350 - runs-on: macos-11 cc: gcc-11 cxx: g++-11 diff --git a/.github/workflows/cpp-ci.yml b/.github/workflows/cpp-ci.yml index 0b0345e0b1..e22f826532 100644 --- a/.github/workflows/cpp-ci.yml +++ b/.github/workflows/cpp-ci.yml @@ -18,7 +18,6 @@ jobs: cc: gcc-11 cxx: g++-11 # Pending https://github.com/actions/runner-images/issues/6350 - # - runs-on: macos-12 - runs-on: macos-11 cc: gcc-11 cxx: g++-11 From 821a64c24d582b42a8cd0c28afd54d26906fc335 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 4 Oct 2022 10:43:32 -0400 Subject: [PATCH 4/9] read/write Table --- .github/workflows/ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c2a0bed359..8b0cb87f1d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,6 @@ jobs: - runs-on: ubuntu-22.04 cc: gcc-11 cxx: g++-11 - - runs-on: macos-11 # Pending https://github.com/actions/runner-images/issues/6350 - runs-on: macos-11 cc: gcc-11 From 124e2753ce2853f6446672e147b933cb7a4a0563 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 4 Oct 2022 11:24:24 -0400 Subject: [PATCH 5/9] Use true ASCII attributes in dataframes --- .github/workflows/ci.yml | 1 - .github/workflows/cpp-ci.yml | 1 + apis/python/src/tiledbsoma/soma_dataframe.py | 45 +++---------------- .../src/tiledbsoma/soma_indexed_dataframe.py | 6 +-- apis/python/src/tiledbsoma/util_arrow.py | 38 ++++------------ apis/python/src/tiledbsoma/util_pandas.py | 13 ------ apis/python/tests/test_type_system.py | 2 +- 7 files changed, 16 insertions(+), 90 deletions(-) delete mode 100644 apis/python/src/tiledbsoma/util_pandas.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8b0cb87f1d..3d9e23d64a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,6 @@ jobs: - runs-on: ubuntu-22.04 cc: gcc-11 cxx: g++-11 - # Pending https://github.com/actions/runner-images/issues/6350 - runs-on: macos-11 cc: gcc-11 cxx: g++-11 diff --git a/.github/workflows/cpp-ci.yml b/.github/workflows/cpp-ci.yml index e22f826532..0b0345e0b1 100644 --- a/.github/workflows/cpp-ci.yml +++ b/.github/workflows/cpp-ci.yml @@ -18,6 +18,7 @@ jobs: cc: gcc-11 cxx: g++-11 # Pending https://github.com/actions/runner-images/issues/6350 + # - runs-on: macos-12 - runs-on: macos-11 cc: gcc-11 cxx: g++-11 diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py index 830f2b2a30..e3793056ea 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_dataframe.py @@ -5,7 +5,7 @@ import pyarrow as pa import tiledb -from . import util, util_arrow, util_pandas, util_tiledb +from . import util, util_arrow, util_tiledb from .logging import log_io from .soma_collection import SOMACollectionBase from .tiledb_array import TileDBArray @@ -205,11 +205,7 @@ def read( iterator = query.df[ids] for table in iterator: - # XXX COMMENT MORE - # This is the 'decode on read' part of our logic; in dim_select we have the - # 'encode on write' part. - # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99. - yield util_arrow.ascii_to_unicode_pyarrow_readback(table) + yield table def read_all( self, @@ -343,11 +339,6 @@ def read_as_pandas( for df in iterator: - # This is the 'decode on read' part of our logic; in dim_select we have the 'encode on - # write' part. - # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99. - df = util_pandas.ascii_to_unicode_pandas_readback(df) - if id_column_name is not None: df.reset_index(inplace=True) df.set_index(id_column_name, inplace=True) @@ -428,39 +419,13 @@ def write_from_pandas( dataframe.set_index(ROWID, inplace=True) - # ISSUE: - # - # TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB - # QueryCondition API. While this needs to be addressed -- global collaborators will want to - # write annotation-dataframe values in Unicode -- until then, to make obs/var data possible - # to query, we need to store these as ASCII. - # - # This is (besides collation) a storage-level issue not a presentation-level issue: At write - # time, this works — "α,β,γ" stores as "\xce\xb1,\xce\xb2,\xce\xb3"; at read time: since - # SOMA is an API: utf8-decode those strings when a query is done & give the user back - # "α,β,γ". - # - # CONTEXT: - # https://github.com/single-cell-data/TileDB-SOMA/issues/99 - # https://github.com/single-cell-data/TileDB-SOMA/pull/101 - # https://github.com/single-cell-data/TileDB-SOMA/issues/106 - # https://github.com/single-cell-data/TileDB-SOMA/pull/117 - # - # IMPLEMENTATION: - # Python types -- float, string, what have you -- appear as dtype('O') which is not useful. - # Also, ``tiledb.from_pandas`` has ``column_types`` but that _forces_ things to string to a - # particular if they shouldn't be. - # - # Instead, we use ``dataframe.convert_dtypes`` to get a little jump on what ``tiledb.from_pandas`` - # is going to be doing anyway, namely, type-inferring to see what is going to be a string. - # - # TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this. + # Force ASCII storage if string, in order to make obs/var columns queryable. + # TODO: when UTF-8 attributes are fully supported we can remove this. column_types = {} for column_name in dataframe.keys(): dfc = dataframe[column_name] if len(dfc) > 0 and type(dfc[0]) == str: - # Force ASCII storage if string, in order to make obs/var columns queryable. - column_types[column_name] = np.dtype("S") + column_types[column_name] = "ascii" tiledb.from_pandas( uri=self.uri, diff --git a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py index 1bb13f4f37..5e7fc84dd1 100644 --- a/apis/python/src/tiledbsoma/soma_indexed_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_indexed_dataframe.py @@ -250,11 +250,7 @@ def read( iterator = query.df[ids] for table in iterator: - # XXX COMMENT MORE - # This is the 'decode on read' part of our logic; in dim_select we have the - # 'encode on write' part. - # Context: # https://github.com/single-cell-data/TileDB-SOMA/issues/99. - yield util_arrow.ascii_to_unicode_pyarrow_readback(table) + yield table def read_all( self, diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py index cafba6013a..8ceb7d4627 100644 --- a/apis/python/src/tiledbsoma/util_arrow.py +++ b/apis/python/src/tiledbsoma/util_arrow.py @@ -23,9 +23,7 @@ # # IMPORTANT: ALL non-primitive types supported by TileDB must be in this table. # - pa.string(): np.dtype( - "S" - ), # XXX TODO: temporary work-around until UTF8 support is native. GH #338. + pa.string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338. pa.binary(): np.dtype("S"), pa.timestamp("s"): "datetime64[s]", pa.timestamp("ms"): "datetime64[ms]", @@ -39,7 +37,7 @@ } -def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]: +def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]: """ Given an Arrow type, return the corresponding TileDB type as a Numpy dtype. Building block for Arrow-to-TileDB schema translation. @@ -61,7 +59,10 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]: arrow_type = ARROW_TO_TDB[t] if isinstance(arrow_type, Exception): raise arrow_type - return np.dtype(arrow_type) + if arrow_type == "ascii": + return arrow_type + else: + return np.dtype(arrow_type) if not pa.types.is_primitive(t): raise TypeError(f"Type {str(t)} - unsupported type") @@ -83,11 +84,11 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]: raise TypeError("Unsupported Arrow type") from exc -def get_arrow_type_from_tiledb_dtype(tiledb_dtype: np.dtype) -> pa.DataType: +def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType: """ TODO: COMMENT """ - if tiledb_dtype.name == "bytes": + if tiledb_dtype == "ascii" or tiledb_dtype.name == "bytes": # XXX TODO: temporary work-around until UTF8 support is native. GH #338. return pa.string() else: @@ -119,26 +120,3 @@ def get_arrow_schema_from_tiledb_uri( arrow_schema_dict[name] = get_arrow_type_from_tiledb_dtype(attr.dtype) return pa.schema(arrow_schema_dict) - - -def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table: - """ - Implements the 'decode on read' part of our ASCII/Unicode logic - """ - # TODO: COMMENT/LINK HEAVILY - names = [ofield.name for ofield in table.schema] - new_fields = [] - for name in names: - old_field = table[name] - # Preferred syntax: - # if len(old_field) > 0 and pa.types.is_large_binary(old_field[0]): - # but: - # AttributeError: 'pyarrow.lib.UInt64Scalar' object has no attribute 'id' - if len(old_field) > 0 and isinstance(old_field[0], pa.LargeBinaryScalar): - nfield = pa.array( - [element.as_py().decode("utf-8") for element in old_field] - ) - new_fields.append(nfield) - else: - new_fields.append(old_field) - return pa.Table.from_arrays(new_fields, names=names) diff --git a/apis/python/src/tiledbsoma/util_pandas.py b/apis/python/src/tiledbsoma/util_pandas.py deleted file mode 100644 index b5caf01255..0000000000 --- a/apis/python/src/tiledbsoma/util_pandas.py +++ /dev/null @@ -1,13 +0,0 @@ -import pandas as pd - - -def ascii_to_unicode_pandas_readback(df: pd.DataFrame) -> pd.DataFrame: - """ - Implements the 'decode on read' part of our ASCII/Unicode logic. - """ - # TODO: COMMENT/LINK HEAVILY - for k in df: - dfk = df[k] - if len(dfk) > 0 and type(dfk.iat[0]) == bytes: - df[k] = dfk.map(lambda e: e.decode()) - return df diff --git a/apis/python/tests/test_type_system.py b/apis/python/tests/test_type_system.py index 74af8fb5bb..4e0d515be4 100644 --- a/apis/python/tests/test_type_system.py +++ b/apis/python/tests/test_type_system.py @@ -65,7 +65,7 @@ def test_supported_types_supported(arrow_type): pytest.xfail("Awaiting UTF-8 support - see issue #338") tdb_dtype = tiledb_type_from_arrow_type(arrow_type) - assert isinstance(tdb_dtype, np.dtype) + assert isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii" rt_arrow_type = get_arrow_type_from_tiledb_dtype(tdb_dtype) assert isinstance(rt_arrow_type, pa.DataType) assert arrow_type == rt_arrow_type From 324654b2cf9d2b186ab79ee69fcc1ae1ecb0bf22 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 5 Oct 2022 15:12:17 -0400 Subject: [PATCH 6/9] iterate on string vs large_string --- apis/python/src/tiledbsoma/util_arrow.py | 4 +++- apis/python/tests/test_soma_collection.py | 4 ++-- apis/python/tests/test_soma_dataframe.py | 4 ++-- apis/python/tests/test_soma_experiment_basic.py | 4 ++-- apis/python/tests/test_soma_indexed_dataframe.py | 10 +++++++++- apis/python/tests/test_soma_metadata.py | 2 +- apis/python/tests/test_type_system.py | 2 ++ 7 files changed, 21 insertions(+), 9 deletions(-) diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py index 8ceb7d4627..d48420de6e 100644 --- a/apis/python/src/tiledbsoma/util_arrow.py +++ b/apis/python/src/tiledbsoma/util_arrow.py @@ -24,7 +24,9 @@ # IMPORTANT: ALL non-primitive types supported by TileDB must be in this table. # pa.string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338. + pa.large_string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338. pa.binary(): np.dtype("S"), + pa.large_binary(): np.dtype("S"), pa.timestamp("s"): "datetime64[s]", pa.timestamp("ms"): "datetime64[ms]", pa.timestamp("us"): "datetime64[us]", @@ -90,7 +92,7 @@ def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.D """ if tiledb_dtype == "ascii" or tiledb_dtype.name == "bytes": # XXX TODO: temporary work-around until UTF8 support is native. GH #338. - return pa.string() + return pa.large_string() else: return pa.from_numpy_dtype(tiledb_dtype) diff --git a/apis/python/tests/test_soma_collection.py b/apis/python/tests/test_soma_collection.py index b022e5d5b3..6cd92bf502 100644 --- a/apis/python/tests/test_soma_collection.py +++ b/apis/python/tests/test_soma_collection.py @@ -15,7 +15,7 @@ def create_and_populate_dataframe(dataframe: soma.SOMADataFrame) -> None: [ ("foo", pa.int32()), ("bar", pa.float64()), - ("baz", pa.string()), + ("baz", pa.large_string()), ] ) @@ -108,7 +108,7 @@ def soma_object(request, tmp_path): elif class_name == "SOMADataFrame": so = soma.SOMADataFrame(uri=uri) - so.create(pa.schema([("A", pa.int32()), ("B", pa.string())])) + so.create(pa.schema([("A", pa.int32()), ("B", pa.large_string())])) elif class_name == "SOMAIndexedDataFrame": so = soma.SOMAIndexedDataFrame(uri=uri) diff --git a/apis/python/tests/test_soma_dataframe.py b/apis/python/tests/test_soma_dataframe.py index 1ea1706e9a..b09b3a71a8 100644 --- a/apis/python/tests/test_soma_dataframe.py +++ b/apis/python/tests/test_soma_dataframe.py @@ -13,7 +13,7 @@ def test_soma_dataframe_non_indexed(tmp_path): [ ("foo", pa.int32()), ("bar", pa.float64()), - ("baz", pa.string()), + ("baz", pa.large_string()), ] ) sdf.create(schema=asch) @@ -120,7 +120,7 @@ def simple_soma_data_frame(tmp_path): ("soma_rowid", pa.uint64()), ("A", pa.int64()), ("B", pa.float64()), - ("C", pa.string()), + ("C", pa.large_string()), ] ) sdf = t.SOMADataFrame(uri=tmp_path.as_posix()) diff --git a/apis/python/tests/test_soma_experiment_basic.py b/apis/python/tests/test_soma_experiment_basic.py index 622f257242..e5ee78d903 100644 --- a/apis/python/tests/test_soma_experiment_basic.py +++ b/apis/python/tests/test_soma_experiment_basic.py @@ -14,7 +14,7 @@ def create_and_populate_obs(obs: soma.SOMADataFrame) -> soma.SOMADataFrame: [ ("foo", pa.int32()), ("bar", pa.float64()), - ("baz", pa.string()), + ("baz", pa.large_string()), ] ) @@ -37,7 +37,7 @@ def create_and_populate_var(var: soma.SOMADataFrame) -> soma.SOMADataFrame: var_arrow_schema = pa.schema( [ - ("quux", pa.string()), + ("quux", pa.large_string()), ("xyzzy", pa.float64()), ] ) diff --git a/apis/python/tests/test_soma_indexed_dataframe.py b/apis/python/tests/test_soma_indexed_dataframe.py index c9766f925e..1ad87e8d48 100644 --- a/apis/python/tests/test_soma_indexed_dataframe.py +++ b/apis/python/tests/test_soma_indexed_dataframe.py @@ -21,6 +21,14 @@ def _schema(): def test_soma_indexed_dataframe(tmp_path, arrow_schema): sdf = t.SOMAIndexedDataFrame(uri=tmp_path.as_posix()) + asch = pa.schema( + [ + ("foo", pa.int32()), + ("bar", pa.float64()), + ("baz", pa.large_string()), + ] + ) + # Create asch = arrow_schema() sdf.create(schema=asch, index_column_names=["foo"]) @@ -72,7 +80,7 @@ def simple_soma_indexed_data_frame(tmp_path): ("index", pa.uint64()), ("A", pa.int64()), ("B", pa.float64()), - ("C", pa.string()), + ("C", pa.large_string()), ] ) index_column_names = ["index"] diff --git a/apis/python/tests/test_soma_metadata.py b/apis/python/tests/test_soma_metadata.py index da8b1174b1..757aacf498 100644 --- a/apis/python/tests/test_soma_metadata.py +++ b/apis/python/tests/test_soma_metadata.py @@ -33,7 +33,7 @@ def soma_object(request, tmp_path): elif class_name == "SOMADataFrame": so = soma.SOMADataFrame(uri=uri) - so.create(pa.schema([("A", pa.int32()), ("B", pa.string())])) + so.create(pa.schema([("A", pa.int32()), ("B", pa.large_string())])) elif class_name == "SOMAIndexedDataFrame": so = soma.SOMAIndexedDataFrame(uri=uri) diff --git a/apis/python/tests/test_type_system.py b/apis/python/tests/test_type_system.py index 4e0d515be4..1e18df0c67 100644 --- a/apis/python/tests/test_type_system.py +++ b/apis/python/tests/test_type_system.py @@ -25,7 +25,9 @@ pa.timestamp("us"), pa.timestamp("ns"), pa.string(), + pa.large_string(), pa.binary(), + pa.large_binary(), ] From 8f2c6de7449bbc7aad812452693ee6a5d44531fd Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 10 Oct 2022 20:17:58 -0400 Subject: [PATCH 7/9] Fix ascii/binary issues in TileDB->Arrow and TileDB->Pandas->Arrows test cases --- apis/python/src/tiledbsoma/soma_dataframe.py | 2 + apis/python/src/tiledbsoma/util_arrow.py | 22 ++++++--- apis/python/tests/test_soma_dataframe.py | 34 ++++++++++--- apis/python/tests/test_type_system.py | 52 +++++++++++++++----- 4 files changed, 83 insertions(+), 27 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py index e3793056ea..fb10998d08 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_dataframe.py @@ -426,6 +426,8 @@ def write_from_pandas( dfc = dataframe[column_name] if len(dfc) > 0 and type(dfc[0]) == str: column_types[column_name] = "ascii" + if len(dfc) > 0 and type(dfc[0]) == bytes: + column_types[column_name] = "bytes" tiledb.from_pandas( uri=self.uri, diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py index d48420de6e..e01cc70a88 100644 --- a/apis/python/src/tiledbsoma/util_arrow.py +++ b/apis/python/src/tiledbsoma/util_arrow.py @@ -9,13 +9,17 @@ of representing full type semantics, and correctly performing a round trip conversion (eg, T == to_arrow(to_tiledb(T))) -Most primitive types are simple - eg, uint8. Of particular challenge +Most primitive types are simple -- e.g., uint8. Of particular challenge are datetime/timestamps as TileDB has no distinction between a "datetime" and a "timedelta". The best Arrow match is TimestampType, as long as that TimestampType instance does NOT have a timezone set. Because of our round-trip requirement, all other Arrow temporal types are unsupported (even though they are just int64 under the covers). + +We auto-promote Arrow's string and binary to large_string and large_binary, +respectively, as this is what TileDB stores -- a sequence of bytes preceded +by a 64-bit (not 32-bit) length int. """ ARROW_TO_TDB = { # Dict of types unsupported by to_pandas_dtype, which require overrides. @@ -25,8 +29,8 @@ # pa.string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338. pa.large_string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338. - pa.binary(): np.dtype("S"), - pa.large_binary(): np.dtype("S"), + pa.binary(): "bytes", # XXX TODO: temporary work-around until UTF8 support is native. GH #338. + pa.large_binary(): "bytes", # XXX TODO: temporary work-around until UTF8 support is native. GH #338. pa.timestamp("s"): "datetime64[s]", pa.timestamp("ms"): "datetime64[ms]", pa.timestamp("us"): "datetime64[us]", @@ -63,8 +67,9 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]: raise arrow_type if arrow_type == "ascii": return arrow_type - else: - return np.dtype(arrow_type) + if arrow_type == "bytes": + return arrow_type # np.int8() + return np.dtype(arrow_type) if not pa.types.is_primitive(t): raise TypeError(f"Type {str(t)} - unsupported type") @@ -90,11 +95,12 @@ def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.D """ TODO: COMMENT """ - if tiledb_dtype == "ascii" or tiledb_dtype.name == "bytes": + if tiledb_dtype == "bytes": + return pa.large_binary() + if isinstance(tiledb_dtype, str) and tiledb_dtype == "ascii": # XXX TODO: temporary work-around until UTF8 support is native. GH #338. return pa.large_string() - else: - return pa.from_numpy_dtype(tiledb_dtype) + return pa.from_numpy_dtype(tiledb_dtype) def get_arrow_schema_from_tiledb_uri( diff --git a/apis/python/tests/test_soma_dataframe.py b/apis/python/tests/test_soma_dataframe.py index b09b3a71a8..fee48d3f7f 100644 --- a/apis/python/tests/test_soma_dataframe.py +++ b/apis/python/tests/test_soma_dataframe.py @@ -174,37 +174,57 @@ def test_SOMADataFrame_read_column_names(simple_soma_data_frame, ids, col_names) schema, sdf, n_data = simple_soma_data_frame assert sdf.exists() - def _check_tbl(tbl, col_names, ids): + def _check_tbl(tbl, col_names, ids, *, demote): assert tbl.num_columns == ( len(schema.names) if col_names is None else len(col_names) ) assert tbl.num_rows == (n_data if ids is None else len(ids)) - assert tbl.schema == pa.schema( - [ - schema.field(f) - for f in (col_names if col_names is not None else schema.names) - ] - ) + if demote: + assert tbl.schema == pa.schema( + [ + pa.field(schema.field(f).name, pa.string()) + if schema.field(f).type == pa.large_string() + else schema.field(f) + for f in (col_names if col_names is not None else schema.names) + ] + ) + else: + assert tbl.schema == pa.schema( + [ + schema.field(f) + for f in (col_names if col_names is not None else schema.names) + ] + ) + + # TileDB ASCII -> Arrow large_string _check_tbl( sdf.read_all(ids=ids, column_names=col_names), col_names, ids, + demote=False, ) + _check_tbl( sdf.read_all(column_names=col_names), col_names, None, + demote=False, ) + + # TileDB ASCII -> Pandas string -> Arrow string (not large_string) _check_tbl( pa.Table.from_pandas( pd.concat(sdf.read_as_pandas(ids=ids, column_names=col_names)) ), col_names, ids, + demote=True, ) + _check_tbl( pa.Table.from_pandas(sdf.read_as_pandas_all(column_names=col_names)), col_names, None, + demote=True, ) diff --git a/apis/python/tests/test_type_system.py b/apis/python/tests/test_type_system.py index 1e18df0c67..62988001ff 100644 --- a/apis/python/tests/test_type_system.py +++ b/apis/python/tests/test_type_system.py @@ -24,12 +24,20 @@ pa.timestamp("ms"), pa.timestamp("us"), pa.timestamp("ns"), - pa.string(), + # We use Arrow's large_string for ASCII and, ultimately, for Unicode as well + # https://github.com/single-cell-data/TileDB-SOMA/issues/99 + # https://github.com/single-cell-data/TileDB-SOMA/pull/359 + # https://github.com/single-cell-data/TileDB-SOMA/issues/274 pa.large_string(), - pa.binary(), pa.large_binary(), ] +"""Arrow types we expect to auto-promote""" +PROMOTED_ARROW_TYPES = [ + (pa.string(), pa.large_string()), + # XXX (pa.binary(), pa.large_binary()), +] + """Arrow types we expect to fail""" UNSUPPORTED_ARROW_TYPES = [ @@ -46,10 +54,13 @@ pa.duration("us"), pa.duration("ns"), pa.month_day_nano_interval(), + # We use Arrow's large_string for ASCII and, ultimately, for Unicode as well + # https://github.com/single-cell-data/TileDB-SOMA/issues/99 + # https://github.com/single-cell-data/TileDB-SOMA/pull/359 + # https://github.com/single-cell-data/TileDB-SOMA/issues/274 + pa.string(), pa.binary(), pa.binary(10), - pa.large_binary(), - pa.large_string(), pa.decimal128(1), pa.decimal128(38), pa.list_(pa.int8()), @@ -61,20 +72,37 @@ @pytest.mark.parametrize("arrow_type", SUPPORTED_ARROW_TYPES) -def test_supported_types_supported(arrow_type): +def test_arrow_types_supported(arrow_type): """Verify round-trip conversion of types""" - if pa.types.is_binary(arrow_type): - pytest.xfail("Awaiting UTF-8 support - see issue #338") + # if pa.types.is_binary(arrow_type): + # pytest.xfail("Awaiting UTF-8 support - see issue #274") tdb_dtype = tiledb_type_from_arrow_type(arrow_type) - assert isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii" - rt_arrow_type = get_arrow_type_from_tiledb_dtype(tdb_dtype) - assert isinstance(rt_arrow_type, pa.DataType) - assert arrow_type == rt_arrow_type + assert ( + isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii" or tdb_dtype == "bytes" + ) + arrow_rt_type = get_arrow_type_from_tiledb_dtype(tdb_dtype) + assert isinstance(arrow_rt_type, pa.DataType) + assert arrow_type == arrow_rt_type + + +@pytest.mark.parametrize("arrow_from_to_pair", PROMOTED_ARROW_TYPES) +def test_arrow_types_promoted(arrow_from_to_pair): + """Verify round-trip conversion of types""" + arrow_from_type = arrow_from_to_pair[0] + arrow_to_type = arrow_from_to_pair[1] + + tdb_dtype = tiledb_type_from_arrow_type(arrow_from_type) + assert ( + isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii" or tdb_dtype == "bytes" + ) + arrow_rt_type = get_arrow_type_from_tiledb_dtype(tdb_dtype) + assert isinstance(arrow_rt_type, pa.DataType) + assert arrow_to_type == arrow_rt_type @pytest.mark.parametrize("arrow_type", UNSUPPORTED_ARROW_TYPES) -def test_supported_types_unsupported(arrow_type): +def test_arrow_types_unsupported(arrow_type): """Verify correct error for unsupported types""" with pytest.raises(TypeError): tiledb_type_from_arrow_type(arrow_type, match=r".*unsupported type.*") From 07327904ff7d1aa54d3c209c1315ce4090c6a442 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 11 Oct 2022 17:55:39 +0000 Subject: [PATCH 8/9] more --- libtiledbsoma/test/test_query_condition.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libtiledbsoma/test/test_query_condition.py b/libtiledbsoma/test/test_query_condition.py index 57f4788085..1e3f8e09eb 100755 --- a/libtiledbsoma/test/test_query_condition.py +++ b/libtiledbsoma/test/test_query_condition.py @@ -54,7 +54,7 @@ def test_query_condition_int(): def test_query_condition_string(): uri = os.path.join(SOMA_URI, "obs") - condition = "louvain == b'NK cells'" + condition = 'louvain == "NK cells"' pandas = pandas_query(uri, condition) @@ -90,7 +90,7 @@ def test_query_condition_and(): def test_query_condition_and_or(): uri = os.path.join(SOMA_URI, "obs") - condition = "(percent_mito > 0.02 and n_genes > 700) or (percent_mito < 0.015 and louvain == b'B cells')" + condition = "(percent_mito > 0.02 and n_genes > 700) or (percent_mito < 0.015 and louvain == 'B cells')" pandas = pandas_query(uri, condition) From 0905781b1c017d38f4818153448010b359ac1db5 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 11 Oct 2022 18:42:59 +0000 Subject: [PATCH 9/9] fix unit tests --- apis/python/src/tiledbsoma/soma_dataframe.py | 2 +- apis/python/src/tiledbsoma/util_arrow.py | 14 ++++++++++++++ libtiledbsoma/test/test_query_condition.py | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.py b/apis/python/src/tiledbsoma/soma_dataframe.py index fb10998d08..e046fbcf47 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.py +++ b/apis/python/src/tiledbsoma/soma_dataframe.py @@ -271,7 +271,7 @@ def write(self, values: pa.Table) -> None: if name != ROWID: attr_cols_map[name] = np.asarray( values.column(name).to_pandas( - types_mapper=util_arrow.tiledb_type_from_arrow_type, + types_mapper=util_arrow.tiledb_type_from_arrow_type_for_write, ) ) diff --git a/apis/python/src/tiledbsoma/util_arrow.py b/apis/python/src/tiledbsoma/util_arrow.py index e01cc70a88..a67de8118c 100644 --- a/apis/python/src/tiledbsoma/util_arrow.py +++ b/apis/python/src/tiledbsoma/util_arrow.py @@ -43,6 +43,20 @@ } +def tiledb_type_from_arrow_type_for_write(t: pa.DataType) -> Union[type, np.dtype, str]: + """ + Same as ``tiledb_type_from_arrow_type`` except that this is used for writing to a TileDB array. + The syntax of TileDB-Py is such that when we want to create a schema with an ASCII column, + we use the string ``"ascii"`` in place of a dtype. But when we want to write data, we need to + use a dtype of ``np.str``, which is now deprecated in favor of simply ``str``. + """ + retval = tiledb_type_from_arrow_type(t) + if retval == "ascii": + return str + else: + return retval + + def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]: """ Given an Arrow type, return the corresponding TileDB type as a Numpy dtype. diff --git a/libtiledbsoma/test/test_query_condition.py b/libtiledbsoma/test/test_query_condition.py index 1e3f8e09eb..77b6977d49 100755 --- a/libtiledbsoma/test/test_query_condition.py +++ b/libtiledbsoma/test/test_query_condition.py @@ -90,7 +90,7 @@ def test_query_condition_and(): def test_query_condition_and_or(): uri = os.path.join(SOMA_URI, "obs") - condition = "(percent_mito > 0.02 and n_genes > 700) or (percent_mito < 0.015 and louvain == 'B cells')" + condition = '(percent_mito > 0.02 and n_genes > 700) or (percent_mito < 0.015 and louvain == "B cells")' pandas = pandas_query(uri, condition)