Skip to content

Commit e5d5e8d

Browse files
committed
Use true ASCII attributes in dataframes
1 parent abaf128 commit e5d5e8d

File tree

7 files changed

+16
-90
lines changed

7 files changed

+16
-90
lines changed

.github/workflows/ci.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ jobs:
2020
- runs-on: ubuntu-22.04
2121
cc: gcc-11
2222
cxx: g++-11
23-
# Pending https://github.com/actions/runner-images/issues/6350
2423
- runs-on: macos-11
2524
cc: gcc-11
2625
cxx: g++-11

.github/workflows/cpp-ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ jobs:
1818
cc: gcc-11
1919
cxx: g++-11
2020
# Pending https://github.com/actions/runner-images/issues/6350
21+
# - runs-on: macos-12
2122
- runs-on: macos-11
2223
cc: gcc-11
2324
cxx: g++-11

apis/python/src/tiledbsoma/soma_dataframe.py

Lines changed: 5 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import pyarrow as pa
66
import tiledb
77

8-
from . import util, util_arrow, util_pandas, util_tiledb
8+
from . import util, util_arrow, util_tiledb
99
from .logging import log_io
1010
from .soma_collection import SOMACollectionBase
1111
from .tiledb_array import TileDBArray
@@ -218,11 +218,7 @@ def read(
218218
iterator = query.df[ids]
219219

220220
for table in iterator:
221-
# XXX COMMENT MORE
222-
# This is the 'decode on read' part of our logic; in dim_select we have the
223-
# 'encode on write' part.
224-
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
225-
yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
221+
yield table
226222

227223
def read_all(
228224
self,
@@ -356,11 +352,6 @@ def read_as_pandas(
356352

357353
for df in iterator:
358354

359-
# This is the 'decode on read' part of our logic; in dim_select we have the 'encode on
360-
# write' part.
361-
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
362-
df = util_pandas.ascii_to_unicode_pandas_readback(df)
363-
364355
if id_column_name is not None:
365356
df.reset_index(inplace=True)
366357
df.set_index(id_column_name, inplace=True)
@@ -441,39 +432,13 @@ def write_from_pandas(
441432

442433
dataframe.set_index(ROWID, inplace=True)
443434

444-
# ISSUE:
445-
#
446-
# TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB
447-
# QueryCondition API. While this needs to be addressed -- global collaborators will want to
448-
# write annotation-dataframe values in Unicode -- until then, to make obs/var data possible
449-
# to query, we need to store these as ASCII.
450-
#
451-
# This is (besides collation) a storage-level issue not a presentation-level issue: At write
452-
# time, this works — "α,β,γ" stores as "\xce\xb1,\xce\xb2,\xce\xb3"; at read time: since
453-
# SOMA is an API: utf8-decode those strings when a query is done & give the user back
454-
# "α,β,γ".
455-
#
456-
# CONTEXT:
457-
# https://github.com/single-cell-data/TileDB-SOMA/issues/99
458-
# https://github.com/single-cell-data/TileDB-SOMA/pull/101
459-
# https://github.com/single-cell-data/TileDB-SOMA/issues/106
460-
# https://github.com/single-cell-data/TileDB-SOMA/pull/117
461-
#
462-
# IMPLEMENTATION:
463-
# Python types -- float, string, what have you -- appear as dtype('O') which is not useful.
464-
# Also, ``tiledb.from_pandas`` has ``column_types`` but that _forces_ columns to a
465-
# particular type even if they shouldn't be.
466-
#
467-
# Instead, we use ``dataframe.convert_dtypes`` to get a little jump on what ``tiledb.from_pandas``
468-
# is going to be doing anyway, namely, type-inferring to see what is going to be a string.
469-
#
470-
# TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this.
435+
# Force ASCII storage if string, in order to make obs/var columns queryable.
436+
# TODO: when UTF-8 attributes are fully supported we can remove this.
471437
column_types = {}
472438
for column_name in dataframe.keys():
473439
dfc = dataframe[column_name]
474440
if len(dfc) > 0 and type(dfc[0]) == str:
475-
# Force ASCII storage if string, in order to make obs/var columns queryable.
476-
column_types[column_name] = np.dtype("S")
441+
column_types[column_name] = "ascii"
477442

478443
tiledb.from_pandas(
479444
uri=self.uri,

apis/python/src/tiledbsoma/soma_indexed_dataframe.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -259,11 +259,7 @@ def read(
259259
iterator = query.df[ids]
260260

261261
for table in iterator:
262-
# XXX COMMENT MORE
263-
# This is the 'decode on read' part of our logic; in dim_select we have the
264-
# 'encode on write' part.
265-
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
266-
yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
262+
yield table
267263

268264
def read_all(
269265
self,

apis/python/src/tiledbsoma/util_arrow.py

Lines changed: 8 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,7 @@
2323
#
2424
# IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
2525
#
26-
pa.string(): np.dtype(
27-
"S"
28-
), # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
26+
pa.string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
2927
pa.binary(): np.dtype("S"),
3028
pa.timestamp("s"): "datetime64[s]",
3129
pa.timestamp("ms"): "datetime64[ms]",
@@ -39,7 +37,7 @@
3937
}
4038

4139

42-
def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
40+
def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
4341
"""
4442
Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
4543
Building block for Arrow-to-TileDB schema translation.
@@ -61,7 +59,10 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
6159
arrow_type = ARROW_TO_TDB[t]
6260
if isinstance(arrow_type, Exception):
6361
raise arrow_type
64-
return np.dtype(arrow_type)
62+
if arrow_type == "ascii":
63+
return arrow_type
64+
else:
65+
return np.dtype(arrow_type)
6566

6667
if not pa.types.is_primitive(t):
6768
raise TypeError(f"Type {str(t)} - unsupported type")
@@ -83,11 +84,11 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
8384
raise TypeError("Unsupported Arrow type") from exc
8485

8586

86-
def get_arrow_type_from_tiledb_dtype(tiledb_dtype: np.dtype) -> pa.DataType:
87+
def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType:
8788
"""
8889
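Given a TileDB type (a NumPy dtype or the string "ascii"), return the corresponding Arrow type; "ascii" and bytes both map to pa.string().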
8990
"""
90-
if tiledb_dtype.name == "bytes":
91+
if tiledb_dtype == "ascii" or tiledb_dtype.name == "bytes":
9192
# XXX TODO: temporary work-around until UTF8 support is native. GH #338.
9293
return pa.string()
9394
else:
@@ -119,26 +120,3 @@ def get_arrow_schema_from_tiledb_uri(
119120
arrow_schema_dict[name] = get_arrow_type_from_tiledb_dtype(attr.dtype)
120121

121122
return pa.schema(arrow_schema_dict)
122-
123-
124-
def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table:
125-
"""
126-
Implements the 'decode on read' part of our ASCII/Unicode logic
127-
"""
128-
# TODO: COMMENT/LINK HEAVILY
129-
names = [ofield.name for ofield in table.schema]
130-
new_fields = []
131-
for name in names:
132-
old_field = table[name]
133-
# Preferred syntax:
134-
# if len(old_field) > 0 and pa.types.is_large_binary(old_field[0]):
135-
# but:
136-
# AttributeError: 'pyarrow.lib.UInt64Scalar' object has no attribute 'id'
137-
if len(old_field) > 0 and isinstance(old_field[0], pa.LargeBinaryScalar):
138-
nfield = pa.array(
139-
[element.as_py().decode("utf-8") for element in old_field]
140-
)
141-
new_fields.append(nfield)
142-
else:
143-
new_fields.append(old_field)
144-
return pa.Table.from_arrays(new_fields, names=names)

apis/python/src/tiledbsoma/util_pandas.py

Lines changed: 0 additions & 13 deletions
This file was deleted.

apis/python/tests/test_type_system.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def test_supported_types_supported(arrow_type):
6565
pytest.xfail("Awaiting UTF-8 support - see issue #338")
6666

6767
tdb_dtype = tiledb_type_from_arrow_type(arrow_type)
68-
assert isinstance(tdb_dtype, np.dtype)
68+
assert isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii"
6969
rt_arrow_type = get_arrow_type_from_tiledb_dtype(tdb_dtype)
7070
assert isinstance(rt_arrow_type, pa.DataType)
7171
assert arrow_type == rt_arrow_type

0 commit comments

Comments
 (0)