Skip to content

Commit 0661600

Browse files
committed
Use true ASCII attributes in dataframes
1 parent 649081f commit 0661600

File tree

5 files changed

+14
-49
lines changed

5 files changed

+14
-49
lines changed

apis/python/src/tiledbsoma/soma_dataframe.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import pyarrow as pa
66
import tiledb
77

8-
from . import util, util_arrow, util_pandas, util_tiledb
8+
from . import util, util_arrow, util_tiledb
99
from .logging import log_io
1010
from .soma_collection import SOMACollectionBase
1111
from .tiledb_array import TileDBArray
@@ -226,7 +226,7 @@ def read(
226226
# Also: don't materialize these on read
227227
# TODO: get the arrow syntax for drop
228228
# df.drop(ROWID, axis=1)
229-
yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
229+
yield table
230230

231231
def read_all(
232232
self,
@@ -363,7 +363,7 @@ def read_as_pandas(
363363
# This is the 'decode on read' part of our logic; in dim_select we have the 'encode on
364364
# write' part.
365365
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
366-
df = util_pandas.ascii_to_unicode_pandas_readback(df)
366+
df = df
367367

368368
if id_column_name is not None:
369369
df.reset_index(inplace=True)
@@ -477,7 +477,7 @@ def write_from_pandas(
477477
dfc = dataframe[column_name]
478478
if len(dfc) > 0 and type(dfc[0]) == str:
479479
# Force ASCII storage if string, in order to make obs/var columns queryable.
480-
column_types[column_name] = np.dtype("S")
480+
column_types[column_name] = "ascii"
481481

482482
tiledb.from_pandas(
483483
uri=self.uri,

apis/python/src/tiledbsoma/soma_indexed_dataframe.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -259,11 +259,7 @@ def read(
259259
iterator = query.df[ids]
260260

261261
for table in iterator:
262-
# XXX COMMENT MORE
263-
# This is the 'decode on read' part of our logic; in dim_select we have the
264-
# 'encode on write' part.
265-
# Context: # https://github.com/single-cell-data/TileDB-SOMA/issues/99.
266-
yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
262+
yield table
267263

268264
def read_all(
269265
self,

apis/python/src/tiledbsoma/util_arrow.py

Lines changed: 8 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,7 @@
2323
#
2424
# IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
2525
#
26-
pa.string(): np.dtype(
27-
"S"
28-
), # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
26+
pa.string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
2927
pa.binary(): np.dtype("S"),
3028
pa.timestamp("s"): "datetime64[s]",
3129
pa.timestamp("ms"): "datetime64[ms]",
@@ -39,7 +37,7 @@
3937
}
4038

4139

42-
def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
40+
def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
4341
"""
4442
Given an Arrow type, return the corresponding TileDB type: a Numpy dtype, or the string "ascii" for Arrow strings.
4543
Building block for Arrow-to-TileDB schema translation.
@@ -61,7 +59,10 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
6159
arrow_type = ARROW_TO_TDB[t]
6260
if isinstance(arrow_type, Exception):
6361
raise arrow_type
64-
return np.dtype(arrow_type)
62+
if arrow_type == "ascii":
63+
return arrow_type
64+
else:
65+
return np.dtype(arrow_type)
6566

6667
if not pa.types.is_primitive(t):
6768
raise TypeError(f"Type {str(t)} - unsupported type")
@@ -83,11 +84,11 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
8384
raise TypeError("Unsupported Arrow type") from exc
8485

8586

86-
def get_arrow_type_from_tiledb_dtype(tiledb_dtype: np.dtype) -> pa.DataType:
87+
def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType:
8788
"""
8889
TODO: COMMENT
8990
"""
90-
if tiledb_dtype.name == "bytes":
91+
if tiledb_dtype == "ascii" or tiledb_dtype.name == "bytes":
9192
# XXX TODO: temporary work-around until UTF8 support is native. GH #338.
9293
return pa.string()
9394
else:
@@ -119,22 +120,3 @@ def get_arrow_schema_from_tiledb_uri(
119120
arrow_schema_dict[name] = get_arrow_type_from_tiledb_dtype(attr.dtype)
120121

121122
return pa.schema(arrow_schema_dict)
122-
123-
124-
def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table:
125-
"""
126-
Implements the 'decode on read' part of our ASCII/Unicode logic
127-
"""
128-
# TODO: COMMENT/LINK HEAVILY
129-
names = [ofield.name for ofield in table.schema]
130-
new_fields = []
131-
for name in names:
132-
old_field = table[name]
133-
if len(old_field) > 0 and isinstance(old_field[0], pa.LargeBinaryScalar):
134-
nfield = pa.array(
135-
[element.as_py().decode("utf-8") for element in old_field]
136-
)
137-
new_fields.append(nfield)
138-
else:
139-
new_fields.append(old_field)
140-
return pa.Table.from_arrays(new_fields, names=names)

apis/python/src/tiledbsoma/util_pandas.py

Lines changed: 0 additions & 13 deletions
This file was deleted.

apis/python/tests/test_type_system.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def test_supported_types_supported(arrow_type):
6565
pytest.xfail("Awaiting UTF-8 support - see issue #338")
6666

6767
tdb_dtype = tiledb_type_from_arrow_type(arrow_type)
68-
assert isinstance(tdb_dtype, np.dtype)
68+
assert isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii"
6969
rt_arrow_type = get_arrow_type_from_tiledb_dtype(tdb_dtype)
7070
assert isinstance(rt_arrow_type, pa.DataType)
7171
assert arrow_type == rt_arrow_type

0 commit comments

Comments
 (0)