Skip to content

Commit b589716

Browse files
authored
Use true ASCII attributes in dataframes (#359)
* remove util_arrow.concat_tables
* read/write Table
* read/write Table
* read/write Table
* Use true ASCII attributes in dataframes
* iterate on string vs large_string
* Fix ascii/binary issues in TileDB->Arrow and TileDB->Pandas->Arrow test cases
* more
* fix unit tests

Co-authored-by: John Kerl <john.kerl@tiledb.com>
1 parent 36731f2 commit b589716

File tree

11 files changed

+130
-122
lines changed

11 files changed

+130
-122
lines changed

apis/python/src/tiledbsoma/soma_dataframe.py

Lines changed: 8 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
import pyarrow as pa
66
import tiledb
77

8-
from . import util, util_arrow, util_pandas, util_tiledb
8+
from . import util, util_arrow, util_tiledb
99
from .logging import log_io
1010
from .soma_collection import SOMACollectionBase
1111
from .tiledb_array import TileDBArray
@@ -205,11 +205,7 @@ def read(
205205
iterator = query.df[ids]
206206

207207
for table in iterator:
208-
# XXX COMMENT MORE
209-
# This is the 'decode on read' part of our logic; in dim_select we have the
210-
# 'encode on write' part.
211-
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
212-
yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
208+
yield table
213209

214210
def read_all(
215211
self,
@@ -275,7 +271,7 @@ def write(self, values: pa.Table) -> None:
275271
if name != ROWID:
276272
attr_cols_map[name] = np.asarray(
277273
values.column(name).to_pandas(
278-
types_mapper=util_arrow.tiledb_type_from_arrow_type,
274+
types_mapper=util_arrow.tiledb_type_from_arrow_type_for_write,
279275
)
280276
)
281277

@@ -343,11 +339,6 @@ def read_as_pandas(
343339

344340
for df in iterator:
345341

346-
# This is the 'decode on read' part of our logic; in dim_select we have the 'encode on
347-
# write' part.
348-
# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
349-
df = util_pandas.ascii_to_unicode_pandas_readback(df)
350-
351342
if id_column_name is not None:
352343
df.reset_index(inplace=True)
353344
df.set_index(id_column_name, inplace=True)
@@ -428,39 +419,15 @@ def write_from_pandas(
428419

429420
dataframe.set_index(ROWID, inplace=True)
430421

431-
# ISSUE:
432-
#
433-
# TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB
434-
# QueryCondition API. While this needs to be addressed -- global collaborators will want to
435-
# write annotation-dataframe values in Unicode -- until then, to make obs/var data possible
436-
# to query, we need to store these as ASCII.
437-
#
438-
# This is (besides collation) a storage-level issue not a presentation-level issue: At write
439-
# time, this works — "α,β,γ" stores as "\xce\xb1,\xce\xb2,\xce\xb3"; at read time: since
440-
# SOMA is an API: utf8-decode those strings when a query is done & give the user back
441-
# "α,β,γ".
442-
#
443-
# CONTEXT:
444-
# https://github.com/single-cell-data/TileDB-SOMA/issues/99
445-
# https://github.com/single-cell-data/TileDB-SOMA/pull/101
446-
# https://github.com/single-cell-data/TileDB-SOMA/issues/106
447-
# https://github.com/single-cell-data/TileDB-SOMA/pull/117
448-
#
449-
# IMPLEMENTATION:
450-
# Python types -- float, string, what have you -- appear as dtype('O') which is not useful.
451-
# Also, ``tiledb.from_pandas`` has ``column_types`` but that _forces_ things to string to a
452-
# particular if they shouldn't be.
453-
#
454-
# Instead, we use ``dataframe.convert_dtypes`` to get a little jump on what ``tiledb.from_pandas``
455-
# is going to be doing anyway, namely, type-inferring to see what is going to be a string.
456-
#
457-
# TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this.
422+
# Force ASCII storage if string, in order to make obs/var columns queryable.
423+
# TODO: when UTF-8 attributes are fully supported we can remove this.
458424
column_types = {}
459425
for column_name in dataframe.keys():
460426
dfc = dataframe[column_name]
461427
if len(dfc) > 0 and type(dfc[0]) == str:
462-
# Force ASCII storage if string, in order to make obs/var columns queryable.
463-
column_types[column_name] = np.dtype("S")
428+
column_types[column_name] = "ascii"
429+
if len(dfc) > 0 and type(dfc[0]) == bytes:
430+
column_types[column_name] = "bytes"
464431

465432
tiledb.from_pandas(
466433
uri=self.uri,

apis/python/src/tiledbsoma/soma_indexed_dataframe.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -250,11 +250,7 @@ def read(
250250
iterator = query.df[ids]
251251

252252
for table in iterator:
253-
# XXX COMMENT MORE
254-
# This is the 'decode on read' part of our logic; in dim_select we have the
255-
# 'encode on write' part.
256-
# Context: # https://github.com/single-cell-data/TileDB-SOMA/issues/99.
257-
yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
253+
yield table
258254

259255
def read_all(
260256
self,

apis/python/src/tiledbsoma/util_arrow.py

Lines changed: 34 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -9,24 +9,28 @@
99
of representing full type semantics, and correctly performing a
1010
round trip conversion (eg, T == to_arrow(to_tiledb(T)))
1111
12-
Most primitive types are simple - eg, uint8. Of particular challenge
12+
Most primitive types are simple -- e.g., uint8. Of particular challenge
1313
are datetime/timestamps as TileDB has no distinction between a "datetime" and
1414
a "timedelta". The best Arrow match is TimestampType, as long as that
1515
TimestampType instance does NOT have a timezone set.
1616
1717
Because of our round-trip requirement, all other Arrow temporal types
1818
are unsupported (even though they are just int64 under the covers).
19+
20+
We auto-promote Arrow's string and binary to large_string and large_binary,
21+
respectively, as this is what TileDB stores -- a sequence of bytes preceded
22+
by a 64-bit (not 32-bit) length int.
1923
"""
2024
ARROW_TO_TDB = {
2125
# Dict of types unsupported by to_pandas_dtype, which require overrides.
2226
# If the value is an instance of Exception, it will be raised.
2327
#
2428
# IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
2529
#
26-
pa.string(): np.dtype(
27-
"S"
28-
), # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
29-
pa.binary(): np.dtype("S"),
30+
pa.string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
31+
pa.large_string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
32+
pa.binary(): "bytes", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
33+
pa.large_binary(): "bytes", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
3034
pa.timestamp("s"): "datetime64[s]",
3135
pa.timestamp("ms"): "datetime64[ms]",
3236
pa.timestamp("us"): "datetime64[us]",
@@ -39,7 +43,21 @@
3943
}
4044

4145

42-
def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
46+
def tiledb_type_from_arrow_type_for_write(t: pa.DataType) -> Union[type, np.dtype, str]:
47+
"""
48+
Same as ``tiledb_type_from_arrow_type`` except that this is used for writing to a TileDB array.
49+
The syntax of TileDB-Py is such that when we want to create a schema with an ASCII column,
50+
we use the string ``"ascii"`` in place of a dtype. But when we want to write data, we need to
51+
use a dtype of ``np.str``, which is now deprecated in favor of simply ``str``.
52+
"""
53+
retval = tiledb_type_from_arrow_type(t)
54+
if retval == "ascii":
55+
return str
56+
else:
57+
return retval
58+
59+
60+
def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
4361
"""
4462
Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
4563
Building block for Arrow-to-TileDB schema translation.
@@ -61,6 +79,10 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
6179
arrow_type = ARROW_TO_TDB[t]
6280
if isinstance(arrow_type, Exception):
6381
raise arrow_type
82+
if arrow_type == "ascii":
83+
return arrow_type
84+
if arrow_type == "bytes":
85+
return arrow_type # np.int8()
6486
return np.dtype(arrow_type)
6587

6688
if not pa.types.is_primitive(t):
@@ -83,15 +105,16 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
83105
raise TypeError("Unsupported Arrow type") from exc
84106

85107

86-
def get_arrow_type_from_tiledb_dtype(tiledb_dtype: np.dtype) -> pa.DataType:
108+
def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType:
87109
"""
88110
TODO: COMMENT
89111
"""
90-
if tiledb_dtype.name == "bytes":
112+
if tiledb_dtype == "bytes":
113+
return pa.large_binary()
114+
if isinstance(tiledb_dtype, str) and tiledb_dtype == "ascii":
91115
# XXX TODO: temporary work-around until UTF8 support is native. GH #338.
92-
return pa.string()
93-
else:
94-
return pa.from_numpy_dtype(tiledb_dtype)
116+
return pa.large_string()
117+
return pa.from_numpy_dtype(tiledb_dtype)
95118

96119

97120
def get_arrow_schema_from_tiledb_uri(
@@ -119,26 +142,3 @@ def get_arrow_schema_from_tiledb_uri(
119142
arrow_schema_dict[name] = get_arrow_type_from_tiledb_dtype(attr.dtype)
120143

121144
return pa.schema(arrow_schema_dict)
122-
123-
124-
def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table:
125-
"""
126-
Implements the 'decode on read' part of our ASCII/Unicode logic
127-
"""
128-
# TODO: COMMENT/LINK HEAVILY
129-
names = [ofield.name for ofield in table.schema]
130-
new_fields = []
131-
for name in names:
132-
old_field = table[name]
133-
# Preferred syntax:
134-
# if len(old_field) > 0 and pa.types.is_large_binary(old_field[0]):
135-
# but:
136-
# AttributeError: 'pyarrow.lib.UInt64Scalar' object has no attribute 'id'
137-
if len(old_field) > 0 and isinstance(old_field[0], pa.LargeBinaryScalar):
138-
nfield = pa.array(
139-
[element.as_py().decode("utf-8") for element in old_field]
140-
)
141-
new_fields.append(nfield)
142-
else:
143-
new_fields.append(old_field)
144-
return pa.Table.from_arrays(new_fields, names=names)

apis/python/src/tiledbsoma/util_pandas.py

Lines changed: 0 additions & 13 deletions
This file was deleted.

apis/python/tests/test_soma_collection.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ def create_and_populate_dataframe(dataframe: soma.SOMADataFrame) -> None:
1515
[
1616
("foo", pa.int32()),
1717
("bar", pa.float64()),
18-
("baz", pa.string()),
18+
("baz", pa.large_string()),
1919
]
2020
)
2121

@@ -108,7 +108,7 @@ def soma_object(request, tmp_path):
108108

109109
elif class_name == "SOMADataFrame":
110110
so = soma.SOMADataFrame(uri=uri)
111-
so.create(pa.schema([("A", pa.int32()), ("B", pa.string())]))
111+
so.create(pa.schema([("A", pa.int32()), ("B", pa.large_string())]))
112112

113113
elif class_name == "SOMAIndexedDataFrame":
114114
so = soma.SOMAIndexedDataFrame(uri=uri)

apis/python/tests/test_soma_dataframe.py

Lines changed: 29 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ def test_soma_dataframe_non_indexed(tmp_path):
1313
[
1414
("foo", pa.int32()),
1515
("bar", pa.float64()),
16-
("baz", pa.string()),
16+
("baz", pa.large_string()),
1717
]
1818
)
1919
sdf.create(schema=asch)
@@ -120,7 +120,7 @@ def simple_soma_data_frame(tmp_path):
120120
("soma_rowid", pa.uint64()),
121121
("A", pa.int64()),
122122
("B", pa.float64()),
123-
("C", pa.string()),
123+
("C", pa.large_string()),
124124
]
125125
)
126126
sdf = t.SOMADataFrame(uri=tmp_path.as_posix())
@@ -174,37 +174,57 @@ def test_SOMADataFrame_read_column_names(simple_soma_data_frame, ids, col_names)
174174
schema, sdf, n_data = simple_soma_data_frame
175175
assert sdf.exists()
176176

177-
def _check_tbl(tbl, col_names, ids):
177+
def _check_tbl(tbl, col_names, ids, *, demote):
178178
assert tbl.num_columns == (
179179
len(schema.names) if col_names is None else len(col_names)
180180
)
181181
assert tbl.num_rows == (n_data if ids is None else len(ids))
182-
assert tbl.schema == pa.schema(
183-
[
184-
schema.field(f)
185-
for f in (col_names if col_names is not None else schema.names)
186-
]
187-
)
188182

183+
if demote:
184+
assert tbl.schema == pa.schema(
185+
[
186+
pa.field(schema.field(f).name, pa.string())
187+
if schema.field(f).type == pa.large_string()
188+
else schema.field(f)
189+
for f in (col_names if col_names is not None else schema.names)
190+
]
191+
)
192+
else:
193+
assert tbl.schema == pa.schema(
194+
[
195+
schema.field(f)
196+
for f in (col_names if col_names is not None else schema.names)
197+
]
198+
)
199+
200+
# TileDB ASCII -> Arrow large_string
189201
_check_tbl(
190202
sdf.read_all(ids=ids, column_names=col_names),
191203
col_names,
192204
ids,
205+
demote=False,
193206
)
207+
194208
_check_tbl(
195209
sdf.read_all(column_names=col_names),
196210
col_names,
197211
None,
212+
demote=False,
198213
)
214+
215+
# TileDB ASCII -> Pandas string -> Arrow string (not large_string)
199216
_check_tbl(
200217
pa.Table.from_pandas(
201218
pd.concat(sdf.read_as_pandas(ids=ids, column_names=col_names))
202219
),
203220
col_names,
204221
ids,
222+
demote=True,
205223
)
224+
206225
_check_tbl(
207226
pa.Table.from_pandas(sdf.read_as_pandas_all(column_names=col_names)),
208227
col_names,
209228
None,
229+
demote=True,
210230
)

apis/python/tests/test_soma_experiment_basic.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ def create_and_populate_obs(obs: soma.SOMADataFrame) -> soma.SOMADataFrame:
1414
[
1515
("foo", pa.int32()),
1616
("bar", pa.float64()),
17-
("baz", pa.string()),
17+
("baz", pa.large_string()),
1818
]
1919
)
2020

@@ -37,7 +37,7 @@ def create_and_populate_var(var: soma.SOMADataFrame) -> soma.SOMADataFrame:
3737

3838
var_arrow_schema = pa.schema(
3939
[
40-
("quux", pa.string()),
40+
("quux", pa.large_string()),
4141
("xyzzy", pa.float64()),
4242
]
4343
)

apis/python/tests/test_soma_indexed_dataframe.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,14 @@ def _schema():
2121
def test_soma_indexed_dataframe(tmp_path, arrow_schema):
2222
sdf = t.SOMAIndexedDataFrame(uri=tmp_path.as_posix())
2323

24+
asch = pa.schema(
25+
[
26+
("foo", pa.int32()),
27+
("bar", pa.float64()),
28+
("baz", pa.large_string()),
29+
]
30+
)
31+
2432
# Create
2533
asch = arrow_schema()
2634
sdf.create(schema=asch, index_column_names=["foo"])
@@ -72,7 +80,7 @@ def simple_soma_indexed_data_frame(tmp_path):
7280
("index", pa.uint64()),
7381
("A", pa.int64()),
7482
("B", pa.float64()),
75-
("C", pa.string()),
83+
("C", pa.large_string()),
7684
]
7785
)
7886
index_column_names = ["index"]

apis/python/tests/test_soma_metadata.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ def soma_object(request, tmp_path):
3333

3434
elif class_name == "SOMADataFrame":
3535
so = soma.SOMADataFrame(uri=uri)
36-
so.create(pa.schema([("A", pa.int32()), ("B", pa.string())]))
36+
so.create(pa.schema([("A", pa.int32()), ("B", pa.large_string())]))
3737

3838
elif class_name == "SOMAIndexedDataFrame":
3939
so = soma.SOMAIndexedDataFrame(uri=uri)

0 commit comments

Comments
 (0)