Skip to content

Commit 8f2c6de

Browse files
committed
Fix ascii/binary issues in TileDB->Arrow and TileDB->Pandas->Arrow test cases
1 parent 324654b commit 8f2c6de

File tree

4 files changed

+83
-27
lines changed

4 files changed

+83
-27
lines changed

apis/python/src/tiledbsoma/soma_dataframe.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,8 @@ def write_from_pandas(
426426
dfc = dataframe[column_name]
427427
if len(dfc) > 0 and type(dfc[0]) == str:
428428
column_types[column_name] = "ascii"
429+
if len(dfc) > 0 and type(dfc[0]) == bytes:
430+
column_types[column_name] = "bytes"
429431

430432
tiledb.from_pandas(
431433
uri=self.uri,

apis/python/src/tiledbsoma/util_arrow.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,17 @@
99
of representing full type semantics, and correctly performing a
1010
round trip conversion (eg, T == to_arrow(to_tiledb(T)))
1111
12-
Most primitive types are simple - eg, uint8. Of particular challenge
12+
Most primitive types are simple -- e.g., uint8. Of particular challenge
1313
are datetime/timestamps as TileDB has no distinction between a "datetime" and
1414
a "timedelta". The best Arrow match is TimestampType, as long as that
1515
TimestampType instance does NOT have a timezone set.
1616
1717
Because of our round-trip requirement, all other Arrow temporal types
1818
are unsupported (even though they are just int64 under the covers).
19+
20+
We auto-promote Arrow's string and binary to large_string and large_binary,
21+
respectively, as this is what TileDB stores -- a sequence of bytes preceded
22+
by a 64-bit (not 32-bit) length int.
1923
"""
2024
ARROW_TO_TDB = {
2125
# Dict of types unsupported by to_pandas_dtype, which require overrides.
@@ -25,8 +29,8 @@
2529
#
2630
pa.string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
2731
pa.large_string(): "ascii", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
28-
pa.binary(): np.dtype("S"),
29-
pa.large_binary(): np.dtype("S"),
32+
pa.binary(): "bytes", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
33+
pa.large_binary(): "bytes", # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
3034
pa.timestamp("s"): "datetime64[s]",
3135
pa.timestamp("ms"): "datetime64[ms]",
3236
pa.timestamp("us"): "datetime64[us]",
@@ -63,8 +67,9 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
6367
raise arrow_type
6468
if arrow_type == "ascii":
6569
return arrow_type
66-
else:
67-
return np.dtype(arrow_type)
70+
if arrow_type == "bytes":
71+
return arrow_type # np.int8()
72+
return np.dtype(arrow_type)
6873

6974
if not pa.types.is_primitive(t):
7075
raise TypeError(f"Type {str(t)} - unsupported type")
@@ -90,11 +95,12 @@ def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.D
9095
"""
9196
TODO: COMMENT
9297
"""
93-
if tiledb_dtype == "ascii" or tiledb_dtype.name == "bytes":
98+
if tiledb_dtype == "bytes":
99+
return pa.large_binary()
100+
if isinstance(tiledb_dtype, str) and tiledb_dtype == "ascii":
94101
# XXX TODO: temporary work-around until UTF8 support is native. GH #338.
95102
return pa.large_string()
96-
else:
97-
return pa.from_numpy_dtype(tiledb_dtype)
103+
return pa.from_numpy_dtype(tiledb_dtype)
98104

99105

100106
def get_arrow_schema_from_tiledb_uri(

apis/python/tests/test_soma_dataframe.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -174,37 +174,57 @@ def test_SOMADataFrame_read_column_names(simple_soma_data_frame, ids, col_names)
174174
schema, sdf, n_data = simple_soma_data_frame
175175
assert sdf.exists()
176176

177-
def _check_tbl(tbl, col_names, ids):
177+
def _check_tbl(tbl, col_names, ids, *, demote):
178178
assert tbl.num_columns == (
179179
len(schema.names) if col_names is None else len(col_names)
180180
)
181181
assert tbl.num_rows == (n_data if ids is None else len(ids))
182-
assert tbl.schema == pa.schema(
183-
[
184-
schema.field(f)
185-
for f in (col_names if col_names is not None else schema.names)
186-
]
187-
)
188182

183+
if demote:
184+
assert tbl.schema == pa.schema(
185+
[
186+
pa.field(schema.field(f).name, pa.string())
187+
if schema.field(f).type == pa.large_string()
188+
else schema.field(f)
189+
for f in (col_names if col_names is not None else schema.names)
190+
]
191+
)
192+
else:
193+
assert tbl.schema == pa.schema(
194+
[
195+
schema.field(f)
196+
for f in (col_names if col_names is not None else schema.names)
197+
]
198+
)
199+
200+
# TileDB ASCII -> Arrow large_string
189201
_check_tbl(
190202
sdf.read_all(ids=ids, column_names=col_names),
191203
col_names,
192204
ids,
205+
demote=False,
193206
)
207+
194208
_check_tbl(
195209
sdf.read_all(column_names=col_names),
196210
col_names,
197211
None,
212+
demote=False,
198213
)
214+
215+
# TileDB ASCII -> Pandas string -> Arrow string (not large_string)
199216
_check_tbl(
200217
pa.Table.from_pandas(
201218
pd.concat(sdf.read_as_pandas(ids=ids, column_names=col_names))
202219
),
203220
col_names,
204221
ids,
222+
demote=True,
205223
)
224+
206225
_check_tbl(
207226
pa.Table.from_pandas(sdf.read_as_pandas_all(column_names=col_names)),
208227
col_names,
209228
None,
229+
demote=True,
210230
)

apis/python/tests/test_type_system.py

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,20 @@
2424
pa.timestamp("ms"),
2525
pa.timestamp("us"),
2626
pa.timestamp("ns"),
27-
pa.string(),
27+
# We use Arrow's large_string for ASCII and, ultimately, for Unicode as well
28+
# https://github.com/single-cell-data/TileDB-SOMA/issues/99
29+
# https://github.com/single-cell-data/TileDB-SOMA/pull/359
30+
# https://github.com/single-cell-data/TileDB-SOMA/issues/274
2831
pa.large_string(),
29-
pa.binary(),
3032
pa.large_binary(),
3133
]
3234

35+
"""Arrow types we expect to auto-promote"""
36+
PROMOTED_ARROW_TYPES = [
37+
(pa.string(), pa.large_string()),
38+
# XXX (pa.binary(), pa.large_binary()),
39+
]
40+
3341

3442
"""Arrow types we expect to fail"""
3543
UNSUPPORTED_ARROW_TYPES = [
@@ -46,10 +54,13 @@
4654
pa.duration("us"),
4755
pa.duration("ns"),
4856
pa.month_day_nano_interval(),
57+
# We use Arrow's large_string for ASCII and, ultimately, for Unicode as well
58+
# https://github.com/single-cell-data/TileDB-SOMA/issues/99
59+
# https://github.com/single-cell-data/TileDB-SOMA/pull/359
60+
# https://github.com/single-cell-data/TileDB-SOMA/issues/274
61+
pa.string(),
4962
pa.binary(),
5063
pa.binary(10),
51-
pa.large_binary(),
52-
pa.large_string(),
5364
pa.decimal128(1),
5465
pa.decimal128(38),
5566
pa.list_(pa.int8()),
@@ -61,20 +72,37 @@
6172

6273

6374
@pytest.mark.parametrize("arrow_type", SUPPORTED_ARROW_TYPES)
64-
def test_supported_types_supported(arrow_type):
75+
def test_arrow_types_supported(arrow_type):
6576
"""Verify round-trip conversion of types"""
66-
if pa.types.is_binary(arrow_type):
67-
pytest.xfail("Awaiting UTF-8 support - see issue #338")
77+
# if pa.types.is_binary(arrow_type):
78+
# pytest.xfail("Awaiting UTF-8 support - see issue #274")
6879

6980
tdb_dtype = tiledb_type_from_arrow_type(arrow_type)
70-
assert isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii"
71-
rt_arrow_type = get_arrow_type_from_tiledb_dtype(tdb_dtype)
72-
assert isinstance(rt_arrow_type, pa.DataType)
73-
assert arrow_type == rt_arrow_type
81+
assert (
82+
isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii" or tdb_dtype == "bytes"
83+
)
84+
arrow_rt_type = get_arrow_type_from_tiledb_dtype(tdb_dtype)
85+
assert isinstance(arrow_rt_type, pa.DataType)
86+
assert arrow_type == arrow_rt_type
87+
88+
89+
@pytest.mark.parametrize("arrow_from_to_pair", PROMOTED_ARROW_TYPES)
90+
def test_arrow_types_promoted(arrow_from_to_pair):
91+
"""Verify round-trip conversion of types"""
92+
arrow_from_type = arrow_from_to_pair[0]
93+
arrow_to_type = arrow_from_to_pair[1]
94+
95+
tdb_dtype = tiledb_type_from_arrow_type(arrow_from_type)
96+
assert (
97+
isinstance(tdb_dtype, np.dtype) or tdb_dtype == "ascii" or tdb_dtype == "bytes"
98+
)
99+
arrow_rt_type = get_arrow_type_from_tiledb_dtype(tdb_dtype)
100+
assert isinstance(arrow_rt_type, pa.DataType)
101+
assert arrow_to_type == arrow_rt_type
74102

75103

76104
@pytest.mark.parametrize("arrow_type", UNSUPPORTED_ARROW_TYPES)
77-
def test_supported_types_unsupported(arrow_type):
105+
def test_arrow_types_unsupported(arrow_type):
78106
"""Verify correct error for unsupported types"""
79107
with pytest.raises(TypeError):
80108
tiledb_type_from_arrow_type(arrow_type, match=r".*unsupported type.*")

0 commit comments

Comments
 (0)