Skip to content

Commit 2040bef

Browse files
committed
Update ASCII storage for dataframes
1 parent 98a1a72 commit 2040bef

File tree

1 file changed

+10
-11
lines changed

1 file changed

+10
-11
lines changed

apis/python/src/tiledbsoma/annotation_dataframe.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from concurrent.futures import ThreadPoolExecutor
22
from typing import Optional, Sequence, Set, Tuple, Union
33

4-
import numpy as np
54
import pandas as pd
65
import pyarrow as pa
76
import tiledb
@@ -86,9 +85,10 @@ def ids(self) -> Sequence[str]:
8685
self.timing_end(s1)
8786
self.dim_name = A.domain.dim(0).name
8887

89-
# TileDB string dims are ASCII not UTF-8. Decode them so they readback
90-
# not like `b"AKR1C3"` but rather like `"AKR1C3"`.
9188
s2 = self.timing_start("ids", "tiledb_query")
89+
# TileDB string dims are ASCII, not UTF-8. Decode them so they read back not like
90+
# `b"AKR1C3"` but rather like `"AKR1C3"`. Update: as of
91+
# https://github.com/TileDB-Inc/TileDB-Py/pull/1304 these dims will read back OK.
9292
retval = A.query(attrs=[], dims=[self.dim_name])[:][self.dim_name].tolist()
9393
self.timing_end(s2)
9494

@@ -97,7 +97,12 @@ def ids(self) -> Sequence[str]:
9797
self.timing_end(s3)
9898

9999
self.timing_end(s0)
100-
return list(retval) # coerce to list to appease the linter
100+
101+
if len(retval) > 0 and isinstance(retval[0], bytes):
102+
return [e.decode() for e in retval]
103+
else:
104+
# list(...) is there to appease the linter which thinks we're returning `Any`
105+
return list(retval)
101106

102107
# ----------------------------------------------------------------
103108
def __repr__(self) -> str:
@@ -268,14 +273,8 @@ def _query_aux(
268273
# (('__pandas_index_dims', '{"obs_id": "<U0"}'),)
269274
# so the set_index is already done for us.
270275
#
271-
<<<<<<< HEAD:apis/python/src/tiledbsoma/annotation_dataframe.py
272276
# However if the data was written somehow else (e.g. by tiledbsoma-r) then we do.
273-
||||||| parent of 19963aa (tiledbsc-py stats experiment):apis/python/src/tiledbsc/annotation_dataframe.py
274-
# However if the data was written somehow else (e.g. by tiledbscr-r) then we do.
275-
=======
276-
# However if the data was written somehow else (e.g. by tiledbscr-r) then we do.
277277
s3 = self.timing_start("query", "set_index")
278-
>>>>>>> 19963aa (tiledbsc-py stats experiment):apis/python/src/tiledbsc/annotation_dataframe.py
279278
if not return_arrow:
280279
if isinstance(df.index, pd.RangeIndex) and self.dim_name in df.columns:
281280
df.set_index(self.dim_name, inplace=True)
@@ -451,7 +450,7 @@ def from_dataframe(self, dataframe: pd.DataFrame, extent: int = 2048) -> None:
451450
dfc = dataframe[column_name]
452451
if len(dfc) > 0 and type(dfc[0]) == str:
453452
# Force ASCII storage if string, in order to make obs/var columns queryable.
454-
column_types[column_name] = np.dtype("S")
453+
column_types[column_name] = "ascii"
455454

456455
tiledb.from_pandas(
457456
uri=self.uri,

0 commit comments

Comments
 (0)