|
5 | 5 | import pyarrow as pa |
6 | 6 | import tiledb |
7 | 7 |
|
8 | | -from . import util, util_arrow, util_pandas, util_tiledb |
| 8 | +from . import util, util_arrow, util_tiledb |
9 | 9 | from .logging import log_io |
10 | 10 | from .soma_collection import SOMACollectionBase |
11 | 11 | from .tiledb_array import TileDBArray |
@@ -218,11 +218,7 @@ def read( |
218 | 218 | iterator = query.df[ids] |
219 | 219 |
|
220 | 220 | for table in iterator: |
221 | | - # XXX COMMENT MORE |
222 | | - # This is the 'decode on read' part of our logic; in dim_select we have the |
223 | | - # 'encode on write' part. |
224 | | - # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99. |
225 | | - yield util_arrow.ascii_to_unicode_pyarrow_readback(table) |
| 221 | + yield table |
226 | 222 |
|
227 | 223 | def read_all( |
228 | 224 | self, |
@@ -356,11 +352,6 @@ def read_as_pandas( |
356 | 352 |
|
357 | 353 | for df in iterator: |
358 | 354 |
|
359 | | - # This is the 'decode on read' part of our logic; in dim_select we have the 'encode on |
360 | | - # write' part. |
361 | | - # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99. |
362 | | - df = util_pandas.ascii_to_unicode_pandas_readback(df) |
363 | | - |
364 | 355 | if id_column_name is not None: |
365 | 356 | df.reset_index(inplace=True) |
366 | 357 | df.set_index(id_column_name, inplace=True) |
@@ -441,39 +432,13 @@ def write_from_pandas( |
441 | 432 |
|
442 | 433 | dataframe.set_index(ROWID, inplace=True) |
443 | 434 |
|
444 | | - # ISSUE: |
445 | | - # |
446 | | - # TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB |
447 | | - # QueryCondition API. While this needs to be addressed -- global collaborators will want to |
448 | | - # write annotation-dataframe values in Unicode -- until then, to make obs/var data possible |
449 | | - # to query, we need to store these as ASCII. |
450 | | - # |
451 | | - # This is (besides collation) a storage-level issue not a presentation-level issue: At write |
452 | | - # time, this works — "α,β,γ" stores as "\xce\xb1,\xce\xb2,\xce\xb3"; at read time: since |
453 | | - # SOMA is an API: utf8-decode those strings when a query is done & give the user back |
454 | | - # "α,β,γ". |
455 | | - # |
456 | | - # CONTEXT: |
457 | | - # https://github.com/single-cell-data/TileDB-SOMA/issues/99 |
458 | | - # https://github.com/single-cell-data/TileDB-SOMA/pull/101 |
459 | | - # https://github.com/single-cell-data/TileDB-SOMA/issues/106 |
460 | | - # https://github.com/single-cell-data/TileDB-SOMA/pull/117 |
461 | | - # |
462 | | - # IMPLEMENTATION: |
463 | | - # Python types -- float, string, what have you -- appear as dtype('O') which is not useful. |
464 | | - # Also, ``tiledb.from_pandas`` has ``column_types`` but that _forces_ things to a particular |
465 | | - # type (e.g. string) even if they shouldn't be. |
466 | | - # |
467 | | - # Instead, we use ``dataframe.convert_dtypes`` to get a little jump on what ``tiledb.from_pandas`` |
468 | | - # is going to be doing anyway, namely, type-inferring to see what is going to be a string. |
469 | | - # |
470 | | - # TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this. |
| 435 | + # Force ASCII storage if string, in order to make obs/var columns queryable. |
| 436 | + # TODO: when UTF-8 attributes are fully supported we can remove this. |
471 | 437 | column_types = {} |
472 | 438 | for column_name in dataframe.keys(): |
473 | 439 | dfc = dataframe[column_name] |
474 | 440 | if len(dfc) > 0 and type(dfc[0]) == str: |
475 | | - # Force ASCII storage if string, in order to make obs/var columns queryable. |
476 | | - column_types[column_name] = np.dtype("S") |
| 441 | + column_types[column_name] = "ascii" |
477 | 442 |
|
478 | 443 | tiledb.from_pandas( |
479 | 444 | uri=self.uri, |
|
0 commit comments