|
5 | 5 | import pyarrow as pa |
6 | 6 | import tiledb |
7 | 7 |
|
8 | | -from . import util, util_arrow, util_pandas, util_tiledb |
| 8 | +from . import util, util_arrow, util_tiledb |
9 | 9 | from .logging import log_io |
10 | 10 | from .soma_collection import SOMACollectionBase |
11 | 11 | from .tiledb_array import TileDBArray |
@@ -218,15 +218,10 @@ def read( |
218 | 218 | iterator = query.df[ids] |
219 | 219 |
|
220 | 220 | for table in iterator: |
221 | | - # XXX COMMENT MORE |
222 | | - # This is the 'decode on read' part of our logic; in dim_select we have the |
223 | | - # 'encode on write' part. |
224 | | - # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99. |
225 | | - # |
226 | | - # Also: don't materialize these on read |
| 221 | + # Don't materialize these on read |
227 | 222 | # TODO: get the arrow syntax for drop |
228 | 223 | # df.drop(ROWID, axis=1) |
229 | | - yield util_arrow.ascii_to_unicode_pyarrow_readback(table) |
| 224 | + yield table |
230 | 225 |
|
231 | 226 | def read_all( |
232 | 227 | self, |
@@ -360,11 +355,6 @@ def read_as_pandas( |
360 | 355 |
|
361 | 356 | for df in iterator: |
362 | 357 |
|
363 | | - # This is the 'decode on read' part of our logic; in dim_select we have the 'encode on |
364 | | - # write' part. |
365 | | - # Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99. |
366 | | - df = util_pandas.ascii_to_unicode_pandas_readback(df) |
367 | | - |
368 | 358 | if id_column_name is not None: |
369 | 359 | df.reset_index(inplace=True) |
370 | 360 | df.set_index(id_column_name, inplace=True) |
@@ -445,39 +435,13 @@ def write_from_pandas( |
445 | 435 |
|
446 | 436 | dataframe.set_index(ROWID, inplace=True) |
447 | 437 |
|
448 | | - # ISSUE: |
449 | | - # |
450 | | - # TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB |
451 | | - # QueryCondition API. While this needs to be addressed -- global collaborators will want to |
452 | | - # write annotation-dataframe values in Unicode -- until then, to make obs/var data possible |
453 | | - # to query, we need to store these as ASCII. |
454 | | - # |
455 | | - # This is (besides collation) a storage-level issue not a presentation-level issue: At write |
456 | | - # time, this works — "α,β,γ" stores as "\xce\xb1,\xce\xb2,\xce\xb3"; at read time: since |
457 | | - # SOMA is an API: utf8-decode those strings when a query is done & give the user back |
458 | | - # "α,β,γ". |
459 | | - # |
460 | | - # CONTEXT: |
461 | | - # https://github.com/single-cell-data/TileDB-SOMA/issues/99 |
462 | | - # https://github.com/single-cell-data/TileDB-SOMA/pull/101 |
463 | | - # https://github.com/single-cell-data/TileDB-SOMA/issues/106 |
464 | | - # https://github.com/single-cell-data/TileDB-SOMA/pull/117 |
465 | | - # |
466 | | - # IMPLEMENTATION: |
467 | | - # Python types -- float, string, what have you -- appear as dtype('O') which is not useful. |
468 | | - # Also, ``tiledb.from_pandas`` has ``column_types`` but that _forces_ things to string to a |
469 | | - # particular if they shouldn't be. |
470 | | - # |
471 | | - # Instead, we use ``dataframe.convert_dtypes`` to get a little jump on what ``tiledb.from_pandas`` |
472 | | - # is going to be doing anyway, namely, type-inferring to see what is going to be a string. |
473 | | - # |
474 | | - # TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this. |
| 438 | + # Force ASCII storage if string, in order to make obs/var columns queryable. |
| 439 | + # TODO: when UTF-8 attributes are fully supported we can remove this. |
475 | 440 | column_types = {} |
476 | 441 | for column_name in dataframe.keys(): |
477 | 442 | dfc = dataframe[column_name] |
478 | 443 | if len(dfc) > 0 and type(dfc[0]) == str: |
479 | | - # Force ASCII storage if string, in order to make obs/var columns queryable. |
480 | | - column_types[column_name] = np.dtype("S") |
| 444 | + column_types[column_name] = "ascii" |
481 | 445 |
|
482 | 446 | tiledb.from_pandas( |
483 | 447 | uri=self.uri, |
|
0 commit comments