99of representing full type semantics, and correctly performing a
1010round trip conversion (eg, T == to_arrow(to_tiledb(T)))
1111
12- Most primitive types are simple - eg , uint8. Of particular challenge
12+ Most primitive types are simple -- e.g., uint8. Of particular challenge
1313are datetime/timestamps as TileDB has no distinction between a "datetime" and
1414a "timedelta". The best Arrow match is TimestampType, as long as that
1515TimestampType instance does NOT have a timezone set.
1616
1717Because of our round-trip requirement, all other Arrow temporal types
1818are unsupported (even though they are just int64 under the covers).
19+
20+ We auto-promote Arrow's string and binary to large_string and large_binary,
21+ respectively, as this is what TileDB stores -- a sequence of bytes preceded
22+ by a 64-bit (not 32-bit) length int.
1923"""
2024ARROW_TO_TDB = {
2125 # Dict of types unsupported by to_pandas_dtype, which require overrides.
2226 # If the value is an instance of Exception, it will be raised.
2327 #
2428 # IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
2529 #
26- pa .string (): np . dtype (
27- "S"
28- ) , # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
29- pa .binary (): np . dtype ( "S" ),
30+ pa .string (): "ascii" , # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
31+ pa . large_string (): "ascii" , # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
32+ pa . binary (): "bytes" , # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
33+ pa .large_binary (): "bytes" , # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
3034 pa .timestamp ("s" ): "datetime64[s]" ,
3135 pa .timestamp ("ms" ): "datetime64[ms]" ,
3236 pa .timestamp ("us" ): "datetime64[us]" ,
3943}
4044
4145
42- def tiledb_type_from_arrow_type (t : pa .DataType ) -> Union [type , np .dtype ]:
def tiledb_type_from_arrow_type_for_write(t: pa.DataType) -> Union[type, np.dtype, str]:
    """
    Return the TileDB type to use when writing data of Arrow type ``t`` to a TileDB array.

    Same as ``tiledb_type_from_arrow_type`` except for ASCII columns. The syntax of
    TileDB-Py is such that when we want to create a schema with an ASCII column, we use
    the string ``"ascii"`` in place of a dtype. But when we want to write data, we need
    to use a dtype of ``np.str``, which is now deprecated in favor of simply ``str``.

    Returns ``str`` when the schema-level type is the ``"ascii"`` marker; otherwise the
    value from ``tiledb_type_from_arrow_type`` is passed through unchanged. Any exception
    raised by ``tiledb_type_from_arrow_type`` propagates to the caller.
    """
    schema_type = tiledb_type_from_arrow_type(t)
    # Only a plain string can be the "ascii" marker. Checking the type first means an
    # ``np.dtype`` result is never compared against "ascii", so NumPy is never asked to
    # interpret that string as a dtype name.
    if isinstance(schema_type, str) and schema_type == "ascii":
        return str
    return schema_type
58+
59+
60+ def tiledb_type_from_arrow_type (t : pa .DataType ) -> Union [type , np .dtype , str ]:
4361 """
4462 Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
4563 Building block for Arrow-to-TileDB schema translation.
@@ -61,6 +79,10 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
6179 arrow_type = ARROW_TO_TDB [t ]
6280 if isinstance (arrow_type , Exception ):
6381 raise arrow_type
82+ if arrow_type == "ascii" :
83+ return arrow_type
84+ if arrow_type == "bytes" :
85+ return arrow_type # np.int8()
6486 return np .dtype (arrow_type )
6587
6688 if not pa .types .is_primitive (t ):
@@ -83,15 +105,16 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
83105 raise TypeError ("Unsupported Arrow type" ) from exc
84106
85107
def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType:
    """
    Given a TileDB attribute type, return the corresponding Arrow type.

    ``tiledb_dtype`` is either a Numpy dtype or one of the string markers ``"bytes"`` /
    ``"ascii"``. The mapping is:

    * ``"bytes"`` -> ``pa.large_binary()``
    * ``"ascii"`` -> ``pa.large_string()``
    * anything else -> ``pa.from_numpy_dtype(tiledb_dtype)``
    """
    if tiledb_dtype == "bytes":
        arrow_type = pa.large_binary()
    elif isinstance(tiledb_dtype, str) and tiledb_dtype == "ascii":
        # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
        arrow_type = pa.large_string()
    else:
        # Everything else is delegated to Arrow's own Numpy-dtype translation.
        arrow_type = pa.from_numpy_dtype(tiledb_dtype)
    return arrow_type
95118
96119
97120def get_arrow_schema_from_tiledb_uri (
@@ -119,26 +142,3 @@ def get_arrow_schema_from_tiledb_uri(
119142 arrow_schema_dict [name ] = get_arrow_type_from_tiledb_dtype (attr .dtype )
120143
121144 return pa .schema (arrow_schema_dict )
122-
123-
124- def ascii_to_unicode_pyarrow_readback (table : pa .Table ) -> pa .Table :
125- """
126- Implements the 'decode on read' part of our ASCII/Unicode logic
127- """
128- # TODO: COMMENT/LINK HEAVILY
129- names = [ofield .name for ofield in table .schema ]
130- new_fields = []
131- for name in names :
132- old_field = table [name ]
133- # Preferred syntax:
134- # if len(old_field) > 0 and pa.types.is_large_binary(old_field[0]):
135- # but:
136- # AttributeError: 'pyarrow.lib.UInt64Scalar' object has no attribute 'id'
137- if len (old_field ) > 0 and isinstance (old_field [0 ], pa .LargeBinaryScalar ):
138- nfield = pa .array (
139- [element .as_py ().decode ("utf-8" ) for element in old_field ]
140- )
141- new_fields .append (nfield )
142- else :
143- new_fields .append (old_field )
144- return pa .Table .from_arrays (new_fields , names = names )
0 commit comments