-
Notifications
You must be signed in to change notification settings - Fork 30
Expand file tree
/
Copy pathutil_arrow.py
More file actions
144 lines (123 loc) · 5.44 KB
/
util_arrow.py
File metadata and controls
144 lines (123 loc) · 5.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from typing import Optional, Union
import numpy as np
import pyarrow as pa
import tiledb
"""
Conversion to/from Arrow and TileDB type systems. Must be capable
of representing full type semantics, and correctly performing a
round trip conversion (eg, T == to_arrow(to_tiledb(T)))
Most primitive types are simple -- e.g., uint8. Of particular challenge
are datetime/timestamps as TileDB has no distinction between a "datetime" and
a "timedelta". The best Arrow match is TimestampType, as long as that
TimestampType instance does NOT have a timezone set.
Because of our round-trip requirement, all other Arrow temporal types
are unsupported (even though they are just int64 under the covers).
We auto-promote Arrow's string and binary to large_string and large_binary,
respectively, as this is what TileDB stores -- a sequence of bytes preceded
by a 64-bit (not 32-bit) length int.
"""
# Explicit Arrow -> TileDB overrides for types that the default
# ``to_pandas_dtype`` conversion does not handle the way TileDB needs.
#
# Each value is either the TileDB type name / numpy dtype string to use, or an
# Exception instance. If the value is an instance of Exception, it will be
# raised.
#
# IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
ARROW_TO_TDB = {
    # Variable-length strings and binary blobs.
    # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
    pa.string(): "ascii",
    pa.large_string(): "ascii",
    pa.binary(): "bytes",
    pa.large_binary(): "bytes",
    # Timezone-free timestamps map to the numpy datetime64 of the same unit.
    **{
        pa.timestamp(unit): f"datetime64[{unit}]"
        for unit in ("s", "ms", "us", "ns")
    },
    # Unsupported types in TileDB type system
    pa.float16(): TypeError("float16 - unsupported type (use float32)"),
    pa.date32(): TypeError("32-bit date - unsupported type (use TimestampType)"),
    pa.date64(): TypeError("64-bit date - unsupported type (use TimestampType)"),
}
def tiledb_type_from_arrow_type_for_write(t: pa.DataType) -> Union[type, np.dtype, str]:
    """
    Variant of ``tiledb_type_from_arrow_type`` for use when writing data to a
    TileDB array rather than when creating its schema.

    TileDB-Py expects the string ``"ascii"`` in place of a dtype when a schema
    with an ASCII column is created, but expects a dtype of ``str`` (formerly
    ``np.str``, now deprecated) when data is written. Every other result of
    ``tiledb_type_from_arrow_type`` is returned unchanged.
    """
    schema_type = tiledb_type_from_arrow_type(t)
    # Only the "ascii" schema marker differs between schema creation and write.
    return str if schema_type == "ascii" else schema_type
def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
    """
    Given an Arrow type, return the corresponding TileDB type.

    Building block for Arrow-to-TileDB schema translation.

    If type is unsupported, will raise a TypeError exception.

    Parameters
    ----------
    t : pyarrow.DataType
        Arrow DataType instance, eg, pyarrow.int8()

    Returns
    -------
    numpy.dtype or str
        The numpy dtype corresponding to the ``t`` parameter, or one of the
        strings ``"ascii"`` / ``"bytes"`` for Arrow string and binary types
        (see ``ARROW_TO_TDB``).

    Raises
    ------
    TypeError
        If ``t`` has no supported TileDB equivalent.
    """
    # Explicit overrides take precedence over the generic conversion below.
    if t in ARROW_TO_TDB:
        arrow_type = ARROW_TO_TDB[t]
        if isinstance(arrow_type, Exception):
            raise arrow_type
        # "ascii" and "bytes" are passed through as strings, not numpy dtypes.
        if arrow_type in ("ascii", "bytes"):
            return arrow_type
        return np.dtype(arrow_type)

    if not pa.types.is_primitive(t):
        raise TypeError(f"Type {str(t)} - unsupported type")

    # Any timestamp that was not matched in ARROW_TO_TDB (which lists only the
    # timezone-free variants) is rejected here.
    if pa.types.is_timestamp(t):
        raise TypeError("TimeStampType - unsupported type (timezone not supported)")
    if pa.types.is_time32(t):
        raise TypeError("Time32Type - unsupported type (use TimestampType)")
    if pa.types.is_time64(t):
        raise TypeError("Time64Type - unsupported type (use TimestampType)")
    if pa.types.is_duration(t):
        raise TypeError("DurationType - unsupported type (use TimestampType)")

    # else lets try the default conversion path
    try:
        # Must force into a dtype to catch places where the Pandas type
        # system has extra information that can't be expressed
        return np.dtype(t.to_pandas_dtype())
    except NotImplementedError as exc:
        raise TypeError("Unsupported Arrow type") from exc
def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType:
    """
    Return the Arrow type used to represent a TileDB dtype.

    Parameters
    ----------
    tiledb_dtype : str or numpy.dtype
        Either one of the strings ``"bytes"`` / ``"ascii"`` or a numpy dtype.

    Returns
    -------
    pyarrow.DataType
        ``pyarrow.large_binary()`` for ``"bytes"``, ``pyarrow.large_string()``
        for the string ``"ascii"``, otherwise the result of
        ``pyarrow.from_numpy_dtype``.
    """
    if tiledb_dtype == "bytes":
        return pa.large_binary()

    # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
    is_ascii = isinstance(tiledb_dtype, str) and tiledb_dtype == "ascii"
    return pa.large_string() if is_ascii else pa.from_numpy_dtype(tiledb_dtype)
def get_arrow_schema_from_tiledb_uri(
    tiledb_uri: str, ctx: Optional[tiledb.Ctx] = None
) -> pa.Schema:
    """
    Open the TileDB array at ``tiledb_uri`` and build an Arrow schema from it.

    The schema has one field per dimension, followed by one field per
    attribute, each typed via ``get_arrow_type_from_tiledb_dtype``. A dimension
    or attribute whose name is the empty string is recorded as ``"unnamed"``.

    Parameters
    ----------
    tiledb_uri : str
        URI passed to ``tiledb.open``.
    ctx : tiledb.Ctx, optional
        Context passed to ``tiledb.open``.

    Returns
    -------
    pyarrow.Schema
        Schema built from the collected name-to-type mapping.
    """
    fields = {}
    with tiledb.open(tiledb_uri, ctx=ctx) as A:
        schema = A.schema
        domain = schema.domain
        dims = (domain.dim(idx) for idx in range(domain.ndim))
        attrs = (schema.attr(idx) for idx in range(schema.nattr))
        # Dimensions first, then attributes; a repeated name keeps its first
        # position in the mapping but takes the later type.
        for columns in (dims, attrs):
            for column in columns:
                col_name = column.name
                key = "unnamed" if col_name == "" else col_name
                fields[key] = get_arrow_type_from_tiledb_dtype(column.dtype)
    return pa.schema(fields)