# TileDB-SOMA/apis/python/tests/test_soma_indexed_dataframe.py at 0905781b1c017d38f4818153448010b359ac1db5 · single-cell-data/TileDB-SOMA · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import pyarrow as pa
import pytest

import tiledbsoma as t


@pytest.fixture
def arrow_schema():
    """
    A pytest fixture which hands tests a zero-argument factory; each call to the
    factory builds an Arrow schema with fields ``foo`` (int64), ``bar`` (float64)
    and ``baz`` (string).
    """

    def _schema():
        fields = [
            pa.field("foo", pa.int64()),
            pa.field("bar", pa.float64()),
            pa.field("baz", pa.string()),
        ]
        return pa.schema(fields)

    return _schema


def test_soma_indexed_dataframe(tmp_path, arrow_schema):
    """
    Round-trip a small table through a SOMAIndexedDataFrame indexed on ``foo``.

    The dataframe is created from the ``arrow_schema`` fixture, the same five
    rows are written three times, and ``read_all()`` is checked both without
    arguments and with an ``ids`` selection.
    """
    sdf = t.SOMAIndexedDataFrame(uri=tmp_path.as_posix())

    # Create
    asch = arrow_schema()
    sdf.create(schema=asch, index_column_names=["foo"])

    # Write
    pydict = {
        "foo": [10, 20, 30, 40, 50],
        "bar": [4.1, 5.2, 6.3, 7.4, 8.5],
        "baz": ["apple", "ball", "cat", "dog", "egg"],
    }
    rb = pa.Table.from_pydict(pydict)
    # The identical rows are written three times; the assertions below expect
    # five rows back, not fifteen.
    for _ in range(3):
        sdf.write(rb)

    # Read all
    table = sdf.read_all()
    # Weird thing about pyarrow Table:
    # * We have table.num_rows is 5 and table.num_columns is 3
    # * But len(table) is 3
    # * `for column in table` loops over columns
    assert table.num_rows == 5
    assert table.num_columns == 3
    assert table["foo"].to_pylist() == pydict["foo"]
    assert table["bar"].to_pylist() == pydict["bar"]
    assert table["baz"].to_pylist() == pydict["baz"]

    # Read ids
    table = sdf.read_all(ids=[30, 10])
    assert table.num_rows == 2
    assert table.num_columns == 3
    # Sorted before comparing, so the check does not depend on the order in
    # which the selected rows come back.
    assert sorted(table["foo"].to_pylist()) == [10, 30]
    assert sorted(table["bar"].to_pylist()) == [4.1, 6.3]
    assert sorted(table["baz"].to_pylist()) == ["apple", "cat"]


def test_soma_indexed_dataframe_with_float_dim(tmp_path, arrow_schema):
    """
    Create a SOMAIndexedDataFrame indexed on the float64 column ``bar`` and check
    that ``get_index_column_names()`` reports exactly that column.
    """
    frame = t.SOMAIndexedDataFrame(uri=tmp_path.as_posix())
    frame.create(schema=arrow_schema(), index_column_names=["bar"])

    reported = frame.get_index_column_names()
    assert reported == ["bar"]


@pytest.fixture
def simple_soma_indexed_data_frame(tmp_path):
    """
    A pytest fixture which builds a four-row SOMAIndexedDataFrame under ``tmp_path``.

    Yields the tuple ``(schema, sdf, n_data, index_column_names)``; ``sdf.delete()``
    is called after the yield.
    """
    index_column_names = ["index"]
    column_types = [
        ("index", pa.uint64()),
        ("A", pa.int64()),
        ("B", pa.float64()),
        ("C", pa.large_string()),
    ]
    schema = pa.schema(column_types)

    sdf = t.SOMAIndexedDataFrame(uri=tmp_path.as_posix())
    sdf.create(schema=schema, index_column_names=index_column_names)

    payload = {
        "index": [0, 1, 2, 3],
        "A": [10, 11, 12, 13],
        "B": [100.1, 200.2, 300.3, 400.4],
        "C": ["this", "is", "a", "test"],
    }
    # Row count is taken from the index column of the payload.
    n_data = len(payload["index"])
    sdf.write(pa.Table.from_pydict(payload))

    yield (schema, sdf, n_data, index_column_names)

    # Teardown: remove the dataframe created above.
    sdf.delete()


@pytest.mark.parametrize(
    "ids",
    [None, [0], [1, 3]],
)
@pytest.mark.parametrize(
    "col_names",
    [
        ["A"],
        ["B"],
        ["A", "B"],
        ["index"],
        ["index", "A", "B", "C"],
        None,
    ],
)
def test_SOMAIndexedDataFrame_read_column_names(
    simple_soma_indexed_data_frame, ids, col_names
):
    """
    Check the shape and schema of ``read_all()`` results for every combination of
    ``ids`` selection and ``column_names`` projection (``None`` means "not given").
    """
    schema, sdf, n_data, _index_column_names = simple_soma_indexed_data_frame
    assert sdf.exists()

    def _check_tbl(tbl, col_names, ids):
        print(tbl)

        # With no projection every schema column is expected, in schema order.
        wanted_names = schema.names if col_names is None else col_names
        # With no id selection every written row is expected.
        wanted_rows = n_data if ids is None else len(ids)

        assert tbl.num_columns == len(wanted_names)
        assert tbl.num_rows == wanted_rows

        wanted_fields = [schema.field(name) for name in wanted_names]
        assert tbl.schema == pa.schema(wanted_fields)

    # Read with both the id selection and the column projection applied.
    selected = sdf.read_all(ids=ids, column_names=col_names)
    _check_tbl(selected, col_names, ids)

    # Read with the column projection only.
    projected = sdf.read_all(column_names=col_names)
    _check_tbl(projected, col_names, None)

    # TODO: currently unimplemented. Enable tests when issue #329 is resolved.
    #
    # _check_tbl(
    #     pa.Table.from_pandas(
    #         pd.concat(sdf.read_as_pandas(ids=ids, column_names=col_names))
    #     ),
    #     col_names,
    #     ids,
    # )
    # _check_tbl(
    #     pa.Table.from_pandas(sdf.read_as_pandas_all(column_names=col_names)),
    #     col_names,
    #     None,
    # )