-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathData_prep_utils.py
More file actions
69 lines (59 loc) · 1.73 KB
/
Data_prep_utils.py
File metadata and controls
69 lines (59 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import pandas as pd
import numpy as np
from CM_preparation import get_standardized_CM
def check_in(
    a,
    b
):
    """Return True if every species in ``a`` (ignoring 0 and 1) occurs in ``b``.

    Parameters
    ----------
    a : array-like
        Atomic numbers of one molecule. Entries equal to 1 (hydrogen) and
        0 (presumably padding -- confirm against the dataset) are ignored.
    b : array-like
        Atomic numbers that are allowed.

    Returns
    -------
    bool
        True when all remaining distinct values of ``a`` are found in ``b``.
        A molecule with nothing left after dropping 0 and 1 yields True.
    """
    species = np.unique(a)
    # 0 and 1 never disqualify a molecule, so drop them before the lookup.
    species = species[(species != 1) & (species != 0)]
    # Vectorised membership test; ``all()`` of an empty array is True, which
    # matches the "nothing to reject" case.
    return bool(np.isin(species, np.unique(b)).all())
def pick_molecules(
    df,
    species,
    name='atom_numbers'
):
    """Select molecules built only from ``species`` and drop single-heavy-atom ones.

    Parameters
    ----------
    df : pandas.DataFrame
        One molecule per row; column ``name`` holds that molecule's atomic
        numbers as a sequence.
    species : array-like
        Allowed atomic numbers, passed to ``check_in`` for every row.
    name : str, optional
        Column holding the atomic numbers.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` that pass ``check_in`` and whose count of entries
        different from both 0 and 1 is not exactly one. Rows with no such
        entries are kept. The input frame is not modified.
    """
    # Plain boolean arrays are used as masks so that an empty frame is
    # filtered to an empty frame instead of raising.
    allowed = np.array(
        [check_in(atoms, species) for atoms in df[name]], dtype=bool
    )
    df = df[allowed]

    # Count heavy atoms row by row from the column selected by ``name``, so
    # rows do not need to share a common length.
    heavy_counts = np.array(
        [_count_heavy_atoms(atoms) for atoms in df[name]], dtype=np.int64
    )
    return df[heavy_counts != 1]


def _count_heavy_atoms(atoms):
    """Return how many entries of ``atoms`` are neither 0 nor 1."""
    atoms = np.asarray(atoms)
    return int(np.count_nonzero((atoms != 0) & (atoms != 1)))
def count_elements(
    df,
    element_list,
    name='atom_numbers'
):
    """Add one column per element holding its number of occurrences per row.

    Parameters
    ----------
    df : pandas.DataFrame
        One molecule per row; column ``name`` holds its atomic numbers.
        The frame is modified in place.
    element_list : iterable
        Atomic numbers to count. Each value is also used as the label of
        the column that receives the counts.
    name : str, optional
        Column holding the atomic numbers.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the count columns added or overwritten.
    """
    # Convert each row once up front rather than twice per (row, element).
    atoms_per_row = [np.asarray(atoms) for atoms in df[name]]
    for element in element_list:
        df[element] = np.array(
            [np.count_nonzero(atoms == element) for atoms in atoms_per_row],
            dtype=np.int64,
        )
    return df
def de_hydrogenize_positions(
    df,
    name_p='positions',
    name_a='atom_numbers'
):
    """Remove the positions of hydrogen atoms from every row.

    Parameters
    ----------
    df : pandas.DataFrame
        One molecule per row. The frame is modified in place.
    name_p : str, optional
        Column holding, per row, a sequence of atom positions.
    name_a : str, optional
        Column holding, per row, the atomic numbers; entry ``i`` describes
        position ``i``. It may be longer than the position sequence (extra
        entries are ignored) but must not be shorter.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; column ``name_p`` now holds plain lists
        containing only the positions whose atomic number is not 1.
        Column ``name_a`` is left unchanged.
    """
    # Rows are paired by position rather than by index label, so frames
    # with repeated index labels are handled correctly.
    kept = np.empty(len(df), dtype=object)
    for row, (positions, atoms) in enumerate(zip(df[name_p], df[name_a])):
        kept[row] = [
            xyz for i, xyz in enumerate(positions) if atoms[i] != 1
        ]
    df[name_p] = kept
    return df
def compute_standardized_CM(
    df,
    element_list,
    name_p='positions',
    name_a='atom_numbers'
):
    """Add a ``'CM'`` column computed by ``get_standardized_CM`` for every row.

    Parameters
    ----------
    df : pandas.DataFrame
        One molecule per row. Must already contain one column per entry of
        ``element_list`` (labelled by that entry) holding the per-row count
        of that element. The frame is modified in place.
    element_list : list
        Atomic numbers that make up the master vector.
    name_p : str, optional
        Column holding the atom positions.
    name_a : str, optional
        Column holding the atomic numbers.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``'CM'`` column added or overwritten.

    Notes
    -----
    The master vector lists every element of ``element_list`` repeated as
    many times as its largest per-row count, sorted in descending order.
    It is passed unchanged to ``get_standardized_CM(atom_numbers, positions,
    master_vec)``; what that helper returns is defined in ``CM_preparation``
    (presumably a Coulomb-matrix representation -- confirm there).
    """
    # Largest count of each element over all rows. ``max`` yields NaN for
    # an empty frame; treat that as "no atoms of this element".
    max_counts = df[element_list].max().fillna(0).astype(int).values

    # build master vector
    master_vec = []
    for element, count in zip(element_list, max_counts):
        # ``int`` guarantees a valid repetition count for the list multiply.
        master_vec.extend([element] * int(count))
    master_vec.sort(reverse=True)

    # creates a column with CM representation, reading the columns selected
    # by ``name_p`` / ``name_a``
    df['CM'] = df[name_p].combine(
        df[name_a],
        lambda positions, atoms: get_standardized_CM(atoms, positions, master_vec),
    )
    return df