-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path random_forest.py
More file actions
186 lines (148 loc) · 6.21 KB
/
random_forest.py
File metadata and controls
186 lines (148 loc) · 6.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import randint
# Load the daily gold-price workbook and keep only records after 2012-01-01.
# NOTE(review): hard-coded absolute path — consider a CLI argument or config.
data = pd.read_excel(r'/Users/prashankulathunga/Documents/Projects/GOLD_PRICE_PREDICT-SYSTEM/Project_G/pythonProject/data_set/Daily.xlsx')
data = data[data['Date'] > '2012-01-01']
data.set_index('Date', inplace=True)

# (An IQR-based outlier filter on 'USD' was previously here, commented out;
# re-enable it only if extreme prices distort the fit.)

# Derive calendar features from the datetime index — these are the only
# predictors the model sees.
data['Year'] = data.index.year
data['Month'] = data.index.month
data['Day'] = data.index.day

# Scale the target to [0, 1].
# NOTE(review): the scaler is fitted on the FULL series before the split,
# which leaks test-set statistics into training; for a strict evaluation,
# fit it on the training portion only.
scaler = MinMaxScaler()
data['USD'] = scaler.fit_transform(data[['USD']])
print(data.isna().sum())

# Features (X) and target (y).
X = data.drop(columns=['USD'])  # calendar features only
y = data['USD']                 # scaled price

# Chronological hold-out split. Bug fix: the original shuffled the rows
# (train_test_split default), which is inconsistent with the TimeSeriesSplit
# cross-validation used below and lets the model train on future dates.
# Keep the last 30% of the timeline as the test set instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")
# Hyper-parameter search space for the random forest: each entry is a
# discrete uniform distribution sampled by RandomizedSearchCV.
search_space = {
    'n_estimators': randint(100, 600),
    'max_depth': randint(5, 25),
    'min_samples_split': randint(2, 12),
    'min_samples_leaf': randint(1, 10),
}

# Walk-forward validation: each fold trains on an expanding window of the
# past and validates on the next slice, respecting temporal order.
time_folds = TimeSeriesSplit(n_splits=12)

# Randomized search over the space above, scored by (negated) MAE so that
# higher is better, as scikit-learn expects.
search = RandomizedSearchCV(
    RandomForestRegressor(random_state=7),
    param_distributions=search_space,
    n_iter=80,                            # parameter settings sampled
    cv=time_folds,
    scoring='neg_mean_absolute_error',
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Run the search on the training split and keep the winning estimator.
search.fit(X_train, y_train)
print("Best Parameters:", search.best_params_)
model = search.best_estimator_

# Refit the chosen configuration on the full training set.
model.fit(X_train, y_train)
# Fit a degree-1 (linear) trend line to the scaled USD series and overlay it
# on the actual prices. polyfit needs numbers, so convert the datetime index
# to integer day offsets from the first date.
data['date_numeric'] = (data.index - data.index.min()).days

degree = 1  # linear trend
coefficients = np.polyfit(data['date_numeric'], data['USD'], degree)
polynomial = np.poly1d(coefficients)

# Evaluate the fitted polynomial at every observed date.
data['best_fit'] = polynomial(data['date_numeric'])

# Plot the actual series against the trend line.
plt.figure(figsize=(12, 6))
plt.plot(data.index, data['USD'], color='green', label='Actual Price', alpha=0.8)
plt.plot(data.index, data['best_fit'], color='red', label='Best Fit Line')
# Bug fix: the data is filtered from 2012-01-01, not 2020 — title now matches.
plt.title('Best Fit Line Gold Price since 2012', pad=16)
plt.xlabel('Date', labelpad=12)
plt.ylabel('Price (USD)', labelpad=12)
plt.gca().spines[:].set_visible(False)
plt.legend()
plt.show()
# Score the tuned model on the hold-out set.
y_predict = model.predict(X_test)
print(y_predict[:5])

# Map both series back from [0, 1] to dollar prices for reporting.
actual_usd = scaler.inverse_transform(y_test.values.reshape(-1, 1))
predicted_usd = scaler.inverse_transform(y_predict.reshape(-1, 1))
print("Original y_test:", actual_usd[:5])
print("Original y_predict:", predicted_usd[:5])

# Error metrics in the original price scale.
mae = mean_absolute_error(actual_usd, predicted_usd)
mse = mean_squared_error(actual_usd, predicted_usd)
r2 = r2_score(actual_usd, predicted_usd)
print(f"MAE: {mae}\nMSE: {mse}\nR²: {r2}")

# Tabulate per-date actual vs. predicted prices (1-D arrays for DataFrame).
comparison = pd.DataFrame({
    'Date': y_test.index,
    'Actual_Price': actual_usd.ravel(),
    'Predicted_Price': predicted_usd.ravel(),
})

# Scatter the two series against each other over time.
plt.style.use('ggplot')
plt.figure(figsize=(10, 5))
plt.scatter(comparison['Date'], comparison['Actual_Price'], color='r', alpha=0.8, label='Actual Price')
plt.scatter(comparison['Date'], comparison['Predicted_Price'], color='b', alpha=0.4,
            label='Predicted Price')
plt.title('Gold Price Actual vs Predicted', pad=16)
plt.xlabel('Date', labelpad=12)
plt.ylabel('Price (USD)', labelpad=12)
plt.gca().spines[:].set_visible(False)
plt.legend()
plt.show()
# Evaluate the model on a separate monthly-dates workbook (raw USD prices).
m_data = pd.read_excel(r'/Users/prashankulathunga/Documents/Projects/GOLD_PRICE_PREDICT-SYSTEM/Project_G/pythonProject/data_set/month_dates.xlsx')
m_data['Date'] = pd.to_datetime(m_data['Date'])  # ensure datetime dtype
m_data.set_index('Date', inplace=True)

# Same calendar features the model was trained on.
m_data['Year'] = m_data.index.year
m_data['Month'] = m_data.index.month
m_data['Day'] = m_data.index.day

X_m = m_data.drop(columns=['USD'])  # features
y_m = m_data['USD']                 # raw (unscaled) target

# Predict scaled prices, then invert the scaling.
# Bug fix: inverse_transform expects shape (n_samples, 1); the previous code
# passed [predictions] — shape (1, n), i.e. one sample with n features —
# which only produced correct values by NumPy broadcasting accident.
monthpredict_data = model.predict(X_m)
monthpredict_data = scaler.inverse_transform(monthpredict_data.reshape(-1, 1)).flatten()

# Side-by-side table of actual vs. predicted monthly prices.
monthpredict_dataframe = pd.DataFrame({
    'Actual_Price': y_m,
    'Predicted_price': monthpredict_data
})
print(monthpredict_dataframe)

# Metrics against the raw monthly prices (both sides in USD).
mae = mean_absolute_error(y_m, monthpredict_data)
mse = mean_squared_error(y_m, monthpredict_data)
r2 = r2_score(y_m, monthpredict_data)
print(f"MAE: {mae}\nMSE: {mse}\nR²: {r2}")
# Line chart of monthly actual vs. predicted gold prices over time.
plt.style.use('ggplot')
plt.figure(figsize=(10, 5))
dates = monthpredict_dataframe.index
plt.plot(dates, monthpredict_dataframe['Actual_Price'],
         color='r', alpha=0.8, label='Actual Price')
plt.plot(dates, monthpredict_dataframe['Predicted_price'],
         color='b', alpha=0.4, label='Predicted Price')
plt.title('Gold Price Actual vs Predicted', pad=16)
plt.xlabel('Date', labelpad=12)
plt.ylabel('Price (USD)', labelpad=12)
plt.gca().spines[:].set_visible(False)
plt.legend()
plt.show()