Skip to content

Commit dca76fc

Browse files
Update synthetic weather-forecasting
1 parent c58e0f9 commit dca76fc

File tree

1 file changed

+125
-0
lines changed

1 file changed

+125
-0
lines changed

synthetic weather-forecasting

+125
Original file line numberDiff line numberDiff line change
@@ -1 +1,126 @@
1+
from data_structures.custom_queue import Queue
2+
3+
import pandas as pd
4+
import numpy as np
5+
6+
# Read the raw events table from disk.
file_path = 'events.csv'
data = pd.read_csv(file_path)

# Step 1: Parse the UTC start column so timestamp arithmetic works on it.
data['Start time UTC'] = pd.to_datetime(data['Start time UTC'])

# Step 2: Re-base the timeline so the earliest record starts at 2022-01-01 00:00.
original_start_date = data['Start time UTC'].min()
new_start_date = pd.Timestamp('2022-01-01 00:00:00')
date_offset = new_start_date - original_start_date

# Every timestamp column moves by the same offset, so the spacing between
# rows (and between the UTC and UTC+03:00 columns) is left unchanged.
data['Start time UTC'] = data['Start time UTC'] + date_offset
for time_column in ('End time UTC', 'Start time UTC+03:00', 'End time UTC+03:00'):
    data[time_column] = pd.to_datetime(data[time_column]) + date_offset

# Step 3: Relabel the consumption column from Finland to India.
data = data.rename(
    columns={'Electricity consumption in Finland': 'Electricity consumption in India'}
)

# Step 4: Keep only rows whose shifted start time falls in 2022, 2023 or 2024.
starts_on_or_after_2022 = data['Start time UTC'] >= '2022-01-01'
starts_before_2025 = data['Start time UTC'] < '2025-01-01'
data = data[starts_on_or_after_2022 & starts_before_2025]

# Step 5: Show the first rows as a quick sanity check.
print(data.head())

# Persist the transformed table; the modelling script below reads this file.
data.to_csv('energy_consumption_india_2022_2024.csv', index=False)
38+
39+
40+
import pandas as pd
41+
import numpy as np
42+
import matplotlib.pyplot as plt
43+
from sklearn.model_selection import train_test_split
44+
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
45+
from sklearn.linear_model import LinearRegression
46+
from sklearn.tree import DecisionTreeRegressor
47+
from sklearn.ensemble import RandomForestRegressor
48+
49+
# Step 1: Read back the table written by the preprocessing step.
file_path = 'energy_consumption_india_2022_2024.csv'
data = pd.read_csv(file_path)

# CSV round-trips timestamps as text, so parse the start column again.
data['Start time UTC'] = pd.to_datetime(data['Start time UTC'])

# Step 2: Feature Engineering
# Calendar features derived from the start timestamp. The insertion order
# (year, month, day, hour, day_of_week) fixes the feature column order.
start_time = data['Start time UTC']
calendar_parts = (
    ('year', 'year'),
    ('month', 'month'),
    ('day', 'day'),
    ('hour', 'hour'),
    ('day_of_week', 'dayofweek'),
)
for feature_name, dt_attribute in calendar_parts:
    data[feature_name] = getattr(start_time.dt, dt_attribute)

# Lag features: the consumption value 1, 7 and 30 rows earlier.
# NOTE(review): the lags are counted in rows, not days; what they mean in
# wall-clock time depends on the sampling interval of the CSV -- confirm.
consumption = data['Electricity consumption in India']
for lag in (1, 7, 30):
    data[f'lag_{lag}'] = consumption.shift(lag)

# Shifting leaves NaN in the first rows; drop every row that has any NaN.
data = data.dropna()
69+
70+
# Step 3: Prepare the data for ML models
# Order rows by start time so that the positional split below is chronological.
# mergesort is stable, so rows sharing a timestamp keep their relative order.
data = data.sort_values('Start time UTC', kind='mergesort')

# Features are everything except the target and the raw timestamp columns;
# errors='ignore' tolerates timestamp columns that are absent from the CSV.
X = data.drop(['Electricity consumption in India', 'Start time UTC', 'End time UTC',
               'Start time UTC+03:00', 'End time UTC+03:00'], axis=1, errors='ignore')
y = data['Electricity consumption in India']

# Train-test split (first 80% of the timeline for training, last 20% for testing).
# shuffle=False keeps the split chronological: a random split would place rows
# from the future in the training set, and because the lag features carry the
# target values of neighbouring rows, that leaks test information into training
# and inflates the reported scores. It also keeps y_test in time order for the
# "Time Index" plot further down.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
77+
78+
# Step 4: Fit each candidate regressor and score it on the held-out rows.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
}

# Maps model name -> {'MAE': ..., 'MSE': ..., 'R2 Score': ...}.
results = {}

for model_name, model in models.items():
    # Fit on the training rows, then predict the test rows.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Error metrics comparing the predictions with the true test targets.
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Report the metrics for this model, rounded to two decimals.
    print(f"{model_name} Evaluation:")
    print(f"MAE: {mae:.2f}")
    print(f"MSE: {mse:.2f}")
    print(f"R2 Score: {r2:.2f}")
    print('-' * 30)

    # Keep the unrounded values for the summary table written later.
    results[model_name] = {'MAE': mae, 'MSE': mse, 'R2 Score': r2}
108+
109+
# Step 5: Visualize Actual vs Predicted for the Best Model
# Pick the best model from the measured metrics (highest R2 score in `results`)
# instead of assuming it is always the Random Forest.
best_model_name = max(results, key=lambda name: results[name]['R2 Score'])
best_model = models[best_model_name]
y_pred_best = best_model.predict(X_test)

# Overlay the true test targets and the chosen model's predictions,
# both plotted against their position in the test set.
plt.figure(figsize=(12, 6))
plt.plot(y_test.values, label='Actual', color='blue', alpha=0.7)
plt.plot(y_pred_best, label=f'{best_model_name} Predicted', color='orange', alpha=0.7)
plt.title('Energy Consumption Forecasting: Actual vs Predicted')
plt.xlabel('Time Index')
plt.ylabel('Electricity Consumption in India')
plt.legend()
plt.grid(True)
plt.show()
122+
123+
# Step 6: Write the per-model metrics to disk (optional).
# Building a frame from the nested dict puts models in columns; transposing
# gives one row per model and one column per metric.
metrics_by_column = pd.DataFrame(results)
results_df = metrics_by_column.transpose()
results_df.to_csv('model_performance.csv', index=True)
1126

0 commit comments

Comments
 (0)