Skip to content

Commit 7dc6dbf

Browse files
nandiyaFVFYK3GEHV22cclauss
authored andcommitted
* add forecasting code * add statsmodel * sort import * sort import fix * fixing black * sort requirement * optimize code * try with limited data * sort again * sort fix * sort fix * delete warning and black * add code for forecasting * use black * add more hints to describe * add doctest * finding whitespace * fixing doctest * delete * revert back * revert back * revert back again * revert back again * revert back again * try trimming whitespace * try adding doctypeand etc * fixing reviews * deleting all the space * fixing the build * delete x * add description for safety checker * deleting subscription integer * fix docthint * make def to use function parameters and return values * make def to use function parameters and return values * type hints on data safety checker * optimize code * Update run.py Co-authored-by: FVFYK3GEHV22 <[email protected]> Co-authored-by: Christian Clauss <[email protected]>
1 parent 3ef1a03 commit 7dc6dbf

File tree

4 files changed

+271
-0
lines changed

4 files changed

+271
-0
lines changed

machine_learning/forecasting/__init__.py

Whitespace-only changes.
+114
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
total_user,total_events,days
2+
18231,0.0,1
3+
22621,1.0,2
4+
15675,0.0,3
5+
23583,1.0,4
6+
68351,5.0,5
7+
34338,3.0,6
8+
19238,0.0,0
9+
24192,0.0,1
10+
70349,0.0,2
11+
103510,0.0,3
12+
128355,1.0,4
13+
148484,6.0,5
14+
153489,3.0,6
15+
162667,1.0,0
16+
311430,3.0,1
17+
435663,7.0,2
18+
273526,0.0,3
19+
628588,2.0,4
20+
454989,13.0,5
21+
539040,3.0,6
22+
52974,1.0,0
23+
103451,2.0,1
24+
810020,5.0,2
25+
580982,3.0,3
26+
216515,0.0,4
27+
134694,10.0,5
28+
93563,1.0,6
29+
55432,1.0,0
30+
169634,1.0,1
31+
254908,4.0,2
32+
315285,3.0,3
33+
191764,0.0,4
34+
514284,7.0,5
35+
181214,4.0,6
36+
78459,2.0,0
37+
161620,3.0,1
38+
245610,4.0,2
39+
326722,5.0,3
40+
214578,0.0,4
41+
312365,5.0,5
42+
232454,4.0,6
43+
178368,1.0,0
44+
97152,1.0,1
45+
222813,4.0,2
46+
285852,4.0,3
47+
192149,1.0,4
48+
142241,1.0,5
49+
173011,2.0,6
50+
56488,3.0,0
51+
89572,2.0,1
52+
356082,2.0,2
53+
172799,0.0,3
54+
142300,1.0,4
55+
78432,2.0,5
56+
539023,9.0,6
57+
62389,1.0,0
58+
70247,1.0,1
59+
89229,0.0,2
60+
94583,1.0,3
61+
102455,0.0,4
62+
129270,0.0,5
63+
311409,1.0,6
64+
1837026,0.0,0
65+
361824,0.0,1
66+
111379,2.0,2
67+
76337,2.0,3
68+
96747,0.0,4
69+
92058,0.0,5
70+
81929,2.0,6
71+
143423,0.0,0
72+
82939,0.0,1
73+
74403,1.0,2
74+
68234,0.0,3
75+
94556,1.0,4
76+
80311,0.0,5
77+
75283,3.0,6
78+
77724,0.0,0
79+
49229,2.0,1
80+
65708,2.0,2
81+
273864,1.0,3
82+
1711281,0.0,4
83+
1900253,5.0,5
84+
343071,1.0,6
85+
1551326,0.0,0
86+
56636,1.0,1
87+
272782,2.0,2
88+
1785678,0.0,3
89+
241866,0.0,4
90+
461904,0.0,5
91+
2191901,2.0,6
92+
102925,0.0,0
93+
242778,1.0,1
94+
298608,0.0,2
95+
322458,10.0,3
96+
216027,9.0,4
97+
916052,12.0,5
98+
193278,12.0,6
99+
263207,8.0,0
100+
672948,10.0,1
101+
281909,1.0,2
102+
384562,1.0,3
103+
1027375,2.0,4
104+
828905,9.0,5
105+
624188,22.0,6
106+
392218,8.0,0
107+
292581,10.0,1
108+
299869,12.0,2
109+
769455,20.0,3
110+
316443,8.0,4
111+
1212864,24.0,5
112+
1397338,28.0,6
113+
223249,8.0,0
114+
191264,14.0,1

machine_learning/forecasting/run.py

+156
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
"""
2+
This is code for forecasting,
3+
but I modified it and used it as a safety checker for data.
4+
For example: you have an online shop and, for some reason, some data are
5+
missing (the amount of data is not what you expected);
6+
then we can use it.
7+
*PS: 1. Of course we can use a normal statistical method, but in this case
8+
the data is quite absurd and there is only a little of it.
9+
2. Of course you can use this and modify it for forecasting purposes,
10+
e.g. for the next 3 months of sales or something similar;
11+
you can just adjust it for your own purpose.
12+
"""
13+
14+
import numpy as np
15+
import pandas as pd
16+
from sklearn.preprocessing import Normalizer
17+
from sklearn.svm import SVR
18+
from statsmodels.tsa.statespace.sarimax import SARIMAX
19+
20+
21+
def linear_regression_prediction(
    train_dt: list, train_usr: list, train_mtch: list, test_dt: list, test_mtch: list
) -> float:
    """
    First method: linear regression (ordinary least squares).

    Fits ``total_user = b0 + b1 * date + b2 * total_event`` on the training
    data with the normal equation and evaluates the fitted model at the first
    test sample.

    input : training data (date, total_user, total_event) and test data
            (date, total_event), each as a list of float
    output : absolute value of the predicted total user for the first test
             sample, as a float

    The training data below satisfies ``total_user = 2 + total_event`` exactly,
    so the prediction for ``total_event = 2`` is 4.

    >>> round(
    ...     linear_regression_prediction(
    ...         [2, 3, 4, 5], [5, 3, 4, 6], [3, 1, 2, 4], [2, 1], [2, 2]
    ...     ),
    ...     6,
    ... )
    4.0
    """
    # Design matrix with a leading column of ones for the intercept term.
    x = np.array([[1, date, match] for date, match in zip(train_dt, train_mtch)])
    y = np.array(train_usr)
    # Normal equation: beta = (X^T X)^-1 X^T y
    beta = np.linalg.inv(x.T @ x) @ x.T @ y
    # Every feature is multiplied by its own coefficient; the event count was
    # previously *added* to beta[2], which skewed the prediction.
    prediction = beta[0] + test_dt[0] * beta[1] + test_mtch[0] * beta[2]
    return float(abs(prediction))
36+
37+
38+
def sarimax_predictor(train_user: list, train_match: list, test_match: list) -> float:
    """
    Second method: SARIMAX.

    SARIMAX is a statistical method that uses previous input and learns its
    pattern to predict future data.

    input : training data (total_user, with exog data = total_event) in list of float
    output : first element of the total user prediction, in float

    >>> sarimax_predictor([4,2,6,8], [3,1,2,4], [2])
    6.6666671111109626
    """
    fitted = SARIMAX(
        train_user,
        exog=train_match,
        order=(1, 2, 1),
        seasonal_order=(1, 1, 0, 7),
    ).fit(disp=False, maxiter=600, method="nm")
    # NOTE(review): the prediction window starts at index 1, which looks like an
    # in-sample position rather than a step past the training data - confirm.
    first_step, last_step = 1, len(test_match)
    prediction = fitted.predict(first_step, last_step, exog=[test_match])
    return prediction[0]
56+
57+
58+
def support_vector_regressor(x_train: list, x_test: list, train_user: list) -> float:
    """
    Third method: support vector regressor.

    SVR follows the same principles as the SVM (support vector machine) used
    for classification, with only a few minor differences; the main one is
    that it is better suited to regression.

    input : training data (date, total_user, total_event) in list of float
            where x = list of set (date and total event)
    output : first element of the total user prediction, in float

    >>> support_vector_regressor([[5,2],[1,5],[6,2]], [[3,2]], [2,1,4])
    1.634932078116079
    """
    svr_model = SVR(kernel="rbf", C=1, gamma=0.1, epsilon=0.1)
    svr_model.fit(x_train, train_user)
    predictions = svr_model.predict(x_test)
    return predictions[0]
75+
76+
77+
def interquartile_range_checker(train_user: list) -> float:
    """
    Optional method: interquartile range.

    This method can be used to check whether some data is an outlier or not.
    The lower limit is computed as ``q1 - 0.1 * (q3 - q1)``.

    input : list of total user in float (left unmodified)
    output : low limit of input in float

    >>> interquartile_range_checker([1,2,3,4,5,6,7,8,9,10])
    2.8
    """
    # np.percentile does not require sorted input, so the caller's list is no
    # longer sorted in place as a side effect.
    q1, q3 = np.percentile(train_user, [25, 75])
    iqr = q3 - q1
    return float(q1 - (iqr * 0.1))
92+
93+
94+
def data_safety_checker(list_vote: list, actual_result: float) -> None:
    """
    Review all the votes (list of predicted results) and compare them to the
    actual result.

    A vote counts as "safe" when the prediction is greater than the actual
    result, or when their absolute values differ by at most 0.1; otherwise it
    counts as "not safe". The data is reported safe only when safe votes
    strictly outnumber not-safe votes.

    input : list of predictions, and the actual result as a float
    output : print whether it's safe or not

    >>> data_safety_checker([2,3,4],5.0)
    Today's data is not safe.
    >>> data_safety_checker([6, 6, 6, 1, 1], 5.0)
    Today's data is safe.
    """
    safe = 0
    not_safe = 0
    for vote in list_vote:
        if vote > actual_result or abs(abs(vote) - abs(actual_result)) <= 0.1:
            # Increment the safe counter; it used to be overwritten with
            # ``not_safe + 1``, which discarded earlier safe votes.
            safe += 1
        else:
            not_safe += 1
    print(f"Today's data is {'not ' if safe <= not_safe else ''}safe.")
114+
115+
116+
# data_input_df = pd.read_csv("ex_data.csv", header=None)
data_input = [[18231, 0.0, 1], [22621, 1.0, 2], [15675, 0.0, 3], [23583, 1.0, 4]]
data_input_df = pd.DataFrame(data_input, columns=["total_user", "total_even", "days"])

# data columns = total user in a day, how many online events were held that day,
# and which day of the week it is (sunday-saturday)

# start normalization
normalize_df = Normalizer().fit_transform(data_input_df.values)

# split data (column order follows data_input: user, event, day)
total_date = normalize_df[:, 2].tolist()
total_user = normalize_df[:, 0].tolist()
total_match = normalize_df[:, 1].tolist()

# for svr (input variable = total match and total date)
x = normalize_df[:, [1, 2]].tolist()
x_train = x[:-1]
x_test = x[-1:]

# for linear regression & sarimax: every row but the last is training data
trn_date = total_date[:-1]
trn_user = total_user[:-1]
trn_match = total_match[:-1]

# the last row is held out as the single test sample
tst_date = total_date[-1:]
tst_user = total_user[-1:]
tst_match = total_match[-1:]

# voting system with forecasting: one vote per prediction method
res_vote = [
    linear_regression_prediction(trn_date, trn_user, trn_match, tst_date, tst_match),
    sarimax_predictor(trn_user, trn_match, tst_match),
    support_vector_regressor(x_train, x_test, trn_user),
]

# check the safety of today's data
# tst_user is a one-element list; data_safety_checker expects the float itself
data_safety_checker(res_vote, tst_user[0])

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ qiskit
1111
requests
1212
scikit-fuzzy
1313
sklearn
14+
statsmodels
1415
sympy
1516
tensorflow
1617
xgboost

0 commit comments

Comments
 (0)