Skip to content

Commit bcd6538

Browse files
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 8344208 commit bcd6538

File tree

2 files changed

+153
-103
lines changed

2 files changed

+153
-103
lines changed

machine_learning/xgboostclassifier.py

Lines changed: 55 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,55 +1,79 @@
"""Titanic survival prediction with XGBoost.

Reads the Kaggle Titanic train/test CSVs, engineers cabin/title
features, one-hot encodes and scales them, fits an ``XGBClassifier``,
and writes predictions to ``xgb_submission.csv``.

Reference: https://xgboost.readthedocs.io/en/stable/
"""

import matplotlib.pyplot as plt  # kept for interactive/notebook plotting
import numpy as np
import pandas as pd
import seaborn as sns  # kept for interactive/notebook plotting
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

training = pd.read_csv("../input/titanic/train.csv")
test = pd.read_csv("../input/titanic/test.csv")

# Tag each row with its origin so the frames can be concatenated for
# joint feature engineering and split apart again afterwards.
training["train_test"] = 1
test["train_test"] = 0
test["Survived"] = np.nan  # placeholder target; np.NaN was removed in NumPy 2.0
all_data = pd.concat([training, test])

# Feature engineering:
#   cabin_mul  - number of cabins listed on the ticket (0 when missing)
#   cabin_adv  - first character of the cabin code (deck letter, "n" for NaN)
#   name_title - honorific parsed from "Surname, Title. Given Names"
all_data["cabin_mul"] = all_data.Cabin.apply(
    lambda x: 0 if pd.isna(x) else len(x.split(" "))
)
all_data["cabin_adv"] = all_data.Cabin.apply(lambda x: str(x)[0])
all_data["name_title"] = all_data.Name.apply(
    lambda x: x.split(",")[1].split(".")[0].strip()
)

# Impute with medians computed on the *training* rows only, so no test
# statistics leak into the features.
all_data.Age = all_data.Age.fillna(training.Age.median())
all_data.Fare = all_data.Fare.fillna(training.Fare.median())
all_data.dropna(subset=["Embarked"], inplace=True)
all_data["norm_fare"] = np.log(all_data.Fare + 1)  # log-transform tames fare skew
all_data.Pclass = all_data.Pclass.astype(str)  # treat passenger class as categorical
all_data["Age"] = all_data["Age"].apply(np.int64)

all_dummies = pd.get_dummies(
    all_data[
        [
            "Pclass",
            "Sex",
            "Age",
            "SibSp",
            "Parch",
            "norm_fare",
            "Embarked",
            "cabin_adv",
            "cabin_mul",
            "name_title",
            "train_test",
        ]
    ]
)

# Standardize the numeric columns; the 0/1 dummy columns are left as-is.
scale = StandardScaler()
all_dummies_scaled = all_dummies.copy()
all_dummies_scaled[["Age", "SibSp", "Parch", "norm_fare"]] = scale.fit_transform(
    all_dummies_scaled[["Age", "SibSp", "Parch", "norm_fare"]]
)

# Split the combined frame back into train/test feature matrices.
X_train_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 1].drop(
    ["train_test"], axis=1
)
X_test_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 0].drop(
    ["train_test"], axis=1
)
y_train = all_data[all_data.train_test == 1].Survived

xgb = XGBClassifier()
xgb.fit(X_train_scaled, y_train)

# Predictions come back as floats; cast to int for the 0/1 submission format.
y_hat_base_vc = xgb.predict(X_test_scaled).astype(int)
basic_submission = {"PassengerId": test.PassengerId, "Survived": y_hat_base_vc}
base_submission = pd.DataFrame(data=basic_submission)
base_submission.to_csv("xgb_submission.csv", index=False)

machine_learning/xgboostregressor.py

Lines changed: 98 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,122 +1,148 @@
"""House-price feature engineering for an XGBoost regressor.

Combines the Ames Housing dataset with the Kaggle House Prices
train/test sets, fills missing values, engineers a few aggregate
features, one-hot encodes categoricals, and log-transforms skewed
columns.  Model fitting and submission writing follow further down
in this file.

Reference: https://xgboost.readthedocs.io/en/stable/
"""

import os

import matplotlib.pyplot as plt  # kept for interactive/notebook plotting
import numpy as np
import pandas as pd
import seaborn as sns  # kept for interactive/notebook plotting
from scipy.stats import skew
from xgboost import XGBRegressor

# List every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

trainAmes = pd.read_csv("/kaggle/input/ames-housing-dataset/AmesHousing.csv")
test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
train = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
)

# Align the Ames column names with the Kaggle competition naming.
trainAmes.columns = trainAmes.columns.str.replace(" ", "")
trainAmes = trainAmes.rename(columns={"YearRemod/Add": "YearRemodAdd"})

# Drop identifier/target columns before duplicate detection so that rows
# shared between the Ames and Kaggle sets compare equal.
data = pd.concat([trainAmes, train, test], axis=0, sort=False)
print("Size of the Housing Dataset", len(data))
useless = ["Id", "PID", "Order", "SalePrice"]
data = data.drop(useless, axis=1)
duplicate = data[data.duplicated(keep="last")].index

# NOTE(review): only the first 390 duplicate labels are kept, and they are
# index labels from the *combined* frame applied to trainAmes — confirm
# this matches the intended de-duplication.
duplicate = duplicate[0:390]
trainAmes = trainAmes.drop(duplicate, axis=0)

training = pd.concat([trainAmes, train], axis=0, sort=False)
useless = ["Id", "PID", "Order"]
training = training.drop(useless, axis=1)

# Separate target and features.
target = training["SalePrice"]
test_id = test["Id"]
test = test.drop(["Id"], axis=1)
training2 = training.drop(["SalePrice"], axis=1)

# Concatenate train & test so the imputation/encoding below is applied
# consistently to both.
train_test = pd.concat([training2, test], axis=0, sort=False)

# Fill categorical NaNs whose replacement is documented in the dataset
# description; fall back to the mode for the rest.
train_test["Functional"] = train_test["Functional"].fillna("Typ")
train_test["Electrical"] = train_test["Electrical"].fillna("SBrkr")
train_test["KitchenQual"] = train_test["KitchenQual"].fillna("TA")
train_test["Exterior1st"] = train_test["Exterior1st"].fillna(
    train_test["Exterior1st"].mode()[0]
)
train_test["Exterior2nd"] = train_test["Exterior2nd"].fillna(
    train_test["Exterior2nd"].mode()[0]
)
train_test["SaleType"] = train_test["SaleType"].fillna(train_test["SaleType"].mode()[0])

# "None" marks a genuinely absent amenity rather than missing data.
for col in (
    "PoolQC",
    "Alley",
    "FireplaceQu",
    "Fence",
    "MiscFeature",
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
):
    train_test[col] = train_test[col].fillna("None")

# Numeric garage/basement measures default to 0 when the feature is absent.
for col in (
    "GarageArea",
    "GarageCars",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtFullBath",
    "BsmtHalfBath",
    "MasVnrArea",
    "BsmtUnfSF",
    "TotalBsmtSF",
):
    train_test[col] = train_test[col].fillna(0)

train_test["LotFrontage"] = train_test["LotFrontage"].fillna(
    train["LotFrontage"].median()
)

# Report any features that still contain NaNs.
for col in train_test:
    if train_test[col].isna().sum() > 0:
        # NOTE(review): this prints a sample value at label 1, not the
        # column name; `print(col)` was probably intended.
        print(train_test[col][1])

# Convert non-numeric predictors stored as numbers into strings.
train_test["MSSubClass"] = train_test["MSSubClass"].apply(str)
train_test["YrSold"] = train_test["YrSold"].apply(str)
train_test["MoSold"] = train_test["MoSold"].apply(str)
train_test["OverallQual"] = train_test["OverallQual"].apply(str)
train_test["OverallCond"] = train_test["OverallCond"].apply(str)

# Aggregate features.
train_test["SqFtPerRoom"] = train_test["GrLivArea"] / (
    train_test["TotRmsAbvGrd"]
    + train_test["FullBath"]
    + train_test["HalfBath"]
    + train_test["KitchenAbvGr"]
)
# OverallQual/OverallCond were cast to str above, so "+" here is string
# concatenation, yielding a categorical quality code (e.g. "75").
train_test["Total_Home_Quality"] = train_test["OverallQual"] + train_test["OverallCond"]
train_test["Total_Bathrooms"] = (
    train_test["FullBath"]
    + (0.5 * train_test["HalfBath"])
    + train_test["BsmtFullBath"]
    + (0.5 * train_test["BsmtHalfBath"])
)
train_test["HighQualSF"] = train_test["1stFlrSF"] + train_test["2ndFlrSF"]
train_test["renovated"] = train_test["YearRemodAdd"] + train_test["YearBuilt"]

# Remove variables that are no longer useful.
useless = ["GarageYrBlt", "YearRemodAdd"]
train_test = train_test.drop(useless, axis=1)

# Create dummy variables from categorical features.
train_test_dummy = pd.get_dummies(train_test)

# Normalize features whose skewness exceeds 0.5 with a log transform.
numeric_features = train_test_dummy.dtypes[train_test_dummy.dtypes != object].index
skewed_features = (
    train_test_dummy[numeric_features]
    .apply(lambda x: skew(x))
    .sort_values(ascending=False)
)
high_skew = skewed_features[skewed_features > 0.5]
skew_index = high_skew.index

for i in skew_index:
    train_test_dummy[i] = np.log1p(train_test_dummy[i])

target_log = np.log1p(target)
@@ -128,11 +154,11 @@
# Fit the regressor on the log-transformed target and write the submission.
# NOTE(review): X_train is defined in lines 149-153 of the file, which are
# not shown in this diff hunk — confirm the 4000-row split there matches
# the [4000:] slice below.
X_test = train_test_dummy[4000:]

xgb = XGBRegressor()
xgb.fit(X_train, target_log)

# Predict in log space, then invert the transform for real sale prices.
test_pred = xgb.predict(X_test)
submission = pd.DataFrame(test_id, columns=["Id"])
test_pred = np.expm1(test_pred)
submission["SalePrice"] = test_pred
submission.to_csv("xgb.csv", index=False, header=True)

0 commit comments

Comments (0)