import os

import matplotlib.pyplot as plt

# https://xgboost.readthedocs.io/en/stable/
import numpy as np
import pandas as pd
import seaborn as sns
from xgboost import XGBRegressor

for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

trainAmes = pd.read_csv("/kaggle/input/ames-housing-dataset/AmesHousing.csv")
test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
train = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
)

trainAmes.columns = trainAmes.columns.str.replace(" ", "")
trainAmes = trainAmes.rename(columns={"YearRemod/Add": "YearRemodAdd"})
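
# The Ames dataset overlaps with the competition training data: concatenating the
# three frames and flagging duplicates lets the overlapping rows be dropped from
# trainAmes below.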
data = pd.concat([trainAmes, train, test], axis=0, sort=False)
print("Size of the Housing Dataset", len(data))
useless = ["Id", "PID", "Order", "SalePrice"]
data = data.drop(useless, axis=1)
duplicate = data[data.duplicated(keep="last")].index

len(duplicate)

duplicate = duplicate[0:390]
trainAmes = trainAmes.drop(duplicate, axis=0)

training = pd.concat([trainAmes, train], axis=0, sort=False)
useless = ["Id", "PID", "Order"]
training = training.drop(useless, axis=1)

# Separating Target and Features

target = training["SalePrice"]
test_id = test["Id"]
test = test.drop(["Id"], axis=1)
training2 = training.drop(["SalePrice"], axis=1)

# Concatenating train & test set

train_test = pd.concat([training2, test], axis=0, sort=False)

# Filling categorical NaN values (the ones the description file tells us how to fill)
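# Functional, Electrical, KitchenQual, Exterior1st/2nd and SaleType get typical or
# most-frequent values; for pool, alley, fence, fireplace, garage and basement
# features a missing value means the feature is absent, so those columns are
# filled with "None" (categorical) or 0 (numeric).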

train_test["Functional"] = train_test["Functional"].fillna("Typ")
train_test["Electrical"] = train_test["Electrical"].fillna("SBrkr")
train_test["KitchenQual"] = train_test["KitchenQual"].fillna("TA")
train_test["Exterior1st"] = train_test["Exterior1st"].fillna(
    train_test["Exterior1st"].mode()[0]
)
train_test["Exterior2nd"] = train_test["Exterior2nd"].fillna(
    train_test["Exterior2nd"].mode()[0]
)
train_test["SaleType"] = train_test["SaleType"].fillna(train_test["SaleType"].mode()[0])
train_test["PoolQC"] = train_test["PoolQC"].fillna("None")
train_test["Alley"] = train_test["Alley"].fillna("None")
train_test["FireplaceQu"] = train_test["FireplaceQu"].fillna("None")
train_test["Fence"] = train_test["Fence"].fillna("None")
train_test["MiscFeature"] = train_test["MiscFeature"].fillna("None")
for col in ("GarageArea", "GarageCars"):
    train_test[col] = train_test[col].fillna(0)

for col in ["GarageType", "GarageFinish", "GarageQual", "GarageCond"]:
    train_test[col] = train_test[col].fillna("None")

for col in ("BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"):
    train_test[col] = train_test[col].fillna("None")

for col in (
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtFullBath",
    "BsmtHalfBath",
    "MasVnrArea",
    "BsmtUnfSF",
    "TotalBsmtSF",
):
    train_test[col] = train_test[col].fillna(0)

train_test["LotFrontage"] = train_test["LotFrontage"].fillna(
    train["LotFrontage"].median()
)

# Checking which features still contain NaN

for col in train_test:
    if train_test[col].isna().sum() > 0:
        print(train_test[col][1])

# Converting non-numeric predictors stored as numbers into strings

train_test["MSSubClass"] = train_test["MSSubClass"].apply(str)
train_test["YrSold"] = train_test["YrSold"].apply(str)
train_test["MoSold"] = train_test["MoSold"].apply(str)
train_test["OverallQual"] = train_test["OverallQual"].apply(str)
train_test["OverallCond"] = train_test["OverallCond"].apply(str)
train_test["SqFtPerRoom"] = train_test["GrLivArea"] / (
    train_test["TotRmsAbvGrd"]
    + train_test["FullBath"]
    + train_test["HalfBath"]
    + train_test["KitchenAbvGr"]
)

train_test["Total_Home_Quality"] = train_test["OverallQual"] + train_test["OverallCond"]

train_test["Total_Bathrooms"] = (
    train_test["FullBath"]
    + (0.5 * train_test["HalfBath"])
    + train_test["BsmtFullBath"]
    + (0.5 * train_test["BsmtHalfBath"])
)
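
# Note: OverallQual and OverallCond were cast to str above, so Total_Home_Quality
# concatenates the two labels (e.g. "7" + "5" -> "75") and will be one-hot encoded
# by get_dummies rather than treated as a numeric score.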

train_test["HighQualSF"] = train_test["1stFlrSF"] + train_test["2ndFlrSF"]
train_test["renovated"] = train_test["YearRemodAdd"] + train_test["YearBuilt"]

# Removing the useless variables

useless = ["GarageYrBlt", "YearRemodAdd"]
train_test = train_test.drop(useless, axis=1)
# Creating dummy variables from categorical features

from scipy.stats import skew

train_test_dummy = pd.get_dummies(train_test)

numeric_features = train_test_dummy.dtypes[train_test_dummy.dtypes != object].index
skewed_features = (
    train_test_dummy[numeric_features]
    .apply(lambda x: skew(x))
    .sort_values(ascending=False)
)
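# Features with skewness above 0.5 are flagged as highly skewed and log-transformed below.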
high_skew = skewed_features[skewed_features > 0.5]
skew_index = high_skew.index

# Normalize skewed features using a log transformation

for i in skew_index:
    train_test_dummy[i] = np.log1p(train_test_dummy[i])

target_log = np.log1p(target)
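
# The target is log-transformed as well; predictions are mapped back to dollars
# with np.expm1 before writing the submission.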

# X_train is assumed to be the first 4000 rows of the combined frame: it is the
# complement of X_test below and matches the 4000-row length of target_log.
X_train = train_test_dummy[:4000]
X_test = train_test_dummy[4000:]
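
# A minimal sanity check (not in the original notebook): 5-fold cross-validation of
# the default XGBRegressor on the log target, assuming scikit-learn is available in
# the Kaggle environment. RMSE on log1p(SalePrice) roughly tracks the competition's
# evaluation metric.
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    XGBRegressor(), X_train, target_log, cv=5, scoring="neg_root_mean_squared_error"
)
print("CV RMSE (log scale):", -cv_scores.mean())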

xgb = XGBRegressor()
xgb.fit(X_train, target_log)

test_pred = xgb.predict(X_test)
submission = pd.DataFrame(test_id, columns=["Id"])
test_pred = np.expm1(test_pred)
submission["SalePrice"] = test_pred
submission.head()
submission.to_csv("xgb.csv", index=False, header=True)