Fixes: #6551 #6956

Closed · wants to merge 2 commits

Changes from 1 commit
55 changes: 55 additions & 0 deletions machine_learning/xgboostclassifier.py
@@ -0,0 +1,55 @@
from xgboost import XGBClassifier
# https://xgboost.readthedocs.io/en/stable/
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



training = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')

# Tag each row so train and test can be separated again after joint preprocessing.
training['train_test'] = 1
test['train_test'] = 0
test['Survived'] = np.nan
all_data = pd.concat([training, test])
all_data.columns

all_data.describe()

all_data['cabin_mul'] = all_data.Cabin.apply(lambda x: 0 if pd.isna(x) else len(x.split(' ')))

Review comment: Please provide a descriptive name for the parameter: x
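One way the author could address this (a sketch only; the parameter name `cabin` is an illustrative choice, not part of the submitted code):

# Hypothetical rename of the lambda parameter; behaviour is otherwise identical.
all_data['cabin_mul'] = all_data.Cabin.apply(
    lambda cabin: 0 if pd.isna(cabin) else len(cabin.split(' '))
)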

all_data['cabin_adv'] = all_data.Cabin.apply(lambda x: str(x)[0])

Review comment: Please provide a descriptive name for the parameter: x

all_data['name_title'] = all_data.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())

Review comment: Please provide a descriptive name for the parameter: x
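The same fix, sketched for the two lambdas above (the names `cabin` and `name` are illustrative only):

# Hypothetical renames; logic unchanged from the submitted lines.
all_data['cabin_adv'] = all_data.Cabin.apply(lambda cabin: str(cabin)[0])
all_data['name_title'] = all_data.Name.apply(
    lambda name: name.split(',')[1].split('.')[0].strip()
)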

all_data.Age = all_data.Age.fillna(training.Age.median())
all_data.Fare = all_data.Fare.fillna(training.Fare.median())
all_data.dropna(subset=['Embarked'], inplace=True)
all_data['norm_fare'] = np.log(all_data.Fare + 1)
all_data.Pclass = all_data.Pclass.astype(str)
all_data['Age'] = all_data['Age'].apply(np.int64)
all_dummies = pd.get_dummies(all_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'norm_fare',
                                       'Embarked', 'cabin_adv', 'cabin_mul',
                                       'name_title', 'train_test']])

from sklearn.preprocessing import StandardScaler

scale = StandardScaler()
all_dummies_scaled = all_dummies.copy()
all_dummies_scaled[['Age', 'SibSp', 'Parch', 'norm_fare']] = scale.fit_transform(
    all_dummies_scaled[['Age', 'SibSp', 'Parch', 'norm_fare']]
)
all_dummies_scaled.head()

X_train_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 1].drop(['train_test'], axis=1)

Review comment: Variable and function names should follow the snake_case naming convention. Please update the following name accordingly: X_train_scaled

X_test_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 0].drop(['train_test'], axis=1)

Review comment: Variable and function names should follow the snake_case naming convention. Please update the following name accordingly: X_test_scaled
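If the project enforces strict snake_case here (the capital X prefix is a common scikit-learn convention, so this is a judgment call), a hypothetical rename could look like the following, with the fit and predict calls further down updated to match:

# Hypothetical lowercase names to satisfy a strict snake_case check.
x_train_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 1].drop(['train_test'], axis=1)
x_test_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 0].drop(['train_test'], axis=1)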


y_train = all_data[all_data.train_test == 1].Survived

xgb = XGBClassifier()
xgb.fit(X_train_scaled, y_train)

y_hat_base_vc = xgb.predict(X_test_scaled).astype(int)
basic_submission = {'PassengerId': test.PassengerId, 'Survived': y_hat_base_vc}
base_submission = pd.DataFrame(data=basic_submission)
base_submission.to_csv('xgb_submission.csv', index=False)
138 changes: 138 additions & 0 deletions machine_learning/xgboostregressor.py
@@ -0,0 +1,138 @@
from xgboost import XGBRegressor
# https://xgboost.readthedocs.io/en/stable/
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt


import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

trainAmes = pd.read_csv('/kaggle/input/ames-housing-dataset/AmesHousing.csv')

Review comment: Variable and function names should follow the snake_case naming convention. Please update the following name accordingly: trainAmes
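A possible rename, sketched under the assumption that every later use of trainAmes is updated the same way:

# Hypothetical snake_case name for the Ames training frame.
train_ames = pd.read_csv('/kaggle/input/ames-housing-dataset/AmesHousing.csv')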

test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')

trainAmes.columns = trainAmes.columns.str.replace(' ', '')
trainAmes = trainAmes.rename(columns={"YearRemod/Add": "YearRemodAdd"})

Review comment: Variable and function names should follow the snake_case naming convention. Please update the following name accordingly: trainAmes


data = pd.concat([trainAmes, train, test], axis=0, sort=False)
print("Size of the Housing Dataset:", len(data))
useless = ['Id', 'PID', 'Order', 'SalePrice']
data = data.drop(useless, axis=1)
duplicate = data[data.duplicated(keep='last')].index
len(duplicate)

duplicate = duplicate[0:390]
trainAmes = trainAmes.drop(duplicate, axis=0)

Review comment: Variable and function names should follow the snake_case naming convention. Please update the following name accordingly: trainAmes


training = pd.concat([trainAmes, train], axis=0, sort=False)
useless = ['Id', 'PID', 'Order']
training = training.drop(useless, axis=1)

# Separating Target and Features

target = training['SalePrice']
test_id = test['Id']
test = test.drop(['Id'], axis=1)
training2 = training.drop(['SalePrice'], axis=1)


# Concatenating train & test set

train_test = pd.concat([training2, test], axis=0, sort=False)

# Filling Categorical NaN (That we know how to fill due to the description file )

train_test['Functional'] = train_test['Functional'].fillna('Typ')
train_test['Electrical'] = train_test['Electrical'].fillna("SBrkr")
train_test['KitchenQual'] = train_test['KitchenQual'].fillna("TA")
train_test['Exterior1st'] = train_test['Exterior1st'].fillna(train_test['Exterior1st'].mode()[0])
train_test['Exterior2nd'] = train_test['Exterior2nd'].fillna(train_test['Exterior2nd'].mode()[0])
train_test['SaleType'] = train_test['SaleType'].fillna(train_test['SaleType'].mode()[0])
train_test["PoolQC"] = train_test["PoolQC"].fillna("None")
train_test["Alley"] = train_test["Alley"].fillna("None")
train_test['FireplaceQu'] = train_test['FireplaceQu'].fillna("None")
train_test['Fence'] = train_test['Fence'].fillna("None")
train_test['MiscFeature'] = train_test['MiscFeature'].fillna("None")
for col in ('GarageArea', 'GarageCars'):
    train_test[col] = train_test[col].fillna(0)

for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    train_test[col] = train_test[col].fillna('None')

for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    train_test[col] = train_test[col].fillna('None')

for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea', 'BsmtUnfSF', 'TotalBsmtSF'):
    train_test[col] = train_test[col].fillna(0)

train_test['LotFrontage'] = train_test['LotFrontage'].fillna(train['LotFrontage'].median())

# Checking which features still contain NaN

for col in train_test:
    if train_test[col].isna().sum() > 0:
        print(col, train_test[col].isna().sum())

# Converting non-numeric predictors stored as numbers into string

train_test['MSSubClass'] = train_test['MSSubClass'].apply(str)
train_test['YrSold'] = train_test['YrSold'].apply(str)
train_test['MoSold'] = train_test['MoSold'].apply(str)
train_test['OverallQual'] = train_test['OverallQual'].apply(str)
train_test['OverallCond'] = train_test['OverallCond'].apply(str)
train_test["SqFtPerRoom"] = train_test["GrLivArea"] / (train_test["TotRmsAbvGrd"] +
train_test["FullBath"] +
train_test["HalfBath"] +
train_test["KitchenAbvGr"])

train_test['Total_Home_Quality'] = train_test['OverallQual'] + train_test['OverallCond']

train_test['Total_Bathrooms'] = (train_test['FullBath'] + (0.5 * train_test['HalfBath']) +
train_test['BsmtFullBath'] + (0.5 * train_test['BsmtHalfBath']))

train_test["HighQualSF"] = train_test["1stFlrSF"] + train_test["2ndFlrSF"]
train_test['renovated']=train_test['YearRemodAdd']+train_test['YearBuilt']

# Removing the useless variables

useless = ['GarageYrBlt', 'YearRemodAdd']
train_test = train_test.drop(useless, axis=1)
# Creating dummy variables from categorical features

from scipy.stats import skew

train_test_dummy = pd.get_dummies(train_test)

numeric_features = train_test_dummy.dtypes[train_test_dummy.dtypes != object].index
skewed_features = train_test_dummy[numeric_features].apply(lambda x: skew(x)).sort_values(ascending=False)

Review comment: Please provide a descriptive name for the parameter: x
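Two sketches of how this could be addressed, assuming the same result as the submitted line: either give the parameter a descriptive name, or pass skew directly and drop the lambda.

# Option 1 (hypothetical): descriptive parameter name.
skewed_features = train_test_dummy[numeric_features].apply(lambda column: skew(column)).sort_values(ascending=False)
# Option 2 (hypothetical): no lambda at all.
skewed_features = train_test_dummy[numeric_features].apply(skew).sort_values(ascending=False)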

high_skew = skewed_features[skewed_features > 0.5]
skew_index = high_skew.index

# Normalize skewed features using a log transformation

for i in skew_index:
    train_test_dummy[i] = np.log1p(train_test_dummy[i])

target_log = np.log1p(target)

# Train-Test separation: the first 4000 rows of train_test_dummy are the combined
# training data (training2); the remaining rows correspond to the test set.

X_train = train_test_dummy[0:4000]

Review comment: Variable and function names should follow the snake_case naming convention. Please update the following name accordingly: X_train

X_test = train_test_dummy[4000:]

Review comment: Variable and function names should follow the snake_case naming convention. Please update the following name accordingly: X_test
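As in the classifier file, if a strict snake_case rule applies here (the capital X prefix is a widespread ML convention), a hypothetical rename would be:

# Hypothetical lowercase names; the fit/predict calls below would then use these.
x_train = train_test_dummy[0:4000]
x_test = train_test_dummy[4000:]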


xgb = XGBRegressor()
xgb.fit(X_train, target_log)

test_pred = xgb.predict(X_test)
submission = pd.DataFrame(test_id, columns=['Id'])
# expm1 inverts the log1p transform that was applied to the target.
test_pred = np.expm1(test_pred)
submission['SalePrice'] = test_pred
submission.head()
submission.to_csv("xgb.csv", index=False, header=True)