-
-
Notifications
You must be signed in to change notification settings - Fork 46.9k
Fixes : #6551 #6956
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fixes : #6551 #6956
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import matplotlib.pyplot as plt | ||
|
||
# https://xgboost.readthedocs.io/en/stable/ | ||
import numpy as np | ||
import pandas as pd | ||
import seaborn as sns | ||
from xgboost import XGBClassifier | ||
|
||
# Load the Kaggle Titanic train and test splits.
training = pd.read_csv("../input/titanic/train.csv")
test = pd.read_csv("../input/titanic/test.csv")

# Tag every row with its origin (1 = train, 0 = test) so the splits can be
# separated again after the shared feature engineering.
training["train_test"] = 1
test["train_test"] = 0
# The test split carries no labels. `np.nan` is used because the `np.NaN`
# alias was removed in NumPy 2.0.
test["Survived"] = np.nan
all_data = pd.concat([training, test])
|
||
# Cabin-derived features: the number of space-separated cabin entries (0 when
# the cabin is missing) and the first character of the cabin value. A missing
# cabin stringifies to "nan", so it contributes the letter "n".
all_data["cabin_mul"] = all_data.Cabin.apply(
    lambda cabin: 0 if pd.isna(cabin) else len(cabin.split(" "))
)
all_data["cabin_adv"] = all_data.Cabin.apply(lambda cabin: str(cabin)[0])

# Title: the part of Name after the first comma, cut at the next period.
all_data["name_title"] = all_data.Name.apply(
    lambda full_name: full_name.split(",")[1].split(".")[0].strip()
)

# Impute Age and Fare with medians taken from the training split only, and
# discard rows that have no port of embarkation.
all_data["Age"] = all_data["Age"].fillna(training["Age"].median())
all_data["Fare"] = all_data["Fare"].fillna(training["Fare"].median())
all_data = all_data.dropna(subset=["Embarked"])

all_data["norm_fare"] = np.log(all_data["Fare"] + 1)
# Pclass is cast to string so that get_dummies one-hot encodes it.
all_data["Pclass"] = all_data["Pclass"].astype(str)
all_data["Age"] = all_data["Age"].astype(np.int64)

feature_columns = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "norm_fare",
    "Embarked",
    "cabin_adv",
    "cabin_mul",
    "name_title",
    "train_test",
]
all_dummies = pd.get_dummies(all_data[feature_columns])
|
||
from sklearn.preprocessing import StandardScaler

# Standardize the continuous columns on a copy of the dummy-encoded frame.
# The scaler is fitted on every row of all_dummies, train and test alike.
continuous_columns = ["Age", "SibSp", "Parch", "norm_fare"]
scale = StandardScaler()
all_dummies_scaled = all_dummies.copy()
all_dummies_scaled[continuous_columns] = scale.fit_transform(
    all_dummies_scaled[continuous_columns]
)
# Notebook-style preview; the result is discarded when run as a script.
all_dummies_scaled.head()
|
||
# Split the engineered frame back into train and test rows using the marker
# column, then drop the marker so it is not used as a feature.
x_train_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 1].drop(
    ["train_test"], axis=1
)
x_test_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 0].drop(
    ["train_test"], axis=1
)

# Survived became a float column once the unlabeled test rows were given NaN;
# cast it back so the classifier receives integer class labels.
# NOTE(review): assumes every training row has a label - confirm.
y_train = all_data[all_data.train_test == 1].Survived.astype(int)

# XGBClassifier is already imported at the top of the script.
xgb = XGBClassifier()
xgb.fit(x_train_scaled, y_train)

# Write the predictions in the Kaggle submission layout.
predictions = xgb.predict(x_test_scaled).astype(int)
base_submission = pd.DataFrame(
    {"PassengerId": test.PassengerId, "Survived": predictions}
)
base_submission.to_csv("xgb_submission.csv", index=False)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
import os | ||
|
||
import matplotlib.pyplot as plt | ||
|
||
# https://xgboost.readthedocs.io/en/stable/ | ||
import numpy as np | ||
import pandas as pd | ||
import seaborn as sns | ||
from xgboost import XGBRegressor | ||
|
||
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk("/kaggle/input"):
    for name in names:
        print(os.path.join(root, name))
|
||
# Load the Ames housing data and the Kaggle competition train/test files.
train_ames = pd.read_csv("/kaggle/input/ames-housing-dataset/AmesHousing.csv")
test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
train = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
)

# Align the Ames column names with the competition files: strip spaces and
# rename the one column whose name still differs.
train_ames.columns = train_ames.columns.str.replace(" ", "")
train_ames = train_ames.rename(columns={"YearRemod/Add": "YearRemodAdd"})

# Stack all three sources and flag rows that reappear later in the stack,
# ignoring identifier columns and the target.
data = pd.concat([train_ames, train, test], axis=0, sort=False)
print("Size of the Housing Dataset", len(data))
non_feature_columns = ["Id", "PID", "Order", "SalePrice"]
data = data.drop(columns=non_feature_columns)
duplicate = data.index[data.duplicated(keep="last")]
# Notebook-style inspection; the value is discarded when run as a script.
len(duplicate)

# NOTE(review): only the first 390 flagged index labels are removed from the
# Ames frame. The constant is hard-coded - confirm it still matches the data.
duplicate = duplicate[0:390]
train_ames = train_ames.drop(index=duplicate)

# Training set = de-duplicated Ames rows + competition train rows, without
# the identifier columns.
training = pd.concat([train_ames, train], axis=0, sort=False)
training = training.drop(columns=["Id", "PID", "Order"])
|
||
# Separate the target from the features and keep the test ids for the
# submission file.
target = training["SalePrice"]
test_id = test["Id"]
test = test.drop(columns=["Id"])
training_features = training.drop(columns=["SalePrice"])

# Stack train and test features so the same preprocessing applies to both.
train_test = pd.concat([training_features, test], axis=0, sort=False)
|
||
# Filling categorical NaN (values we know how to fill from the description file)
fixed_defaults = {"Functional": "Typ", "Electrical": "SBrkr", "KitchenQual": "TA"}
for column, default in fixed_defaults.items():
    train_test[column] = train_test[column].fillna(default)

# These columns take their most frequent value.
for column in ("Exterior1st", "Exterior2nd", "SaleType"):
    train_test[column] = train_test[column].fillna(train_test[column].mode()[0])

# These categorical columns are filled with the literal string "None".
none_filled_columns = (
    "PoolQC",
    "Alley",
    "FireplaceQu",
    "Fence",
    "MiscFeature",
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
)
for column in none_filled_columns:
    train_test[column] = train_test[column].fillna("None")

# These numeric columns are filled with zero.
zero_filled_columns = (
    "GarageArea",
    "GarageCars",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtFullBath",
    "BsmtHalfBath",
    "MasVnrArea",
    "BsmtUnfSF",
    "TotalBsmtSF",
)
for column in zero_filled_columns:
    train_test[column] = train_test[column].fillna(0)

# LotFrontage uses the median of the competition training file.
lot_frontage_median = train["LotFrontage"].median()
train_test["LotFrontage"] = train_test["LotFrontage"].fillna(lot_frontage_median)
|
||
# Report the features that still contain NaN at this point, with their
# counts. The previous version printed the rows labelled 1 of each such
# column, which did not identify the column.
for col in train_test:
    missing_count = train_test[col].isna().sum()
    if missing_count > 0:
        print(col, missing_count)
|
||
# Combined quality score. It has to be computed while OverallQual and
# OverallCond are still numeric: once they are converted to strings below,
# `+` would concatenate them ("7" + "5" -> "75") instead of adding.
total_home_quality = train_test["OverallQual"] + train_test["OverallCond"]

# Converting non-numeric predictors stored as numbers into string, so that
# get_dummies treats them as categories.
for column in ("MSSubClass", "YrSold", "MoSold", "OverallQual", "OverallCond"):
    train_test[column] = train_test[column].apply(str)

# Living area per room, counting bathrooms and kitchens as rooms.
room_count = (
    train_test["TotRmsAbvGrd"]
    + train_test["FullBath"]
    + train_test["HalfBath"]
    + train_test["KitchenAbvGr"]
)
train_test["SqFtPerRoom"] = train_test["GrLivArea"] / room_count

train_test["Total_Home_Quality"] = total_home_quality

# Half baths count as 0.5 of a bathroom.
train_test["Total_Bathrooms"] = (
    train_test["FullBath"]
    + (0.5 * train_test["HalfBath"])
    + train_test["BsmtFullBath"]
    + (0.5 * train_test["BsmtHalfBath"])
)

train_test["HighQualSF"] = train_test["1stFlrSF"] + train_test["2ndFlrSF"]
# NOTE(review): this is the sum of two calendar years; confirm that a sum
# (rather than a difference or a flag) is what "renovated" should hold.
train_test["renovated"] = train_test["YearRemodAdd"] + train_test["YearBuilt"]

# Removing the useless variables
useless = ["GarageYrBlt", "YearRemodAdd"]
train_test = train_test.drop(useless, axis=1)
# Creating dummy variables from categorical features
from scipy.stats import skew

train_test_dummy = pd.get_dummies(train_test)

# Measure the skewness of every non-object column, most skewed first.
is_non_object = train_test_dummy.dtypes != object
numeric_features = train_test_dummy.columns[is_non_object.to_numpy()]
skewed_features = (
    train_test_dummy[numeric_features].apply(skew).sort_values(ascending=False)
)
high_skew = skewed_features[skewed_features > 0.5]
skew_index = high_skew.index

# Normalize skewed features using a log(1 + x) transformation.
for column in skew_index:
    train_test_dummy[column] = np.log1p(train_test_dummy[column])

# The target gets the same transformation.
target_log = np.log1p(target)
|
||
# Train-Test separation. The training rows come first in train_test_dummy,
# and there is one target value per training row, so the split point is
# derived from the target length instead of a hard-coded row count.
# XGBRegressor is already imported at the top of the script.
n_train = len(target)
x_train = train_test_dummy[:n_train]
x_test = train_test_dummy[n_train:]

xgb = XGBRegressor()
xgb.fit(x_train, target_log)

# The model was fitted on log1p(SalePrice); expm1 maps predictions back to
# prices before they are written in the Kaggle submission layout.
test_pred = np.expm1(xgb.predict(x_test))
submission = pd.DataFrame({"Id": test_id, "SalePrice": test_pred})
submission.to_csv("xgb.csv", index=False, header=True)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please provide a descriptive name for the parameter:
x