Youtubeでビデオ・レッスン（12/16）アンサンブル学習（後編）| AdaBoost、勾配ブースティング

Jupyter Notebookなどで、コードを実装して実際に確かめてみましょう。

事前にxgboostをインストールしておきます。

$ sudo pip3 install xgboost

コードで使われているデータ（ビデオではKaggleへの直リンクが張られていますが、以下からもダウンロードできます）

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the training and test CSV files from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Stack the 'Pclass'..'Embarked' columns of both sets so the same
# preprocessing is applied to the training and the test rows.
train_features = train_df.loc[:, 'Pclass':'Embarked']
test_features = test_df.loc[:, 'Pclass':'Embarked']
all_df = pd.concat((train_features, test_features))

all_df.info()

# Fill missing values: column mean for Age and Fare, most frequent value
# for Embarked.
age_mean = all_df['Age'].mean()
all_df['Age'] = all_df['Age'].fillna(age_mean)
fare_mean = all_df['Fare'].mean()
all_df['Fare'] = all_df['Fare'].fillna(fare_mean)
embarked_mode = all_df['Embarked'].mode()[0]
all_df['Embarked'] = all_df['Embarked'].fillna(embarked_mode)

all_df.info()

# Apply LabelEncoder: convert the string columns to integer codes.
cat_features = ['Sex', 'Embarked']

for col in cat_features:
    # A fresh encoder per column, fitted on the combined train+test values.
    lbl = LabelEncoder()
    raw_values = list(all_df[col].values)
    all_df[col] = lbl.fit_transform(raw_values)

all_df.head()

# Drop the text columns that are not used as model features.
all_df = all_df.drop(columns=['Name', 'Ticket', 'Cabin'])

# Split the combined frame back by position: the first len(train_df) rows
# came from train_df, the remainder from test_df.
n_train = len(train_df)
train = all_df.iloc[:n_train]
test = all_df.iloc[n_train:]

# Target labels for training and passenger IDs for the submission file.
y = train_df['Survived']
ID = test_df['PassengerId']

# Hold out part of the labelled data for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(train, y, random_state=0)

-----------------------------------------------
# Train an XGBoost model and create a Kaggle submission file.
import xgboost as xgb

# Booster parameters: binary classification scored by AUC.
params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "eta": 0.1,
    "max_depth": 6,
    "subsample": 1,
    "colsample_bytree": 1,
    # "verbosity" replaces the old "silent" flag, which current XGBoost
    # releases no longer recognise; 0 suppresses library messages.
    "verbosity": 0,
}

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# Stop early when the AUC on the held-out split has not improved for
# 10 consecutive rounds.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    early_stopping_rounds=10,
    evals=[(dtest, 'test')],
)

# Predict using only the trees up to the best early-stopping round.
# iteration_range is half-open, hence best_iteration + 1. This replaces
# ntree_limit / best_ntree_limit, which newer XGBoost releases removed.
prediction = model.predict(
    xgb.DMatrix(test),
    iteration_range=(0, model.best_iteration + 1),
)

prediction

# Turn predicted probabilities into 0/1 labels with a 0.5 threshold.
prediction = np.where(prediction < 0.5, 0, 1)

prediction

# Create the CSV file.
submission = pd.DataFrame({
    'PassengerId': ID,
    'Survived': prediction,
})

submission.to_csv('submission.csv', index=False)

import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

# Load the training and test CSV files from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Combine the 'Pclass'..'Embarked' columns of both sets so they are
# preprocessed together.
all_df = pd.concat((train_df.loc[:, 'Pclass':'Embarked'],
                    test_df.loc[:, 'Pclass':'Embarked']))

all_df.info()

# Fill missing values: mean for Age and Fare, most frequent value for Embarked.
all_df['Age'] = all_df['Age'].fillna(all_df['Age'].mean())
all_df['Fare'] = all_df['Fare'].fillna(all_df['Fare'].mean())
all_df['Embarked'] = all_df['Embarked'].fillna(all_df['Embarked'].mode()[0])

all_df.info()

# Apply LabelEncoder: convert the string columns to integer codes.
cat_features = ['Sex', 'Embarked']

# The loop body must be indented; without it Python raises an
# IndentationError and the encoder is never applied per column.
for col in cat_features:
    lbl = LabelEncoder()
    all_df[col] = lbl.fit_transform(list(all_df[col].values))

all_df.head()

# Drop the text columns that are not used as model features.
all_df = all_df.drop(columns=['Name', 'Ticket', 'Cabin'])

# Split the combined frame back by position: the first train_df.shape[0]
# rows came from train_df, the rest from test_df.
train = all_df[:train_df.shape[0]]
test = all_df[train_df.shape[0]:]

# Target labels for training and passenger IDs for the submission file.
y = train_df['Survived']
ID = test_df['PassengerId']

# Hold out part of the labelled data for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(train, y, random_state=0)

-----------------------------------------------

# Train an XGBoost model and create a Kaggle submission file.
import xgboost as xgb

# Booster parameters: binary classification scored by AUC.
params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "eta": 0.1,
    "max_depth": 6,
    "subsample": 1,
    "colsample_bytree": 1,
    # "verbosity" replaces the old "silent" flag, which current XGBoost
    # releases no longer recognise; 0 suppresses library messages.
    "verbosity": 0,
}

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# Stop early when the AUC on the held-out split has not improved for
# 10 consecutive rounds.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    early_stopping_rounds=10,
    evals=[(dtest, 'test')],
)

# Predict using only the trees up to the best early-stopping round.
# iteration_range is half-open, hence best_iteration + 1. This replaces
# ntree_limit / best_ntree_limit, which newer XGBoost releases removed.
prediction = model.predict(
    xgb.DMatrix(test),
    iteration_range=(0, model.best_iteration + 1),
)

prediction

# Turn predicted probabilities into 0/1 labels with a 0.5 threshold.
prediction = np.where(prediction < 0.5, 0, 1)

prediction

# Create the CSV file.
submission = pd.DataFrame({
    'PassengerId': ID,
    'Survived': prediction,
})

submission.to_csv('submission.csv', index=False)

FRONT

地図と画像のサイト

Youtubeでビデオ・レッスン（12/16）アンサンブル学習（後編）| AdaBoost、勾配ブースティング

Be the first to comment

Leave a Reply コメントをキャンセル