Jupyter Notebookなどで、コードを実装して実際に確かめてみましょう。
事前にxgboostをインストールしておきます。
$ sudo pip3 install xgboost
コードで使われているデータ(ビデオではKaggleへの直リンクが貼られていますが、以下からもダウンロードできます)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the Kaggle train/test CSVs from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Stack the feature columns (Pclass..Embarked) of both files so that the
# imputation and label encoding below are applied identically to each.
all_df = pd.concat((train_df.loc[:, 'Pclass':'Embarked'],
                    test_df.loc[:, 'Pclass':'Embarked']))
all_df.info()

# Fill in missing values: mean for the numeric columns, most frequent
# value for the categorical one.
all_df['Age'] = all_df['Age'].fillna(all_df['Age'].mean())
all_df['Fare'] = all_df['Fare'].fillna(all_df['Fare'].mean())
all_df['Embarked'] = all_df['Embarked'].fillna(all_df['Embarked'].mode()[0])
all_df.info()

# Apply LabelEncoder: convert the string-valued columns to integer codes.
cat_features = ['Sex', 'Embarked']
for col in cat_features:
    lbl = LabelEncoder()
    all_df[col] = lbl.fit_transform(all_df[col])
all_df.head()  # displayed only when this is the last expression of a notebook cell

# Drop the free-text columns that are not used as features.
all_df = all_df.drop(columns=['Name', 'Ticket', 'Cabin'])

# Split the stacked frame back into its train and test parts by row count.
train = all_df[:train_df.shape[0]]
test = all_df[train_df.shape[0]:]
y = train_df['Survived']
ID = test_df['PassengerId']

# Hold out part of the labelled data as a validation set for early stopping.
x_train, x_test, y_train, y_test = train_test_split(train, y, random_state=0)

# -----------------------------------------------
# Build a Kaggle submission file.
import xgboost as xgb

params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "eta": 0.1,
    "max_depth": 6,
    "subsample": 1,
    "colsample_bytree": 1,
    # "silent" was removed from XGBoost; "verbosity": 0 is the equivalent
    # of the old "silent": 1.
    "verbosity": 0,
}

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# Train for up to 100 rounds, stopping once the validation AUC has not
# improved for 10 consecutive rounds.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    early_stopping_rounds=10,
    evals=[(dtest, 'test')],
)

# Predict with the trees up to and including the best iteration.
# `ntree_limit` / `best_ntree_limit` no longer exist in XGBoost 2.x;
# `iteration_range` is the replacement (end index is exclusive).
prediction = model.predict(
    xgb.DMatrix(test),
    iteration_range=(0, model.best_iteration + 1),
)
prediction

# Turn the predicted probabilities into 0/1 labels at a 0.5 threshold.
prediction = np.where(prediction < 0.5, 0, 1)
prediction

# Write the CSV file.
submission = pd.DataFrame({
    'PassengerId': ID,
    'Survived': prediction,
})
submission.to_csv('submission.csv', index=False)
Leave a Reply