Jupyter Notebookなどで、コードを実装して実際に確かめてみましょう。
コードで使われているデータ(ビデオではkaggleへの直リンクが貼られていますが、以下からもDLできます)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
"""Titanic survival prediction with an SVM (scikit-learn).

Tutorial script: missing-value imputation, label encoding, feature
scaling, one-hot (dummy) variables, manual hyper-parameter search,
a train/validation split, GridSearchCV, and cross-validation.

Expects the Kaggle Titanic ``train.csv`` in the working directory.
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC

df = pd.read_csv('train.csv')
print(df.head())
df.info()

# Impute missing values: Age with the column mean, Embarked with the mode.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df.info()

# Feature matrix: drop identifiers, the target, and high-cardinality text columns.
x = df.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'])
y = df['Survived']
print(x.head())

# Label-encode the categorical columns (Sex, Embarked) to integers.
for col in ['Sex', 'Embarked']:
    x[col] = LabelEncoder().fit_transform(df[col].values)
print(x.head())

# Baseline SVM on the raw (unscaled) features.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
svm = SVC().fit(x_train, y_train)
print('baseline accuracy:', svm.score(x_test, y_test))

# Inspect the data: distribution of the discrete columns.
print(x['Pclass'].value_counts())
print(x['SibSp'].value_counts())

# Scaling: StandardScaler on the continuous columns Age and Fare.
# (Age was already imputed above, so no NaNs reach the scaler.)
for col in ['Age', 'Fare']:
    x[col] = StandardScaler().fit_transform(np.array(df[col].values).reshape(-1, 1))
print(x.head())

# Dummy (one-hot) variables for the low-cardinality discrete columns.
x = pd.get_dummies(x, columns=['Pclass', 'SibSp', 'Embarked'])
print(x.head())

svm_accuracy = SVC().fit(*train_test_split(x, y, random_state=0)[:1])  # placeholder removed below
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
svm = SVC().fit(x_train, y_train)
print('scaled accuracy:', svm.score(x_test, y_test))

# Parameter tuning: search the penalty term C (default is 1).
# NOTE: selecting C by test-set score leaks test information — shown
# here only as the tutorial's "wrong first attempt".
best_score = 0
best_parameter = 1  # guard: keep a defined value even if no candidate improves
for C in [1, 10, 100, 1000, 10000]:
    score = SVC(C=C).fit(x_train, y_train).score(x_test, y_test)
    if score > best_score:
        best_score, best_parameter = score, C
print(best_parameter)
print(best_score)

# Proper approach: carve a validation set out of the training data,
# tune on it, then retrain on the full training set.
x_train_, x_validation, y_train_, y_validation = train_test_split(
    x_train, y_train, random_state=0)
best_score = 0
best_parameter = 1
for C in [1, 10, 100, 1000, 10000]:
    score = SVC(C=C).fit(x_train_, y_train_).score(x_validation, y_validation)
    if score > best_score:
        best_score, best_parameter = score, C
# Retrain with the chosen C on the whole training split.
svm_best = SVC(C=best_parameter).fit(x_train, y_train)
print(svm_best.score(x_test, y_test))

# scikit-learn's grid search with 5-fold cross-validation.
param = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(SVC(), param_grid=param, cv=5)
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)
print(grid_search.score(x_test, y_test))

# Nested cross-validation: an outer CV around the grid search itself.
scores = cross_val_score(GridSearchCV(SVC(), param_grid=param, cv=5), x, y, cv=5)
print(scores)
print(scores.mean())
Leave a Reply