Let's implement the code in a Jupyter Notebook (or a similar environment) and verify it for ourselves.
The data used in the code (the video links directly to Kaggle, but it can also be downloaded from the link below):
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons

# Generate the two-moons toy dataset
moons = make_moons(n_samples=200, noise=0.2, random_state=0)
x = moons[0]
y = moons[1]

from matplotlib.colors import ListedColormap

# Plot the decision boundary of a fitted model on a 100x100 grid
def plot_decision_boundary(model, x, y, margin=0.3):
    _x1 = np.linspace(x[:, 0].min() - margin, x[:, 0].max() + margin, 100)
    _x2 = np.linspace(x[:, 1].min() - margin, x[:, 1].max() + margin, 100)
    x1, x2 = np.meshgrid(_x1, _x2)
    x_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = model.predict(x_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['mediumblue', 'orangered'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)

# Scatter plot of the two classes
def plot_dataset(x, y):
    plt.plot(x[:, 0][y == 0], x[:, 1][y == 0], 'bo', ms=15)
    plt.plot(x[:, 0][y == 1], x[:, 1][y == 1], 'r^', ms=15)
    plt.xlabel("$x_0$", fontsize=30)
    plt.ylabel("$x_1$", fontsize=30, rotation=0)

plt.figure(figsize=(12, 8))
plot_dataset(x, y)
plt.show()

----------------------------------------------

# Run the classification
# Decision tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
tree_clf = DecisionTreeClassifier().fit(x_train, y_train)

plt.figure(figsize=(12, 8))
plot_decision_boundary(tree_clf, x, y)
plot_dataset(x, y)
plt.show()

# Random forest
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(x_train, y_train)

plt.figure(figsize=(12, 8))
plot_decision_boundary(random_forest, x, y)
plot_dataset(x, y)
plt.show()

# Try a different dataset: iris
from sklearn.datasets import load_iris
iris = load_iris()
x_iris = iris.data
y_iris = iris.target
random_forest_iris = RandomForestClassifier(random_state=0).fit(x_iris, y_iris)

# Feature importances
random_forest_iris.feature_importances_

plt.figure(figsize=(12, 8))
plt.barh(range(iris.data.shape[1]), random_forest_iris.feature_importances_, height=0.5)
plt.yticks(range(iris.data.shape[1]), iris.feature_names, fontsize=20)
plt.xlabel('Feature Importance', fontsize=20)
plt.show()

# Try another dataset: Titanic
import pandas as pd
df = pd.read_csv('train.csv')
df.head()
df.info()

# The goal is to predict Survived
# Fill in missing values
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df.info()

# Convert string columns to numbers
from sklearn.preprocessing import LabelEncoder
cat_features = ['Sex', 'Embarked']
for col in cat_features:
    lbl = LabelEncoder()
    df[col] = lbl.fit_transform(list(df[col].values))
df.head()

# Drop the columns we will not use
x = df.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'])
# The target we want to predict
y = df['Survived']
x.head()

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
x_train.head()

tree = DecisionTreeClassifier().fit(x_train, y_train)
tree.score(x_test, y_test)

rnd_forest = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=0).fit(x_train, y_train)
rnd_forest.score(x_test, y_test)

# Create a Kaggle submission file
test_df = pd.read_csv('test.csv')
test_df.info()

# Fill in missing values
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())
test_df.info()

# Encode the same categorical columns (a separate encoder is fit on the test set;
# this works here because both sets contain the same categories)
cat_features = ['Sex', 'Embarked']
for col in cat_features:
    lbl = LabelEncoder()
    test_df[col] = lbl.fit_transform(list(test_df[col].values))
test_df.head()

x_pred = test_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
ID = test_df['PassengerId']
prediction = rnd_forest.predict(x_pred)
prediction

submission = pd.DataFrame({
    'PassengerId': ID,
    'Survived': prediction
})
submission.head()
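The listing stops at submission.head(), so the submission DataFrame is only displayed, not saved. To actually upload it to Kaggle you still need to write it to disk; a minimal sketch, assuming the file name submission.csv (any name works as long as the columns are PassengerId and Survived):

# Write the submission file without the DataFrame index column
submission.to_csv('submission.csv', index=False)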