Let's implement the code in a Jupyter Notebook (or a similar environment) and verify it for ourselves.
The data used in the code (the video links directly to Kaggle, but it can also be downloaded from the link below):
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons

# Generate the two-moons toy dataset
moons = make_moons(n_samples=200, noise=0.2, random_state=0)
x = moons[0]
y = moons[1]

from matplotlib.colors import ListedColormap

# Plot the decision boundary of a fitted model on a 100x100 grid
def plot_decision_boundary(model, x, y, margin=0.3):
    _x1 = np.linspace(x[:, 0].min() - margin, x[:, 0].max() + margin, 100)
    _x2 = np.linspace(x[:, 1].min() - margin, x[:, 1].max() + margin, 100)
    x1, x2 = np.meshgrid(_x1, _x2)
    x_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = model.predict(x_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['mediumblue', 'orangered'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)

# Scatter plot of the two classes
def plot_dataset(x, y):
    plt.plot(x[:, 0][y == 0], x[:, 1][y == 0], 'bo', ms=15)
    plt.plot(x[:, 0][y == 1], x[:, 1][y == 1], 'r^', ms=15)
    plt.xlabel("$x_0$", fontsize=30)
    plt.ylabel("$x_1$", fontsize=30, rotation=0)

plt.figure(figsize=(12, 8))
plot_dataset(x, y)
plt.show()

----------------------------------------------

# Run the classification
# Decision tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
tree_clf = DecisionTreeClassifier().fit(x_train, y_train)

plt.figure(figsize=(12, 8))
plot_decision_boundary(tree_clf, x, y)
plot_dataset(x, y)
plt.show()

# Random forest
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(x_train, y_train)

plt.figure(figsize=(12, 8))
plot_decision_boundary(random_forest, x, y)
plot_dataset(x, y)
plt.show()

# Try a different dataset: iris
from sklearn.datasets import load_iris
iris = load_iris()
x_iris = iris.data
y_iris = iris.target
random_forest_iris = RandomForestClassifier(random_state=0).fit(x_iris, y_iris)

# Feature importances
random_forest_iris.feature_importances_

plt.figure(figsize=(12, 8))
plt.barh(range(iris.data.shape[1]), random_forest_iris.feature_importances_, height=0.5)
plt.yticks(range(iris.data.shape[1]), iris.feature_names, fontsize=20)
plt.xlabel('Feature Importance', fontsize=20)
plt.show()

# Try another dataset: Titanic
import pandas as pd
df = pd.read_csv('train.csv')
df.head()
df.info()

# The goal is to predict Survived
# Fill in missing values
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df.info()

# Convert string columns to numbers
from sklearn.preprocessing import LabelEncoder
cat_features = ['Sex', 'Embarked']
for col in cat_features:
    lbl = LabelEncoder()
    df[col] = lbl.fit_transform(list(df[col].values))
df.head()

# Drop the columns we will not use
x = df.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'])
# The target we want to predict
y = df['Survived']
x.head()

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
x_train.head()

tree = DecisionTreeClassifier().fit(x_train, y_train)
tree.score(x_test, y_test)

rnd_forest = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=0).fit(x_train, y_train)
rnd_forest.score(x_test, y_test)

# Create a Kaggle submission file
test_df = pd.read_csv('test.csv')
test_df.info()

# Fill in missing values
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())
test_df.info()

# Encode the same categorical columns (a separate encoder is fit on the test set;
# this works here because both sets contain the same categories)
cat_features = ['Sex', 'Embarked']
for col in cat_features:
    lbl = LabelEncoder()
    test_df[col] = lbl.fit_transform(list(test_df[col].values))
test_df.head()

x_pred = test_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
ID = test_df['PassengerId']
prediction = rnd_forest.predict(x_pred)
prediction

submission = pd.DataFrame({
    'PassengerId': ID,
    'Survived': prediction
})
submission.head()
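The listing stops at submission.head(), so the submission DataFrame is only displayed, not saved. To actually upload it to Kaggle you still need to write it to disk; a minimal sketch, assuming the file name submission.csv (any name works as long as the columns are PassengerId and Survived):

# Write the submission file without the DataFrame index column
submission.to_csv('submission.csv', index=False)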