Youtubeでビデオ・レッスン（15/16）次元削減｜教師なし学習、主成分分析

Jupyter Notebookなどで、コードを実装して実際に確かめてみましょう。

コードで使われているデータ（ビデオではKaggleへの直リンクが張られていますが、以下からもダウンロードできます）

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Read train.csv; head()/info() are inspection calls for the notebook.
df = pd.read_csv('train.csv')
df.head()
df.info()

# Fill missing values: the column mean for Age, the most frequent value for Embarked.
mean_age = df['Age'].mean()
most_common_embarked = df['Embarked'].mode()[0]
df['Age'] = df['Age'].fillna(mean_age)
df['Embarked'] = df['Embarked'].fillna(most_common_embarked)
df.info()

# Feature table: drop the target column and the columns that are not used.
unused_columns = ['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin']
x = df.drop(columns=unused_columns)
# Target column to be predicted.
y = df['Survived']

x.head()

# Convert the string-valued columns to integer codes.
from sklearn.preprocessing import LabelEncoder

cat_features = ['Sex', 'Embarked']

for col in cat_features:
    # A fresh encoder per column, fitted on the values from df.
    lbl = LabelEncoder()
    raw_labels = df[col].tolist()
    x[col] = lbl.fit_transform(raw_labels)

x.head()

# Standardize the numeric columns.
from sklearn.preprocessing import StandardScaler

num_features = ['Age', 'Fare']
for col in num_features:
    scaler = StandardScaler()
    # The scaler expects a 2-D input, so the column is reshaped to (n_rows, 1).
    column_matrix = df[col].to_numpy().reshape(-1, 1)
    x[col] = scaler.fit_transform(column_matrix)

x.head()

# -----------------------------------------------------
# Principal component analysis
from sklearn.decomposition import PCA

# n_components is left at its default; the shape of x_pca shows how many
# components were produced.
pca = PCA()
x_pca = pca.fit_transform(x)
x_pca.shape

#可視化
def plot_2d(x,y):
    """Plot the first two columns of ``x``, one marker style per label in ``y``.

    Draws on the current matplotlib figure; it does not create a figure and
    does not call ``plt.show()``.

    Args:
        x: 2-D array indexed as ``x[:, 0]`` and ``x[:, 1]``.
        y: Labels aligned with the rows of ``x``. Rows with label 0 are drawn
            as blue circles, rows with label 1 as red triangles.
    """
    # Plot order (0, then 1) must match the order of the legend entries below.
    for label_value, marker in ((0, 'bo'), (1, 'r^')):
        mask = y == label_value
        plt.plot(x[:, 0][mask], x[:, 1][mask], marker, ms=15)
    plt.xlabel("First Principal Component")
    plt.ylabel("Second Principal Component")
    # Capitalization matches the legend used by plot_3d.
    plt.legend(['Not Survived', 'Survived'], loc='best')

from mpl_toolkits.mplot3d import Axes3D

def plot_3d(x,y):
    """Plot the first three columns of ``x`` in 3-D, one marker style per label.

    Opens its own 8x6 figure with a 3-D subplot; ``plt.show()`` is left to
    the caller.

    Args:
        x: 2-D array indexed as ``x[:, 0]``, ``x[:, 1]`` and ``x[:, 2]``.
        y: Labels aligned with the rows of ``x``. Rows with label 0 are drawn
            as blue circles, rows with label 1 as red triangles.
    """
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111, projection='3d')

    # Plot order (0, then 1) must match the order of the legend entries below.
    for label_value, marker in ((0, 'bo'), (1, 'r^')):
        mask = y == label_value
        ax.plot(x[:, 0][mask], x[:, 1][mask], x[:, 2][mask], marker, ms=15)

    axis_font_size = 15
    ax.set_xlabel("First Principal Component", fontsize=axis_font_size)
    ax.set_ylabel("Second Principal Component", fontsize=axis_font_size)
    ax.set_zlabel("Third Principal Component", fontsize=axis_font_size)
    ax.legend(['Not Survived', 'Survived'], loc='best', fontsize=16)

plt.figure(figsize=(10,10))
plot_2d(x_pca,y)
plt.show()

#以下の一行を記述することで、３Ｄグラフをマウスで回転できる
%matplotlib notebook
plt.figure(figsize=(5,5))
plot_3d(x_pca,y)
plt.show()

#寄与率
pca.explained_variance_ratio_
#図示
%matplotlib notebook
plt.figure(figsize=(12,8))
plt.plot(pca.explained_variance_ratio_)
plt.xlabel('n_components')
plt.ylabel('explained_variance_ratio_')
plt.show()

%matplotlib notebook
plt.figure(figsize=(12,8))
plt.plot(np.hstack([0,pca.explained_variance_ratio_.cumsum()]))
plt.xlabel('n_components')
plt.ylabel('explained_variance_ratio_')
plt.show()

# Component matrix of the fitted PCA.
pca.components_

# Heat map of that matrix: one row per principal component (numbered from 1),
# one column per column of x.
n_components = len(pca.components_)
n_features = x.shape[1]
plt.matshow(pca.components_, cmap="Greys")
plt.yticks(range(n_components), range(1, n_components + 1))
plt.colorbar()
plt.xticks(range(n_features), x.columns.values, rotation=60, ha='left')
plt.xlabel('Features')
plt.ylabel('Principal Components')
plt.show()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

# Read train.csv; head()/info() are inspection calls for the notebook.
df = pd.read_csv('train.csv')
df.head()
df.info()

# Fill missing values: the column mean for Age, the most frequent value for Embarked.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df.info()

# Drop the columns that are not used as features.
x = df.drop(columns=['PassengerId','Survived','Name','Ticket','Cabin'])
# Target column to be predicted.
y = df['Survived']

x.head()

# Convert the string-valued columns to integer codes.
from sklearn.preprocessing import LabelEncoder

cat_features = ['Sex','Embarked']

# Loop body indentation restored (it was lost in this copy of the listing).
for col in cat_features:
    lbl = LabelEncoder()
    x[col] = lbl.fit_transform(list(df[col].values))

x.head()

# Standardize the numeric columns.
from sklearn.preprocessing import StandardScaler

num_features = ['Age','Fare']

# Loop body indentation restored; the scaler needs a 2-D (n_rows, 1) input.
for col in num_features:
    scaler = StandardScaler()
    x[col] = scaler.fit_transform(np.array(df[col].values).reshape(-1,1))

x.head()

# -----------------------------------------------------
# Principal component analysis
from sklearn.decomposition import PCA

# n_components is left at its default; the shape of x_pca shows how many
# components were produced.
pca = PCA()
x_pca = pca.fit_transform(x)
x_pca.shape

#可視化

def plot_2d(x,y):
    """Plot the first two columns of ``x``, one marker style per label in ``y``.

    Draws on the current matplotlib figure; it does not create a figure and
    does not call ``plt.show()``.

    Args:
        x: 2-D array indexed as ``x[:, 0]`` and ``x[:, 1]``.
        y: Labels aligned with the rows of ``x``. Rows with label 0 are drawn
            as blue circles, rows with label 1 as red triangles.
    """
    plt.plot(x[:,0][y==0],x[:,1][y==0],'bo',ms=15)
    plt.plot(x[:,0][y==1],x[:,1][y==1],'r^',ms=15)
    plt.xlabel("First Principal Component")
    plt.ylabel("Second Principal Component")
    # Capitalization matches the legend used by plot_3d.
    plt.legend(['Not Survived','Survived'],loc='best')

from mpl_toolkits.mplot3d import Axes3D

def plot_3d(x,y):
    """Plot the first three columns of ``x`` in 3-D, one marker style per label.

    Opens its own 8x6 figure with a 3-D subplot; ``plt.show()`` is left to
    the caller.

    Args:
        x: 2-D array indexed as ``x[:, 0]``, ``x[:, 1]`` and ``x[:, 2]``.
        y: Labels aligned with the rows of ``x``. Rows with label 0 are drawn
            as blue circles, rows with label 1 as red triangles.
    """
    fig = plt.figure(figsize=(8,6))
    ax = fig.add_subplot(111,projection='3d')

    ax.plot(x[:,0][y==0],x[:,1][y==0],x[:,2][y==0],'bo',ms=15)
    ax.plot(x[:,0][y==1],x[:,1][y==1],x[:,2][y==1],'r^',ms=15)

    ax.set_xlabel("First Principal Component",fontsize=15)
    ax.set_ylabel("Second Principal Component",fontsize=15)
    ax.set_zlabel("Third Principal Component",fontsize=15)
    ax.legend(['Not Survived','Survived'],loc='best',fontsize=16)

plt.figure(figsize=(10,10))

plot_2d(x_pca,y)

plt.show()

#以下の一行を記述することで、３Ｄグラフをマウスで回転できる

%matplotlib notebook

plt.figure(figsize=(5,5))

plot_3d(x_pca,y)

plt.show()

#寄与率

pca.explained_variance_ratio_

#図示

%matplotlib notebook

plt.figure(figsize=(12,8))

plt.plot(pca.explained_variance_ratio_)

plt.xlabel('n_components')

plt.ylabel('explained_variance_ratio_')

plt.show()

%matplotlib notebook

plt.figure(figsize=(12,8))

plt.plot(np.hstack([0,pca.explained_variance_ratio_.cumsum()]))

plt.xlabel('n_components')

plt.ylabel('explained_variance_ratio_')

plt.show()

# Component matrix of the fitted PCA.
pca.components_

# Heat map of that matrix: one row per principal component (numbered from 1),
# one column per column of x.
n_components = len(pca.components_)
n_features = x.shape[1]
plt.matshow(pca.components_, cmap="Greys")
plt.yticks(range(n_components), range(1, n_components + 1))
plt.colorbar()
plt.xticks(range(n_features), x.columns.values, rotation=60, ha='left')
plt.xlabel('Features')
plt.ylabel('Principal Components')
plt.show()

FRONT

地図と画像のサイト

Youtubeでビデオ・レッスン（15/16）次元削減｜教師なし学習、主成分分析

Be the first to comment

Leave a Reply コメントをキャンセル