ここで解説したコードをまとめて掲載
最後にここで解説したすべてのコードをまとめて掲載しましたので参考にしてください。
### Import the libraries
#from os import terminal_size
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.cross_validation import train_test_split => not found cross_validation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression # Predict using Logistic Regression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.utils.extmath import randomized_range_finder
import warnings
warnings.simplefilter('ignore')
# %%
### Titanic Data Analysis
# 1) Collecting Data
# 2) Analyzing Data
# 3) Data Wrangling => Cleaning Data
# 4) Train & Test
# 5) Accuracy Check
### Load the Titanic training data
# Local CSV path, resolved relative to the current working directory.
csv_file = 'data/csv/titanic/train.csv'
# Alternative: read the CSV from this URL instead of the local path.
#csv_file = 'https://money-or-ikigai.com/menu/python/article/data/titanic/train.csv'
# Parse the CSV into the DataFrame `df` that the following cells analyse and clean.
df = pd.read_csv(csv_file)
# df.info()
# RangeIndex: 891 entries, 0 to 890
# Data columns (total 12 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 PassengerId 891 non-null int64
# 1 Survived 891 non-null int64 0 = No, 1 = Yes
# 2 Pclass 891 non-null int64 Ticket class 1 = 1st, 2 = 2nd, 3 = 3rd
# 3 Name 891 non-null object
# 4 Sex 891 non-null object male or female
# 5 Age 714 non-null float64
# 6 SibSp 891 non-null int64 # of siblings / spouses aboard the Titanic
# 7 Parch 891 non-null int64 # of parents / children aboard the Titanic
# 8 Ticket 891 non-null object Ticket number
# 9 Fare 891 non-null float64
# 10 Cabin 204 non-null object Cabin number
# 11 Embarked 889 non-null object Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton
# dtypes: float64(2), int64(5), object(5)
# %%
### Analysing Data
# Bar chart: number of passengers for each value of the Survived column.
# Optional theming, currently disabled:
#   sns.set_style('darkgrid'); sns.set_palette(['red', 'green'])
ax = sns.countplot(data=df, x='Survived')
# Replace the tick texts of the two Survived categories with readable names.
ax.set_xticklabels(['Dead', 'Survived'])
# Axis labels and title applied in a single batched setter call.
ax.set(
    xlabel=None,
    ylabel='Count',
    title='Dead vs Survived Analysis\n(Titanic)',
)
plt.show()
#seaborn.countplot(*, x=None, y=None, hue=None, data=None, order=None,
# hue_order=None, orient=None, color=None, palette=None, saturation=0.75,
# dodge=True, ax=None, **kwargs)
# %%
# Survival counts broken down by the Sex column (one bar per sex and outcome).
ax = sns.countplot(data=df, x='Survived', hue='Sex')
ax.set_xticklabels(['Dead', 'Survived'])
ax.set(
    xlabel=None,
    ylabel='Count',
    title='Dead vs Survived Analysis\n(Sex)',
)
plt.show()
# %%
# Survival counts broken down by ticket class (Pclass).
ax = sns.countplot(data=df, x='Survived', hue='Pclass')
ax.set_xticklabels(['Dead', 'Survived'])
ax.set(
    xlabel=None,
    ylabel='Count',
    title='Dead vs Survived Analysis\n(Pclass)',
)
plt.show()
# %%
# Number of passengers for each SibSp value (siblings / spouses aboard).
ax = sns.countplot(data=df, x='SibSp')
ax.set(
    xlabel='Number of siblings / spouses aboard',
    ylabel='Count',
    title='Siblings / Spouses Analysis\n(Titanic)',
)
plt.show()
# %%
# Histogram of passenger ages in 10 bins, bars outlined in black.
df['Age'].plot(kind='hist', bins=10, edgecolor='black', linewidth=0.8)
# Label the current axes (the same target plt.xlabel/ylabel/title act on).
plt.gca().set(
    xlabel='Age',
    ylabel='Count',
    title='Age Analysis\n(Titanic)',
)
plt.show()
#DataFrame.plot.hist(by=None, bins=10, **kwargs)
# bins=99, alpha=0.5
# edgecolor='black', linewidth=1.2
# %%
# Histogram of ticket fares in 20 bins on a wider 10x5 figure.
df['Fare'].plot(
    kind='hist',
    bins=20,
    edgecolor='black',
    linewidth=0.8,
    figsize=(10, 5),
)
# Label the current axes (the same target plt.xlabel/ylabel/title act on).
plt.gca().set(
    xlabel='Fare ($)',
    ylabel='Count',
    title='Fare Analysis\n(Titanic)',
)
plt.show()
#DataFrame.plot.hist(by=None, bins=10, **kwargs)
# bins=99, alpha=0.5
# edgecolor='black', linewidth=1.2
# %%
### Data Wrangling
# Count of missing values in each column.
df.isna().sum()
# Grand total over all columns, if needed: df.isna().sum().sum()
# %%
# Heatmap of the missing-value mask (one row per passenger, row labels hidden).
ax = sns.heatmap(data=df.isna(), yticklabels=False)
ax.set(
    xlabel='DataFrame Columns',
    title='Column Null Value Analysis\n(Titanic)',
)
plt.show()
# %%
# Same missing-value heatmap, drawn with the 'viridis' colour map.
ax = sns.heatmap(data=df.isna(), yticklabels=False, cmap='viridis')
ax.set(
    xlabel='DataFrame Columns',
    title='Column Null Value Analysis\n(Titanic)',
)
plt.show()
# %%
# Box plot of the Age distribution within each ticket class.
ax = sns.boxplot(data=df, x='Pclass', y='Age')
# Only the x label and the title are overridden; the y label is left as drawn.
ax.set(
    xlabel='Ticket Class',
    title='Ticket Class vs Age Analysis\n(Titanic)',
)
plt.show()
# %%
### Data Wrangling
# Remove the Cabin column in place (the recorded df.info() shows it is
# populated for only 204 of the 891 rows).
df.drop(columns='Cabin', inplace=True)
# %%
# Then discard, in place, every row that still has any missing value.
df.dropna(axis=0, how='any', inplace=True)
# %%
# Re-draw the missing-value heatmap after cleaning (colour bar hidden).
ax = sns.heatmap(data=df.isna(), yticklabels=False, cbar=False)
ax.set(
    xlabel='DataFrame Columns',
    title='Column Null Value Analysis\n(Titanic)',
)
plt.show()
# %%
# Numeric confirmation: per-column count of missing values after the dropna.
df.isna().sum()
# %%
# Convert categorical variable into dummy/indicator variables.
# drop_first=True omits the first category's column; per the sample output
# recorded below, only a `male` indicator column remains for Sex.
sex = pd.get_dummies(df['Sex'], drop_first=True)
# Variant without drop_first (kept for reference):
#sex = pd.get_dummies(df['Sex'])
# get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
# columns=None, sparse=False, drop_first=False, dtype=None)
#ix male
# 0 1
# 1 0
# 2 0
# 3 0
# 4 1
# ... ...
# 885 0
# 886 1
# 887 0
# 889 1
# 890 1
# %%
# Indicator columns for the port of embarkation. With drop_first=True the
# sample output recorded below shows columns Q and S only, so a row with
# both at 0 corresponds to the omitted first category.
embark = pd.get_dummies(df['Embarked'], drop_first=True)
# Variant without drop_first (kept for reference):
#embark = pd.get_dummies(df['Embarked'])
#ix embark
# Q S
# 0 0 1
# 1 0 0
# 2 0 1
# 3 0 1
# 4 0 1
# ... ... ...
# 885 1 0
# 886 0 1
# 887 0 1
# 889 0 0
# 890 1 0
# %%
# Indicator columns for the ticket class; drop_first=True leaves columns for
# classes 2 and 3 (both 0 means the omitted first class).
pcl = pd.get_dummies(df['Pclass'], drop_first=True)
# Pclass is an integer column, so the generated column labels are the ints
# 2 and 3. Once concatenated with the string-named columns the feature frame
# would carry mixed int/str labels, which scikit-learn refuses when fitting
# on a DataFrame. Cast the labels to the strings '2' and '3' to keep every
# column name a str.
pcl.columns = pcl.columns.astype(str)
# Variant without drop_first (kept for reference):
#pcl = pd.get_dummies(df['Pclass'])
# ix 2 3
# 0 0 1
# 1 0 0
# 2 0 1
# 3 0 0
# 4 0 1
# ... ... ...
# 885 0 1
# 886 1 0
# 887 0 0
# 889 0 0
# 890 0 1
# %%
# Build the modelling frame `dfx`: the cleaned passenger data with the three
# sets of indicator columns appended side by side.
dfx = pd.concat([df, sex, embark, pcl], axis=1) # axis=1 => concatenate along columns
# dfx.info()
# Int64Index: 712 entries, 0 to 890
# Data columns (total 16 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 PassengerId 712 non-null int64
# 1 Survived 712 non-null int64
# 2 Pclass 712 non-null int64
# 3 Name 712 non-null object
# 4 Sex 712 non-null object
# 5 Age 712 non-null float64
# 6 SibSp 712 non-null int64
# 7 Parch 712 non-null int64
# 8 Ticket 712 non-null object
# 9 Fare 712 non-null float64
# 10 Embarked 712 non-null object
# 11 male 712 non-null uint8 *
# 12 Q 712 non-null uint8 *
# 13 S 712 non-null uint8 *
# 14 2 712 non-null uint8 *
# 15 3 712 non-null uint8 *
# dtypes: float64(2), int64(5), object(4), uint8(5)
# %%
# Remove, in place, the original categorical columns that now have indicator
# columns, together with the identifier/text columns excluded from the features.
dfx.drop(
    columns=['Pclass', 'Sex', 'Embarked', 'PassengerId', 'Name', 'Ticket'],
    inplace=True,
)
# dfx.info()
# Int64Index: 712 entries, 0 to 890
# Data columns (total 16 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 PassengerId 712 non-null int64 => drop
# 1 Survived 712 non-null int64
# 2 Pclass 712 non-null int64 => drop
# 3 Name 712 non-null object => drop
# 4 Sex 712 non-null object => drop
# 5 Age 712 non-null float64
# 6 SibSp 712 non-null int64
# 7 Parch 712 non-null int64
# 8 Ticket 712 non-null object => drop
# 9 Fare 712 non-null float64
# 10 Embarked 712 non-null object => drop
# 11 male 712 non-null uint8 *
# 12 Q 712 non-null uint8 *
# 13 S 712 non-null uint8 *
# 14 2 712 non-null uint8 *
# 15 3 712 non-null uint8 *
# dtypes: float64(2), int64(5), object(4), uint8(5)
# %%
### Train & Test Data
# Target vector: the Survived flag.
y = dfx['Survived']
# Feature matrix: every remaining column of dfx.
X = dfx.drop(columns='Survived')
# %%
# Split arrays or matrices into random train and test subsets
# test_size=0.3 holds out 30% of the rows for evaluation; random_state=1 fixes
# the shuffle so the split (and the results recorded further down) is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# train_test_split(*arrays, test_size=None, train_size=None,
# random_state=None, shuffle=True, stratify=None)
# %%
# Logistic Regression classifier
# Only n_jobs and random_state are set explicitly; every other hyper-parameter
# keeps the library default (see the signature reproduced below).
# NOTE(review): n_jobs / random_state may have no effect with the default
# lbfgs solver on a two-class target — confirm against the scikit-learn docs.
# NOTE(review): warnings are silenced at the top of this file, so a solver
# convergence warning (max_iter defaults to 100) would go unnoticed here.
model = LogisticRegression(n_jobs=-1, random_state=0)
# LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True,
# intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs',
# max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
# %%
# Fit the model according to the given training data
model.fit(X_train, y_train)
# To inspect the fitted estimator's settings: model.get_params()
# %%
# Predict class labels for samples in X_test
# `predictions` is compared against y_test by the metric cells that follow;
# the actual and predicted label arrays from one run are recorded below.
predictions = model.predict(X_test)
# actual(y_test)
# array([0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
# 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
# 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
# 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
# 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
# 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
# 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
# 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
# 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
# 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0], dtype=int64)
# predictions
# array([0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
# 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
# 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
# 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
# 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
# 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
# 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
# 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
# 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
# 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], dtype=int64)
# %%
# Build a text report showing the main classification metrics
# classification_report returns one multi-line string. Print it so the table
# is rendered with real line breaks (a bare expression shows the escaped
# string repr in a cell and nothing at all when run as a plain script).
print(classification_report(y_test, predictions))
# classification_report(y_true, y_pred, *, labels=None, target_names=None,
# sample_weight=None, digits=2, output_dict=False, zero_division='warn')
# Recorded output:
# precision recall f1-score support
# class 0 0.80 0.81 0.81 126
# class 1 0.72 0.72 0.72 88
# accuracy 0.77 214
# macro avg 0.76 0.76 0.76 214
# weighted avg 0.77 0.77 0.77 214
# How to read the per-class columns:
# precision = TP / (TP + FP): of the rows predicted as this class, the share
#   that really belong to it.
# recall = TP / (TP + FN): of the rows that really belong to this class, the
#   share that were predicted as it (the true positive rate).
# f1-score = 2 * precision * recall / (precision + recall), the harmonic mean
#   of the two (not their sum). It is high only when both precision and recall
#   are high, so it can serve as a single summary figure to check first.
# support = number of test rows whose true label is this class.
# %%
# Compute confusion matrix to evaluate the accuracy of a classification
confusion_matrix(y_test, predictions)
# confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)
# Recorded result. Rows are the ACTUAL class and columns the PREDICTED class
# (the row sums 126 and 88 equal the per-class supports in the recorded
# classification report). Taking Survived (1) as the positive class:
#                    pred Dead (0)   pred Survived (1)
# actual Dead (0)         102 (TN)        24 (FP)
# actual Survived (1)      25 (FN)        63 (TP)
# ----------------------------------------------------
# column total            127             87
# share wrong              20%            28%
# Of the 127 passengers predicted dead, 25 actually survived (about 20% wrong).
# Of the 87 passengers predicted to survive, 24 actually died (about 28% wrong).
# So this model is less precise when it predicts survival than when it
# predicts death (precision 0.72 vs 0.80 in the recorded report).
# %%
# Accuracy classification score
# Share of test rows whose predicted label equals the true label; with the
# recorded confusion matrix this is (102 + 63) / 214.
accuracy_score(y_test, predictions) # 0.7710 => 77.10%
# accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None)
# The best performance is 1 with normalize == True
# %%