Loading Libraries
# Load the basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Basic Problems
1. Logistic Regression classification on the Iris dataset
Train a Logistic Regression model on the Iris dataset and compute its accuracy.
Solution process
- Load the data
- Split into train and test sets
💡 The role of stratify in train_test_split
It splits the data so that each split keeps the class proportions of the original labels.
→ Accuracy differs depending on whether stratify is used: the test accuracy of 1.0 obtained without it looked less trustworthy than the 0.933 obtained with it, so stratify was applied.
Then does iris really need the stratified-sampling option?
Plotting the label distributions
plt.figure(figsize=(16,9))
plt.subplot(1,2,1)
sns.histplot(iris_y_train, kde=True, color='green')
plt.title('train')
plt.subplot(1,2,2)
sns.histplot(iris_y_test, kde=True, color='pink')
plt.title('test')
plt.show()
Stratified sampling is mainly recommended when the classes are imbalanced or the amount of data is small. For iris, the training split holds only 105 rows, which is quite small compared with typical datasets, so using stratify lets the split keep a relatively even class distribution.
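As a quick check, a minimal sketch that compares the per-class counts of both splits with and without stratification:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
iris = load_iris()
for strat in (None, iris.target):
    _, _, y_tr, y_te = train_test_split(iris.data, iris.target, test_size=0.3,
                                        random_state=42, stratify=strat)
    name = 'with stratify' if strat is not None else 'without stratify'
    # np.bincount counts the samples per class (0, 1, 2)
    print(name, '| train:', np.bincount(y_tr), '| test:', np.bincount(y_te))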
- Run a Grid Search to find better hyperparameters
After the grid search, setting max_iter to 100 and solver to 'sag' raised the test accuracy from 0.9333 to 0.9778.
- Compute the final accuracy
Submitted answer
# Load the data
from sklearn.datasets import load_iris
iris = load_iris()
iris_X, iris_y = iris.data, iris.target
# Split into train and test
from sklearn.model_selection import train_test_split
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(iris_X, iris_y,
                                                                         test_size=0.3,
                                                                         shuffle=True,
                                                                         stratify=iris_y,
                                                                         random_state=42)
# Import the estimator and set up the model
from sklearn.linear_model import LogisticRegression
model_lor = LogisticRegression(random_state=42)
# Fit the model
model_lor.fit(iris_X_train, iris_y_train)
# Compute accuracy for model_lor
from sklearn.metrics import accuracy_score
iris_pred_test = model_lor.predict(iris_X_test)
accuracy = accuracy_score(iris_y_test, iris_pred_test)
print(f'model_lor test accuracy: {accuracy:.4f}')  ## model_lor test accuracy: 0.9333
# Grid Search
from sklearn.model_selection import GridSearchCV
params = {'solver' : ['newton-cg','lbfgs','liblinear','sag','saga'],
'max_iter' : [10,50,100]}
grid_lor = GridSearchCV(model_lor, param_grid = params, scoring='accuracy', cv = 5)
grid_lor.fit(iris_X_train, iris_y_train)
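# (Optional) Inspect the search before refitting: GridSearchCV exposes the
# best parameter combination and its mean cross-validated accuracy.
print(grid_lor.best_params_)                    # e.g. {'max_iter': 100, 'solver': 'sag'}
print(f'best CV accuracy: {grid_lor.best_score_:.4f}')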
# Build model_lor2 with the best combination found by the grid search
iris_best_max_iter = grid_lor.best_params_['max_iter']
iris_best_solver = grid_lor.best_params_['solver']
model_lor2 = LogisticRegression(random_state=42, max_iter = iris_best_max_iter, solver = iris_best_solver)
model_lor2.fit(iris_X_train,iris_y_train)
# Compute accuracy for model_lor2
iris_pred_test2 = model_lor2.predict(iris_X_test)
accuracy2 = accuracy_score(iris_y_test, iris_pred_test2)
print(f'model_lor2 test accuracy: {accuracy2:.4f}')  ## model_lor2 test accuracy: 0.9778
2. Housing price prediction with Linear Regression
Build a regression model that predicts house prices. (The Boston housing dataset has been removed from scikit-learn, so the California housing dataset is used here instead.)
Solution process
- Load the data
- Split into train and test sets: 20640 rows in total, no stratify (split into 14448 / 6192)
- Fit the model
- Compute the MSE on the test data
Submitted answer
# Load the data
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing_X, housing_y = housing['data'], housing['target']
# Split into train and test
from sklearn.model_selection import train_test_split
housing_X_train, housing_X_test, housing_y_train, housing_y_test = train_test_split(housing_X, housing_y,
                                                                                    test_size=0.3,
                                                                                    shuffle=True,
                                                                                    random_state=42)
# Import the estimator and set up the model
from sklearn.linear_model import LinearRegression
model_lr = LinearRegression()
# Fit the model
model_lr.fit(housing_X_train, housing_y_train)
# Compute the MSE
from sklearn.metrics import mean_squared_error
housing_pred = model_lr.predict(housing_X_test)
mse = mean_squared_error(housing_y_test, housing_pred).round(4)
print(f'MSE: {mse}')  ## MSE: 0.5306
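MSE is in squared target units, so RMSE and R² are often reported alongside it; a short follow-up sketch on the same predictions:
from sklearn.metrics import r2_score
rmse = mean_squared_error(housing_y_test, housing_pred) ** 0.5  # same units as the target
r2 = r2_score(housing_y_test, housing_pred)                     # fraction of variance explained
print(f'RMSE: {rmse:.4f}, R^2: {r2:.4f}')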
3. Classifying the iris data with a DecisionTree
Train a DecisionTree model on the Iris dataset and compute its accuracy.
Solution process
- Load the data / split into train and test (omitted; same as problem 1)
- Import the estimator and fit it
→ Tuned max_depth to find the best value (a compact loop version is sketched after the three accuracy blocks below).
Accuracy was higher with max_depth=5 than with 4, and did not improve further at 6, so the most efficient max_depth here is 5.
from sklearn.tree import DecisionTreeClassifier, plot_tree
model_dt_max_4 = DecisionTreeClassifier(random_state=42, max_depth=4)
model_dt_max_5 = DecisionTreeClassifier(random_state=42, max_depth=5)
model_dt_max_6 = DecisionTreeClassifier(random_state=42, max_depth=6)
# fitting
model_dt_max_4.fit(iris_X_train,iris_y_train)
model_dt_max_5.fit(iris_X_train,iris_y_train)
model_dt_max_6.fit(iris_X_train,iris_y_train)
# Accuracy - max_depth = 4
iris_dt_pred_4 = model_dt_max_4.predict(iris_X_test)
accuracy_dt_4 = accuracy_score(iris_y_test, iris_dt_pred_4)
print(f'Decision Tree test accuracy with max_depth=4: {accuracy_dt_4:.4f}')
# Accuracy - max_depth = 5
iris_dt_pred_5 = model_dt_max_5.predict(iris_X_test)
accuracy_dt_5 = accuracy_score(iris_y_test, iris_dt_pred_5)
print(f'Decision Tree test accuracy with max_depth=5: {accuracy_dt_5:.4f}')
# Accuracy - max_depth = 6
iris_dt_pred_6 = model_dt_max_6.predict(iris_X_test)
accuracy_dt_6 = accuracy_score(iris_y_test, iris_dt_pred_6)
print(f'Decision Tree test accuracy with max_depth=6: {accuracy_dt_6:.4f}')
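The three near-identical blocks above can be collapsed into a loop; a minimal equivalent sketch:
# Same comparison, looping over the candidate depths
for depth in (4, 5, 6):
    model = DecisionTreeClassifier(random_state=42, max_depth=depth)
    model.fit(iris_X_train, iris_y_train)
    acc = accuracy_score(iris_y_test, model.predict(iris_X_test))
    print(f'max_depth={depth}: test accuracy {acc:.4f}')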
- Draw the decision tree
from sklearn.tree import plot_tree
# Draw the decision tree
X_features = iris.feature_names
iris_class = iris.target_names
plt.figure(figsize=(16,9))
plot_tree(model_dt_max_5,
          feature_names=X_features,
          class_names=iris_class,
          filled=True  # color each node box by its majority class
          )
plt.show()
- Compute the model's accuracy on the test data
Submitted answer
# Load the library
from sklearn.tree import DecisionTreeClassifier
# Set up the model
model_dt = DecisionTreeClassifier(random_state=42, max_depth=5)
# Fit the model
model_dt.fit(iris_X_train, iris_y_train)
# Compute accuracy
iris_dt = model_dt.predict(iris_X_test)
accuracy_dt = accuracy_score(iris_y_test, iris_dt)
print(f'Decision Tree test accuracy: {accuracy_dt:.4f}')
## Decision Tree test accuracy: 0.9333
Challenge Problems
4. Predict survival on the Titanic data with a random forest classifier
- Use only the following features: ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]
- Evaluate performance with accuracy
Solution process
- Load the data
- Create a 'Family' variable from 'SibSp' + 'Parch'
- Remove Fare outliers
Fares above 500 stand out as outliers
→ only 3 such rows exist, so they were dropped (an IQR-based alternative is sketched after the plot code below).
sns.histplot(data=titanic, x='Fare', kde=True)
plt.title('Distribution of Fare')
# Remove Fare outliers with a simple fixed threshold (keep fares below 512)
mask = (titanic['Fare'] < 512)
titanic = titanic[mask]
sns.histplot(data=titanic, x='Fare', kde=True, color='red')
plt.title('Distribution of Fare (outlier removed)')
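For reference, a minimal sketch of what a textbook IQR rule would flag instead; on a skewed column like Fare it would likely mark far more rows than the fixed cutoff:
# IQR rule: values above Q3 + 1.5 * IQR count as outliers
q1, q3 = titanic['Fare'].quantile([0.25, 0.75])
upper = q3 + 1.5 * (q3 - q1)
print((titanic['Fare'] > upper).sum(), 'fares above', round(upper, 2))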
- Split into train and test sets
- Preprocessing
- The missing-value check found 177 missing values in the 'Age' column → fill them with the mean
- Label-encode Pclass (1, 2, 3) and Sex ('male', 'female')
- Standardize Fare; min-max normalize Age and Family
- Import the estimator and fit it (random forest)
- Compute the model's accuracy on the test data: the RandomForest model's accuracy is 0.933.
Submitted answer
# Load the data
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic = pd.read_csv(url)
# SibSp + Parch = Family (+1 counts the passenger themselves)
titanic['Family'] = titanic['SibSp'] + titanic['Parch'] + 1
# Feature frame (the six required features plus Family) and the Survived label;
# .copy() so the preprocessing below can add columns safely
titanic_X = titanic[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Family']].copy()
titanic_y = titanic['Survived']
# Split into train and test
from sklearn.model_selection import train_test_split
titanic_X_train, titanic_X_test, titanic_y_train, titanic_y_test = train_test_split(titanic_X, titanic_y,
                                                                                    test_size=0.3,
                                                                                    shuffle=True,
                                                                                    stratify=titanic_y,
                                                                                    random_state=42)
# Preprocessing
# Missing-value handling
def preprocess_missing(df):
    # Age: fill the 177 missing values with the column mean
    age_mean = df['Age'].mean()
    df['Age'] = df['Age'].fillna(age_mean)
preprocess_missing(titanic_X_train)
preprocess_missing(titanic_X_test)
# Encoding
def label_encoder(df):
    from sklearn.preprocessing import LabelEncoder
    # Label-encode Pclass
    le1 = LabelEncoder()
    df['Pclass_le'] = le1.fit_transform(df['Pclass'])
    # Label-encode Sex
    le2 = LabelEncoder()
    df['Sex_le'] = le2.fit_transform(df['Sex'])
label_encoder(titanic_X_train)
label_encoder(titanic_X_test)
# Scaling
def preprocess_scaling(df):
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    # Standardize Fare
    sd_sc = StandardScaler()
    df['Fare_sd_sc'] = sd_sc.fit_transform(df[['Fare']])
    # Min-max normalize Age and Family
    mm_sc = MinMaxScaler()
    mm_sc.fit(df[['Age', 'Family']])
    df[['Age_mm_sc', 'Family_mm_sc']] = mm_sc.transform(df[['Age', 'Family']])
preprocess_scaling(titanic_X_train)
preprocess_scaling(titanic_X_test)
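# Note: label_encoder and preprocess_scaling above fit their transformers
# separately on the train and test frames. The usual leak-free pattern is to
# fit on the training split only and reuse it, e.g. (hypothetical sketch):
# sd_sc = StandardScaler().fit(titanic_X_train[['Fare']])
# titanic_X_train['Fare_sd_sc'] = sd_sc.transform(titanic_X_train[['Fare']])
# titanic_X_test['Fare_sd_sc'] = sd_sc.transform(titanic_X_test[['Fare']])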
# Random Forest
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(random_state=42)
# Fit on the engineered columns only (label-encoded + scaled)
titanic_X_train2 = titanic_X_train[['Pclass_le', 'Sex_le', 'Fare_sd_sc', 'Age_mm_sc', 'Family_mm_sc']]
model_rf.fit(titanic_X_train2, titanic_y_train)
# Evaluate
from sklearn.metrics import accuracy_score
X_test = titanic_X_test[['Pclass_le', 'Sex_le', 'Fare_sd_sc', 'Age_mm_sc', 'Family_mm_sc']]
y_pred = model_rf.predict(X_test)
accuracy = accuracy_score(titanic_y_test, y_pred)
print(f'RandomForest model accuracy: {accuracy:.3f}')
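A random forest also reports per-feature importances, which is a quick sanity check on what the model actually uses; a short sketch with the fitted model:
# Mean decrease in impurity per feature, highest first
importances = pd.Series(model_rf.feature_importances_, index=titanic_X_train2.columns)
print(importances.sort_values(ascending=False))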
5. Perform clustering on the iris data and visualize it
- Set the number of clusters to 3.
- To visualize the clustering result, draw a scatter plot with the first feature of the data on the X axis and the second feature on the Y axis!
Solution process
- Load the data
- Because the data is an array, it has to be split out per feature to draw the scatter plot!
- To judge whether normalization improves performance, also build a MinMaxScaler-normalized comparison set, iris_X_mm_sc.
- K-Means Clustering
- Normalized model (MinMaxScaler): inertia is 4.115 and the silhouette coefficient is 0.444.
- Without normalization: inertia is 37.051 and the silhouette coefficient is 0.445.
Computing silhouette scores and visualizing the clusters with visualize_silhouette showed a slightly smaller silhouette coefficient for the normalized version, but the gap is only 0.001, far too small to discriminate between the two. Inertia was therefore computed as an additional check.
- Visualize the clustering results (seaborn scatter plot).
plt.figure(figsize=(18,6))
plt.subplot(1,3,1)
# Final clustering model as a scatter plot
sns.scatterplot(x=X, y=y, hue=labels, palette='pastel')
plt.title('Original KMeans Clustering')
plt.subplot(1,3,2)
sns.scatterplot(x=X, y=y, hue=iris_y, palette='pastel')
plt.title('Original')
plt.subplot(1,3,3)
# Normalized clustering model as a scatter plot
sns.scatterplot(x=X_mm_sc, y=y_mm_sc, hue=labels_mm, palette='pastel')
plt.title('MinMax Scaled KMeans Clustering')
plt.show()
Submitted answer
# Load the data
iris = load_iris()
iris_X, iris_y = iris.data, iris.target
# Scaled vs. original
from sklearn.preprocessing import MinMaxScaler
mm_sc = MinMaxScaler()
iris_X_mm_sc = mm_sc.fit_transform(iris_X)
# Split the arrays out per feature
# First feature on the X axis
X = np.array(iris_X).T[0]
X_mm_sc = np.array(iris_X_mm_sc).T[0]
# Second feature on the Y axis
y = np.array(iris_X).T[1]
y_mm_sc = np.array(iris_X_mm_sc).T[1]
# Load the libraries
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# KMeans clustering
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42)
kmeans_mm_sc = KMeans(n_clusters=3, init='k-means++', random_state=42)
# Fit on the full 4-feature data
# (note: overwritten below, since fit_predict refits on the 2 plotted features)
kmeans.fit(iris_X)
kmeans_mm_sc.fit(iris_X_mm_sc)
# Evaluate on the two plotted features
X_features = np.column_stack((X, y))
X_features_mm = np.column_stack((X_mm_sc, y_mm_sc))
labels = kmeans.fit_predict(X_features)
labels_mm = kmeans_mm_sc.fit_predict(X_features_mm)
print(f'Model inertia: {kmeans.inertia_:.3f}, silhouette coefficient: {silhouette_score(X_features, labels).round(3)}')
## Model inertia: 37.051, silhouette coefficient: 0.445
print(f'Normalized model inertia: {kmeans_mm_sc.inertia_:.3f}, silhouette coefficient: {silhouette_score(X_features_mm, labels_mm).round(3)}')
## Normalized model inertia: 4.115, silhouette coefficient: 0.444
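Since inertia always decreases as the cluster count grows, the elbow method is a common way to pick k; a minimal sketch on the unscaled two-feature data:
# Elbow method: inertia for k = 1..8
inertias = []
for k in range(1, 9):
    km = KMeans(n_clusters=k, init='k-means++', random_state=42)
    inertias.append(km.fit(X_features).inertia_)
plt.plot(range(1, 9), inertias, marker='o')
plt.xlabel('number of clusters k')
plt.ylabel('inertia')
plt.title('Elbow method')
plt.show()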
# Scatter plot (single-panel version; see the three-panel comparison above)
sns.scatterplot(x=X, y=y, hue=labels, palette='pastel')
plt.title('Original KMeans Clustering')
plt.show()
6. Classify the MNIST data with deep learning
- Build your own neural network
- Train it and compute the accuracy.
Solution process
- Load the data / split into train and test
- Normalize the pixel values into the 0-1 range
- Import the framework and fit the model
- Flatten the 2-D image data into 1-D so Dense layers can be used
- Add the input, hidden, and output layers
- Compile (classification uses a cross-entropy loss)
- Evaluate
Submitted answer
# Load the data
from tensorflow.keras.datasets import mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Normalize the pixel values only (the labels must stay as integers 0-9;
# dividing them by 255 would break sparse_categorical_crossentropy)
X_train = X_train / 255.
X_test = X_test / 255.
# Load the tensorflow libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
# Initialize a Sequential model
model = Sequential()
model.add(Flatten(input_shape=[28, 28]))  # flatten each 28x28 image into a 1-D vector
model.add(Dense(128, activation="relu"))
model.add(Dense(128, activation="relu"))
model.add(Dense(10, activation="softmax"))  # softmax output for multi-class classification
# Compile
model.compile(loss="sparse_categorical_crossentropy",  # cross-entropy for integer class labels
              optimizer="sgd",
              metrics=["accuracy"])
model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test))
# Evaluate
model.evaluate(X_test, y_test)
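model.predict returns a softmax distribution over the ten digits for each image; a short sketch turning that into class predictions:
probs = model.predict(X_test[:5])   # shape (5, 10): one probability row per image
preds = np.argmax(probs, axis=1)    # most probable digit per image
print('predicted:', preds, '| actual:', y_test[:5])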