기초 프로젝트 : 은행 고객데이터를 이용한 서비스 분석
기초 프로젝트 : 은행 고객데이터를 이용한 서비스 분석 (1)
기초 프로젝트 : 은행 고객데이터를 이용한 서비스 분석 프로젝트 개요 분석 목적 : 은행 고객데이터를 이용해 서비스의 현황을 분석하고 고객을 분류하기. 데이터 수집 : Kaggle 데이터 소개 Ba
archivenyc.tistory.com
다시 시작한 전처리..! 그래도 목표가 구체화된 덕에 오히려 갖고 가야 할 부분과 버리고 가야 할 부분이 명확해져서 좋았다. 몸이 안 따라줘서 시간을 많이 쏟지는 못했지만, 그래도 밀도 있게 보낸 것 같아 뿌듯했던 하루!
๐ฏ ๋ชฉํ ์ค์
๋์ด, ์ง์ , ์ฐ๊ฐ ์๋ ๋ฑ์ ํน์ฑ์ ๋ฐ๋ผ ๊ณ ๊ฐ์ ์ด๋ป๊ฒ ์ธ๋ถํํ ์ ์์๊น์?
→ ๊ณ ๊ฐ ID(Customer_ID)๋ฅผ ๊ธฐ๋ณธ๊ฐ์ผ๋ก ํ ํ ์ด๋ธ ์์ฑํ๊ธฐ
- ์ซ์ ๋ฐ์ดํฐํ์ : mean์ ์ด์ฉํด ๊ณ ๊ฐ๋ณ ํด๋น ์ปฌ๋ผ๊ฐ์ ํ๊ท ๊ตฌํ๊ธฐ
- ๋ฌธ์ ๋ฐ์ดํฐํ์ : ํ์ฉํ๊ธฐ ์ข๊ฒ ์ ์ฒ๋ฆฌํ๊ธฐ
💾 데이터 전처리
1. 숫자 데이터타입
# 언더바 제거 후 타입 변경 함수
def del_underbar_int(data, col):
    """Strip stray underscores from a column and cast it to ``int64`` in place.

    Args:
        data: DataFrame that owns the column; it is modified in place.
        col: Label of the column to clean.

    Returns:
        None. The cleaned column replaces ``data[col]``.

    Raises:
        ValueError: If a value still cannot be parsed as an integer after the
            underscores have been removed.
    """
    cleaned = []
    for value in data[col]:
        if pd.isnull(value):
            # Missing entries become 0.
            cleaned.append(0)
        elif isinstance(value, str):
            stripped = value.strip('_')
            # A value made only of underscores carries no number; treat it
            # like a missing entry instead of failing the cast.
            cleaned.append(stripped if stripped.strip() else 0)
        else:
            # Already numeric: nothing to strip.
            cleaned.append(value)
    # Assign by position (plain array) so that the result does not depend on
    # the DataFrame's index labels.
    data[col] = pd.Series(cleaned, dtype=object).astype('int64').to_numpy()
def del_underbar_float(data, col):
    """Strip stray underscores from a column and cast it to ``float`` in place.

    Args:
        data: DataFrame that owns the column; it is modified in place.
        col: Label of the column to clean.

    Returns:
        None. The cleaned column replaces ``data[col]``.

    Raises:
        ValueError: If a value still cannot be parsed as a float after the
            underscores have been removed.
    """
    cleaned = []
    for value in data[col]:
        if pd.isnull(value):
            # Missing entries become 0.
            cleaned.append(0)
        elif isinstance(value, str):
            stripped = value.strip('_')
            # A value made only of underscores carries no number; treat it
            # like a missing entry instead of failing the cast.
            cleaned.append(stripped if stripped.strip() else 0)
        else:
            # Already numeric: nothing to strip.
            cleaned.append(value)
    # Assign by position (plain array) so that the result does not depend on
    # the DataFrame's index labels.
    data[col] = pd.Series(cleaned, dtype=object).astype('float').to_numpy()
# Convert the object-typed columns to int / float.
for int_col in ('Age', 'Num_of_Delayed_Payment'):
    del_underbar_int(bank, int_col)
for float_col in (
    'Annual_Income',
    'Outstanding_Debt',
    'Monthly_Balance',
    'Amount_invested_monthly',
):
    del_underbar_float(bank, float_col)
# Filter rows with a boolean mask (Age strictly above 0 and below 109) and
# keep the result as a new DataFrame.
# bank['Age'].describe() reported a mean of 109.7144; ages at or above that
# were judged, on common-sense grounds, to be outliers and are excluded.
mask = bank['Age'].gt(0) & bank['Age'].lt(109)
df1 = bank.loc[mask]
sns.boxplot(data=df1, x='Age')  # check the distribution with a box plot
# Keep only the customer key and the numeric columns.
df1 = df1.loc[:, [
    'Customer_ID',
    'Age',
    'Annual_Income',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Outstanding_Debt',
    'Credit_Utilization_Ratio',
    'Total_EMI_per_month',
    'Monthly_Balance',
    'Amount_invested_monthly',
]]
# Build the Age average and the averages of the other columns separately,
# then join them per customer.
# Age is averaged only over rows with Age <= 100; it is the first column of
# the grouped result.
df11 = df1[df1['Age'].le(100)].groupby('Customer_ID').mean().iloc[:, 0]
# The remaining averaged columns (positions 1 through 8) use every row of df1.
df12 = df1.groupby('Customer_ID').mean().iloc[:, 1:9]
bank_float = pd.merge(left=df11, right=df12, how='inner', on='Customer_ID')
# Display the result.
bank_float
2. 문자 데이터타입
# Count the Type_of_Loan entries correctly: replace ', and ' with ', ' so that
# every entry is separated the same way before counting.
bank['Type_of_Loan'] = bank['Type_of_Loan'].str.replace(', and ', ', ', regex=False)
def get_length(data):
    """Count the comma-separated entries in a string.

    Args:
        data: A comma-separated string, or a missing value (None / NaN).

    Returns:
        int: Number of non-blank entries; 0 for a missing value or for a
        string that contains no entries at all.
    """
    if pd.isnull(data):
        return 0  # a missing value counts as zero entries
    # Skip blank pieces so that '' (or a stray comma) is not counted as an entry.
    return sum(1 for item in data.split(',') if item.strip())
# Store the per-row entry count of Type_of_Loan in Num_of_Loan.
bank['Num_of_Loan'] = bank['Type_of_Loan'].map(get_length)
# 📌< Credit_History_Age 컬럼에서 앞 2자리 가져오기 >
def extract_years(data):
    """Return the first whitespace-separated token of a string.

    Args:
        data: A string, or a missing value (None / NaN).

    Returns:
        The first token as a ``str``; the integer 0 when the value is missing
        or the string holds no token (empty / whitespace only).

    NOTE(review): the token is returned as text while the fallback is the
    integer 0, so a column built from this function can mix str and int.
    Kept as-is for compatibility; callers may want to cast afterwards.
    """
    if pd.isnull(data):
        return 0
    tokens = data.split()
    if not tokens:
        # Blank string: there is no first token to take.
        return 0
    return tokens[0]
# Replace each Credit_History_Age entry with the value returned by extract_years.
bank['Credit_History_Age'] = bank['Credit_History_Age'].map(extract_years)
bank['Credit_History_Age']  # display the converted column
# String columns: customer table (strings) - Customer_ID, Occupation,
# Type of loan, Credit Mix.
# 1. Take only those columns from the bank DataFrame as a new frame, bank_str.
str_columns = ['Customer_ID', 'Occupation', 'Type_of_Loan', 'Credit_Mix']
bank_str = bank.loc[:, str_columns]
# 2. '_'가 포함된 이상값 4개씩 반복되는 문자열로 채우기 - 데이터 4개 동일하게 만들기
# 3. 동일한 4개의 데이터 1개로 줄이기
import pandas as pd
import numpy as np
def fill_missing_values(df):
    """Harmonise ``Occupation`` and ``Credit_Mix`` within each run of four rows.

    The frame is processed in consecutive groups of four rows (by position).
    Within a group, the most frequent value that does not contain ``'_'`` is
    written to all rows of the group; a group without any such value is left
    untouched.

    Args:
        df: DataFrame with string columns ``Occupation`` and ``Credit_Mix``.
            It is modified in place.

    Returns:
        The same DataFrame object, after modification.
    """
    group_size = 4
    for column in ('Occupation', 'Credit_Mix'):
        col_pos = df.columns.get_loc(column)
        for start in range(0, len(df), group_size):
            stop = start + group_size
            # Read and write by position so the result does not depend on the
            # index labels of ``df``.
            values = df.iloc[start:stop, col_pos]
            # Literal match; missing values are not placeholders (mode() below
            # ignores them anyway).
            is_placeholder = values.str.contains('_', regex=False, na=False)
            modes = values[~is_placeholder].mode()
            if not modes.empty:
                # On a tie, mode() returns the candidates sorted; take the first.
                df.iloc[start:stop, col_pos] = modes.iloc[0]
    return df
# Fill the missing / placeholder values.
bank_str_filled = fill_missing_values(bank_str)
# Remove duplicated rows, keeping only the first row per customer. The explicit
# copy makes the result an independent DataFrame, so adding or overwriting
# columns on it afterwards does not trigger pandas' SettingWithCopyWarning.
bank_str_unique = bank_str_filled.drop_duplicates(subset='Customer_ID', keep='first').copy()
# Display the transformed DataFrame.
bank_str_unique
# 📌 bank_str_unique 에서 Type_of_Loan 갯수 제대로 세고 Num_of_Loan 컬럼 추가하기
def get_length(data):
    """Count the comma-separated entries in a string.

    Args:
        data: A comma-separated string, or a missing value (None / NaN).

    Returns:
        int: Number of non-blank entries; 0 for a missing value or for a
        string that contains no entries at all.
    """
    if pd.isnull(data):
        return 0  # a missing value counts as zero entries
    # Skip blank pieces so that '' (or a stray comma) is not counted as an entry.
    return sum(1 for item in data.split(',') if item.strip())
# Count the entries first, while missing Type_of_Loan values are still NaN
# (get_length returns 0 for those).
bank_str_unique['Num_of_Loan'] = bank_str_unique['Type_of_Loan'].map(get_length)
# Then replace the missing Type_of_Loan values (NaN) with an empty string.
bank_str_unique['Type_of_Loan'] = bank_str_unique['Type_of_Loan'].fillna(value='')
3. 데이터 결합하기 및 데이터 가공
# Join bank_float with bank_str_unique on the customer key.
df = bank_float.merge(bank_str_unique, how='inner', on='Customer_ID')
# Classify each customer into an age decade from Age (e.g. 34.5 -> 30).
df['age_group'] = (df['Age'] // 10).astype('int64') * 10