< 분류 모델 실습 >
문제) pima-indians-diabetes.csv 파일을 읽어서, 당뇨병을 분류하는 모델을 만드시오.
컬럼 정보 :
Preg=no. of pregnancy
Plas=Plasma
Pres=blood pressure
skin=skin thickness
test=insulin test
mass=body mass
pedi=diabetes pedigree function
age=age
class=target(diabetic or not, 1:diabetic, 0:not diabetic)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Load the data (path is relative to the notebook location)
df = pd.read_csv('../data/pima-indians-diabetes.csv')
df
# It is best to explore the data before modeling;
# analysis habits like this should be practiced regularly.
# A minimum value of 0 for columns such as Pres (blood pressure),
# skin (skin thickness) and mass looks suspicious.
df.describe()
df.isna().sum()
Preg 0
Plas 0
Pres 0
skin 0
test 0
mass 0
pedi 0
age 0
class 0
dtype: int64
# There are no missing (NaN) entries,
# but several columns have a suspicious minimum of 0.
# It looks like 0 was used as a placeholder for missing values.
# The goal is not blind preprocessing but shaping the data so it can be
# analysed correctly, so convert the 0s in the Plas..mass columns to NaN.
# NaN is a NumPy concept, so write np.nan, not the string 'nan'.
df.loc[ : ,'Plas' : 'mass' ] = df.loc[ : ,'Plas' : 'mass' ].replace(0, np.nan)
# Confirm the NaNs now appear
df.isna().sum()
Preg 0
Plas 5
Pres 35
skin 227
test 374
mass 11
pedi 0
age 0
class 0
dtype: int64
# Remove every row with a placeholder-missing value before analysis.
# NOTE(review): this discards roughly half the rows (768 -> 392);
# imputation could be considered instead - verify which is appropriate.
df = df.dropna()
# Re-number the index so it is contiguous after the deletions.
df.reset_index(drop=True, inplace=True)
# Split features (X) and target (y); the target is the `class` column.
# (This cell was garbled in the transcript: the comment and the code
# were fused onto a single line.)
y = df['class']
y
0 0
1 1
2 1
3 1
4 1
..
387 1
388 1
389 0
390 0
391 0
Name: class, Length: 392, dtype: int64
# Feature matrix: every column from Preg through age
X = df.loc[ : , 'Preg' : 'age' ]
X
# Before modeling, compare how much data each target class has.
import seaborn as sb
import matplotlib.pyplot as plt
sb.countplot(data=df , x = 'class')
plt.show()
# 당뇨병인 사람의 데이터가 반정도 적으니까, (당뇨병일 경우 1)
# 데이터가 불균형하기 때문에 학습효율을 높이기 위해서 up sampling 기법으로, 당뇨병인 사람의 데이터를 늘려준다.
# NaN을 모두 drop한 영향도 있고, 데이터 자체가 불균형하게 수집된 경우도 있을수 있음
# imblearn 라이브러리를 이용한다!
pip install imblearn
# 오리지널 주피터에선 ! pip install imblearn 으로 앞에 ! 를 붙여줘야함
Defaulting to user installation because normal site-packages is not writeable
Collecting imblearn
Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Requirement already satisfied: imbalanced-learn in c:\programdata\anaconda3\lib\site-packages (from imblearn) (0.11.0)
Requirement already satisfied: numpy>=1.17.3 in c:\programdata\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.26.4)
Requirement already satisfied: scipy>=1.5.0 in c:\programdata\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.11.4)
Requirement already satisfied: scikit-learn>=1.0.2 in c:\programdata\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.2.2)
Requirement already satisfied: joblib>=1.1.1 in c:\programdata\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\programdata\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (2.2.0)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0
Note: you may need to restart the kernel to use updated packages.
from imblearn.over_sampling import SMOTE
# NOTE(review): SMOTE is applied to the full data set below; to avoid
# leaking synthetic samples into the test split, it is usually applied
# to the training data only - confirm this is acceptable for practice.
sm = SMOTE(random_state=5)
# Confirm the original row count (392)
X.shape
(392, 8)
# Run the resampling (returns the resampled features and target)
sm.fit_resample(X, y)
( Preg Plas Pres skin test mass pedi \
0 1 89.000000 66.000000 23.000000 94.000000 28.100000 0.167000
1 0 137.000000 40.000000 35.000000 168.000000 43.100000 2.288000
2 3 78.000000 50.000000 32.000000 88.000000 31.000000 0.248000
3 2 197.000000 70.000000 45.000000 543.000000 30.500000 0.158000
4 1 189.000000 60.000000 23.000000 846.000000 30.100000 0.398000
.. ... ... ... ... ... ... ...
519 6 109.000000 66.829196 40.310115 123.825861 33.074953 0.557463
520 4 170.419053 58.961454 31.007709 177.480727 32.286123 0.459002
521 10 155.428788 80.287876 28.000000 152.143938 33.728788 1.282679
522 0 127.856910 67.959117 19.224856 179.693378 30.577678 1.366716
523 0 118.240273 84.480546 45.077818 227.597272 45.751945 0.612750
age
0 21
1 33
2 26
3 53
4 59
.. ...
519 27
520 38
521 47
522 25
523 30
[524 rows x 8 columns],
0 0
1 1
2 1
3 1
4 1
..
519 1
520 1
521 1
522 1
523 1
Name: class, Length: 524, dtype: int64)
ㄴ 결과는 (피쳐 DataFrame, 타겟 Series) 두 개를 담은 하나의 튜플로 출력됨
# Run the resampling again and store the result back into X and y
X,y = sm.fit_resample(X, y)
# Confirm the row count grew to 524
X.shape
(524, 8)
# Confirm the two classes are now balanced
y.value_counts()
class
0 262
1 262
Name: count, dtype: int64
# No string columns remain, so go straight to feature scaling
from sklearn.preprocessing import StandardScaler
scaler_X = StandardScaler()
# NOTE(review): fitting the scaler on the full data (before the
# train/test split) leaks test-set statistics into training - normally
# fit on X_train only and transform X_test with the fitted scaler.
scaler_X.fit_transform(X)
array([[-0.75285334, -1.19723252, -0.41956655, ..., -0.84538944,
-1.1525669 , -1.08174618],
[-1.05376497, 0.29753312, -2.59049008, ..., 1.43499219,
5.36673957, 0.11853968],
[-0.15103007, -1.53978298, -1.75551949, ..., -0.40451566,
-0.90359762, -0.58162707],
...,
[ 1.95535134, 0.87142309, 0.77342911, ..., 0.01032949,
2.27668921, 1.51887318],
[-1.05376497, 0.01280859, -0.25598603, ..., -0.46871941,
2.53499239, -0.68165089],
[-1.05376497, -0.28666263, 1.12350465, ..., 1.83815536,
0.21753282, -0.18153178]])
# Store the scaled features. fit_transform is run a second time here;
# scaler_X.transform(X) would give the same result because the scaler
# was already fitted in the previous cell.
X = scaler_X.fit_transform(X)
X
array([[-0.75285334, -1.19723252, -0.41956655, ..., -0.84538944,
-1.1525669 , -1.08174618],
[-1.05376497, 0.29753312, -2.59049008, ..., 1.43499219,
5.36673957, 0.11853968],
[-0.15103007, -1.53978298, -1.75551949, ..., -0.40451566,
-0.90359762, -0.58162707],
...,
[ 1.95535134, 0.87142309, 0.77342911, ..., 0.01032949,
2.27668921, 1.51887318],
[-1.05376497, 0.01280859, -0.25598603, ..., -0.46871941,
2.53499239, -0.68165089],
[-1.05376497, -0.28666263, 1.12350465, ..., 1.83815536,
0.21753282, -0.18153178]])
# Split into train/test data
from sklearn.model_selection import train_test_split
# features X, target y, 20% held out for testing, random seed 2
train_test_split( X, y , test_size= 0.2, random_state=2)
[array([[-1.05376497, -0.94810492, -0.58656067, ..., 0.4772319 ,
0.17834291, -0.68165089],
[-1.05376497, 0.01726456, -0.25257243, ..., -0.48052838,
2.60963542, -0.68165089],
[-0.15103007, 0.3861968 , -0.67144443, ..., -0.07339296,
-0.21560048, -0.58162707],
...,
[-0.75285334, -0.263004 , -2.25650185, ..., 0.27959883,
-0.80523939, -0.68165089],
[ 0.45079319, -0.64099183, 0.66510779, ..., 1.360907 ,
0.04370907, 0.71868261],
[-1.05376497, 1.16947975, 0.41540404, ..., 2.1647143 ,
-0.86978698, -0.58162707]]),
array([[-1.05376497e+00, 1.07605689e+00, 4.15404044e-01,
2.62629477e+00, -5.48778080e-01, 2.97044914e+00,
6.67060370e-01, -6.81650891e-01],
[-1.05376497e+00, 1.60545306e+00, -1.75551949e+00,
6.10128600e-01, -4.13781419e-02, 6.29257345e-01,
-2.67342820e-01, -9.81722356e-01],
[ 1.49881556e-01, 4.84055124e-02, -9.20548901e-01,
-1.80927081e+00, 5.77821783e-01, -9.36604702e-01,
-4.60368017e-02, -8.15079615e-02],
[-1.51030074e-01, -1.19723252e+00, 2.48409926e-01,
-1.40603757e+00, -6.77778064e-01, -4.95730921e-01,
2.77318712e-02, 6.18658790e-01],
[-1.51030074e-01, 5.08938562e-04, -3.93925745e-01,
-5.21914247e-01, -3.96630333e-01, -9.57692766e-01,
1.00504059e-01, 2.18563503e-01],
[ 4.50793187e-01, 2.66392169e-01, 1.08338052e+00,
1.11417014e+00, -6.51978067e-01, 2.03586109e-01,
-7.86797225e-01, 3.18587325e-01],
[-4.51941705e-01, -1.63320584e+00, -5.86560665e-01,
-5.99571103e-01, -9.35778033e-01, -6.02148730e-01,
-5.28606870e-01, 1.18539682e-01],
[-4.51941705e-01, -1.40469665e+00, 9.55427955e-01,
1.18151568e-02, -8.52351508e-01, 1.57788751e-01,
1.80471560e+00, -5.81627069e-01],
[-7.52853335e-01, 6.71224531e-01, -2.52572429e-01,
-9.55295600e-02, -3.16578108e-01, -6.62958907e-01,
-5.93154459e-01, 1.01875408e+00],
[ 1.35352808e+00, -8.54682064e-01, 2.48409926e-01,
1.01336183e+00, 4.40221799e-01, 8.72498052e-01,
3.65838289e-01, 1.11877790e+00],
[-7.52853335e-01, 1.27409675e+00, 1.91654167e-01,
-7.57046882e-01, -1.96683814e-01, -1.04839183e+00,
-5.51004792e-01, -2.81555605e-01],
[-7.52853335e-01, -1.13495062e+00, -1.42153126e+00,
-4.98762794e-01, -5.48778080e-01, -1.28626322e+00,
-9.46629350e-01, -8.81698534e-01],
[ 1.05261645e+00, 7.02365482e-01, 5.82398162e-01,
-9.55295600e-02, -3.25178107e-01, 2.33991198e-01,
4.61122825e-01, 2.21903994e+00],
[-4.51941705e-01, -1.25951443e+00, -1.08754302e+00,
-1.40603757e+00, -9.61578030e-01, -1.46072406e-01,
-1.15564059e+00, -6.81650891e-01],
[-1.05376497e+00, -8.54682064e-01, -8.55783104e-02,
-3.97954486e-01, -9.78778027e-01, -4.34920745e-01,
1.69121828e-01, -1.08174618e+00],
[-1.51030074e-01, 1.44678014e+00, -9.28374347e-01,
-6.37844290e-01, 2.52244523e-01, -1.02626188e-01,
2.67275212e-01, 3.18587325e-01],
[ 1.95535134e+00, -2.30508447e-01, 7.20335993e-01,
6.99974265e-01, -1.17655859e-01, 1.28956521e+00,
7.40759263e-01, 1.51887318e+00],
[-7.52853335e-01, 3.49539777e-02, 1.77803969e+00,
5.98798561e-01, -8.58683710e-01, -4.07932943e-01,
1.52322657e+00, 1.18539682e-01],
[-7.52853335e-01, 1.10687414e-01, -5.86560665e-01,
-1.60765419e+00, 2.16022159e+00, -1.51430138e+00,
-4.70206671e-01, -1.08174618e+00],
[-4.51941705e-01, 6.44192203e-01, -8.55783104e-02,
6.68467286e-01, 1.63832925e+00, -6.58024129e-01,
-6.26216258e-01, -1.81531783e-01],
[-7.52853335e-01, 1.72645615e-02, 2.25233934e+00,
1.11417014e+00, -9.09978036e-01, -2.52490215e-01,
2.39447679e+00, 1.18539682e-01],
[-4.51941705e-01, -1.85119249e+00, -8.55783104e-02,
2.06895366e-01, -8.41178044e-01, -1.31666831e+00,
-1.09109300e+00, -6.81650891e-01],
[ 4.50793187e-01, -6.98977310e-01, 8.14158078e-02,
-9.55295600e-02, 1.38622168e+00, 4.92434448e-01,
-1.17715645e+00, -3.81579426e-01],
[ 1.65443971e+00, -1.07299242e-01, -8.55783104e-02,
3.07703674e-01, 2.04842160e+00, 2.64396286e-01,
-7.99092004e-01, 2.18563503e-01],
[-4.51941705e-01, -3.25285899e-01, 1.58436287e+00,
-1.10361265e+00, -7.98178050e-01, -1.28626322e+00,
-7.03807468e-01, -1.08174618e+00],
[-7.52853335e-01, -9.48104917e-01, -5.86560665e-01,
-1.10361265e+00, -7.03578061e-01, -2.35044131e+00,
-7.46839194e-01, -1.08174618e+00],
[-4.51941705e-01, -8.54682064e-01, -8.55783104e-02,
2.22306154e+00, -9.18578035e-01, 1.03972604e+00,
4.15017404e-01, -6.81650891e-01],
[-7.52853335e-01, -1.44636013e+00, 2.48409926e-01,
1.11417014e+00, -9.18578035e-01, 1.92147360e+00,
1.70289549e+00, 1.85158601e-02],
[ 1.49881556e-01, 6.08942629e-01, 2.48409926e-01,
-4.98762794e-01, 1.11102172e+00, 1.88383565e-01,
-4.82501450e-01, -1.81531783e-01],
[-1.51030074e-01, -6.00772107e-01, -3.64441358e-01,
1.31907842e+00, -5.35540216e-01, 3.69875650e-01,
8.26663419e-01, -6.81650891e-01],
[-1.51030074e-01, -1.35293728e+00, -2.52572429e-01,
5.27874857e-03, -4.97178086e-01, -2.67692759e-01,
1.50679659e-01, -6.81650891e-01],
[ 1.05261645e+00, 1.01377499e+00, -1.42153126e+00,
2.06895366e-01, 9.62218413e-02, -4.80528377e-01,
1.41458575e-01, 7.18682611e-01],
[-4.51941705e-01, 8.58070236e-01, 2.48409926e-01,
-1.30522926e+00, -5.83178076e-01, -1.07342760e+00,
-3.34964104e-01, -4.81603248e-01],
[-7.52853335e-01, -1.41761632e-01, -9.34653910e-01,
4.47045423e-01, -3.90080987e-02, 1.37302248e-01,
7.08250209e-01, -2.81555605e-01],
[-1.05376497e+00, -2.82246258e-01, 1.13123578e+00,
1.75753798e+00, 5.61006819e-01, 1.90209630e+00,
-3.86235565e-02, -8.15079615e-02],
[ 1.65443971e+00, -2.31863046e-01, 8.14158078e-02,
-8.01187720e-01, -9.27178034e-01, -1.95517516e+00,
5.87144307e-01, 1.61889701e+00],
[-1.05376497e+00, 2.97533120e-01, -2.59049008e+00,
5.09320291e-01, 3.60218487e-02, 1.43499219e+00,
5.36673957e+00, 1.18539682e-01],
[-1.05376497e+00, -7.30118261e-01, -5.86560665e-01,
-7.00379411e-01, -4.11178097e-01, -8.90997069e-01,
-2.70416515e-01, -8.81698534e-01],
[ 2.25626297e+00, 3.28674071e-01, 2.48409926e-01,
-3.97954486e-01, -1.70378126e-01, 3.70814095e-01,
4.61740394e-02, 1.81894465e+00],
[-7.52853335e-01, -6.05554457e-01, -9.20548901e-01,
1.61821169e+00, 1.22021838e-01, 2.79598830e-01,
-3.90290609e-01, -7.81674713e-01],
[-1.51030074e-01, 1.63659401e+00, -5.86560665e-01,
-4.98762794e-01, -8.06778048e-01, 5.15606678e-02,
-8.32902646e-01, -5.81627069e-01],
[-7.52853335e-01, 3.90955973e-01, 2.48409926e-01,
-3.97954486e-01, 1.39221836e-01, -1.45349120e+00,
8.79145304e-01, -8.81698534e-01],
[-4.51941705e-01, 4.22096923e-01, -1.08754302e+00,
4.08511983e-01, -3.07978109e-01, -1.25585813e+00,
4.82638687e-01, -7.81674713e-01],
[-4.51941705e-01, 1.72879934e+00, 5.82398162e-01,
1.01730315e+00, 1.02108165e+00, 6.65606193e-01,
2.03816208e-01, -4.81603248e-01],
[-1.05376497e+00, -7.30118261e-01, -5.86560665e-01,
7.10936908e-01, -8.58378042e-01, -9.24950875e-03,
-9.82896117e-02, -9.81722356e-01],
[-7.52853335e-01, -1.47750108e+00, 2.48409926e-01,
-1.91007911e+00, -8.92778038e-01, -5.56541098e-01,
-4.60368017e-02, -9.81722356e-01],
[ 4.06173275e+00, 1.10719784e+00, 8.14158078e-02,
1.11417014e+00, -4.28378095e-01, 1.10053621e+00,
8.45334663e-01, 1.51887318e+00],
[ 4.50793187e-01, -8.20424263e-01, 7.30587152e-02,
6.20218349e-01, -6.02670899e-01, 7.83683020e-01,
1.04702103e+00, -2.81555605e-01],
[-7.52853335e-01, -9.79245868e-01, -5.86560665e-01,
-2.97146177e-01, -6.60578066e-01, -7.00596853e-02,
-7.77576141e-01, -1.08174618e+00],
[ 4.50793187e-01, 1.72940589e-01, -1.85867210e-01,
-5.79297849e-01, -3.16601908e-01, -9.50193975e-03,
-1.98675226e-01, -3.81579426e-01],
[-4.51941705e-01, -8.85823015e-01, -9.20548901e-01,
-1.30522926e+00, -3.27781429e-02, 4.46826816e-01,
-2.73490210e-01, -1.08174618e+00],
[-1.51030074e-01, 1.31744991e+00, 3.97215936e-02,
1.81857230e-01, -2.50998921e-01, -2.02889104e-01,
-1.00547724e+00, -5.81627069e-01],
[ 7.51704817e-01, 4.80485095e-01, 9.16386399e-01,
8.40505369e-02, 1.13897478e+00, 5.53710744e-02,
-2.10861595e-01, 2.61913522e+00],
[ 2.85808623e+00, 7.64647384e-01, 1.58436287e+00,
3.07703674e-01, -1.15937801e+00, -1.04302251e+00,
5.80996918e-01, 1.11877790e+00],
[-1.05376497e+00, 7.64647384e-01, 9.16386399e-01,
9.12553525e-01, 9.30421740e-01, 1.19175148e+00,
-8.35976341e-01, -4.81603248e-01],
[ 7.51704817e-01, 1.07805446e+00, -1.63719359e-01,
-9.68566330e-01, -3.97125460e-01, -7.47234882e-01,
-7.58704383e-01, 4.18611146e-01],
[ 1.05261645e+00, 7.63702230e-01, 6.51794894e-01,
3.32572255e-01, 3.97221805e-01, 1.33088143e+00,
-1.31365881e-01, 3.18587325e-01],
[-1.05376497e+00, 1.88572162e+00, 9.16386399e-01,
-1.60765419e+00, 1.82221831e-01, -2.52490215e-01,
4.30385877e-01, -9.81722356e-01],
[ 3.15899786e+00, -8.54682064e-01, 5.82398162e-01,
-4.98762794e-01, 1.73621832e-01, 4.46826816e-01,
-3.99511693e-01, 1.41884936e+00],
[ 1.05261645e+00, -5.41580895e-01, 8.69969732e-01,
5.71156182e-01, 1.34468294e-01, 6.08377237e-02,
-4.81403495e-01, 2.18563503e-01],
[-7.52853335e-01, -7.61259212e-01, 7.49392281e-01,
-1.91007911e+00, -7.03578061e-01, -2.16801078e+00,
-1.56689811e-01, -9.81722356e-01],
[-4.51941705e-01, 1.44974830e+00, 1.41736875e+00,
7.10936908e-01, -3.76778101e-01, 1.64782780e+00,
3.19732868e-01, -7.81674713e-01],
[ 1.65443971e+00, 5.46660727e-01, 1.41736875e+00,
4.08511983e-01, 1.02218518e-02, -5.10933466e-01,
7.03944706e-01, 2.11901611e+00],
[-1.51030074e-01, 4.84055124e-02, 1.75135699e+00,
1.92063661e+00, -7.57781377e-02, 4.16421727e-01,
1.30946256e+00, 1.85158601e-02],
[ 4.50793187e-01, 1.15232902e+00, 8.83710575e-01,
7.85444933e-01, 1.37319717e+00, 5.44320556e-01,
2.69637780e-01, 6.18658790e-01],
[-7.52853335e-01, -7.61259212e-01, -3.42546067e+00,
8.11745217e-01, -6.94978062e-01, 1.46539727e+00,
-1.10338778e+00, 1.18539682e-01],
[-7.52853335e-01, -1.35293728e+00, -5.86560665e-01,
-7.00379411e-01, -4.19778096e-01, 4.92434448e-01,
-2.18163705e-01, -3.81579426e-01],
[-1.05376497e+00, 5.15287045e-02, -6.58278513e-01,
2.45448380e-01, 1.84174405e-02, 1.18899779e-01,
-5.57138587e-01, -4.81603248e-01],
[-1.51030074e-01, -9.79245868e-01, -1.25453714e+00,
4.08511983e-01, -4.19778096e-01, -1.36227594e+00,
1.23569389e+00, 7.18682611e-01],
[ 1.35352808e+00, 1.82343972e+00, 1.58436287e+00,
5.09320291e-01, 5.26221789e-01, 1.27573389e-01,
-3.65701051e-01, 5.18634968e-01],
[-7.52853335e-01, 1.72645615e-02, 1.41736875e+00,
9.12553525e-01, -4.62778090e-01, 4.31624271e-01,
1.58302139e+00, 5.18634968e-01],
[-1.05376497e+00, -6.43743522e-01, -7.40956221e-01,
5.85158214e-02, -7.78866163e-01, 4.12418982e-01,
6.03636347e-01, -7.81674713e-01],
[ 1.35352808e+00, -1.07299242e-01, 4.15404044e-01,
-5.99571103e-01, 3.75122140e+00, -7.54174172e-01,
4.45754351e-01, 2.01899229e+00],
[ 1.05261645e+00, 7.02365482e-01, -4.19566547e-01,
1.21497845e+00, 1.53242167e+00, 1.57978477e-01,
5.41038887e-01, 1.01875408e+00],
[ 4.50793187e-01, 1.87064730e+00, -4.44943227e-01,
3.71820668e-01, 1.38379064e+00, -2.98905238e-01,
1.70402495e-01, -2.81555605e-01],
[-7.52853335e-01, -7.61582913e-02, -1.75551949e+00,
1.01336183e+00, 2.74218497e-02, -5.48571412e-02,
1.29102039e+00, -3.81579426e-01],
[-7.52853335e-01, 3.59815022e-01, -2.08950773e+00,
-1.10361265e+00, -6.94978062e-01, -7.54174172e-01,
3.44322426e-01, -9.81722356e-01],
[-7.52853335e-01, 4.84378825e-01, 2.48409926e-01,
-8.01187720e-01, -8.84178039e-01, -1.13423778e+00,
-8.79008067e-01, -1.08174618e+00],
[-4.51941705e-01, 4.84055124e-02, 2.48409926e-01,
-3.97954486e-01, 3.54221810e-01, -7.00596853e-02,
1.50679659e-01, -6.81650891e-01],
[-7.52853335e-01, -1.41521918e+00, -5.86560665e-01,
-1.70846250e+00, -5.91778075e-01, -1.89436498e+00,
-3.90290609e-01, -8.81698534e-01],
[ 1.49881556e-01, 8.26929286e-01, 8.14158078e-02,
-9.55295600e-02, -3.25178107e-01, -3.58908024e-01,
-6.26965101e-01, 5.18634968e-01],
[ 1.49881556e-01, -8.26522668e-01, 4.78174820e-01,
-9.61694632e-01, 3.81757786e-03, -3.05052276e-01,
3.63318182e-02, 9.18730254e-01],
[ 1.49881556e-01, 1.29987488e-01, -3.24069448e-01,
-4.12442465e-01, -3.52188165e-01, -3.86944987e-01,
-4.72552438e-01, -3.81579426e-01],
[ 4.50793187e-01, 3.59815022e-01, -5.86560665e-01,
5.09320291e-01, -2.04778122e-01, -7.69376716e-01,
-4.02585387e-01, -5.81627069e-01],
[-7.52853335e-01, 1.66773496e+00, 5.82398162e-01,
1.21497845e+00, 1.11102172e+00, 9.63713317e-01,
2.20083403e+00, -9.81722356e-01],
[ 1.65443971e+00, 8.89211187e-01, 1.25037464e+00,
-1.96337868e-01, -7.57781377e-02, 9.71683003e-02,
1.98874909e+00, 1.01875408e+00],
[-1.51030074e-01, -1.47750108e+00, 9.16386399e-01,
1.06087057e-01, -8.06778048e-01, 8.19657561e-02,
2.30533965e+00, -4.81603248e-01],
[-4.51941705e-01, -6.05554457e-01, -7.53554783e-01,
-2.01088742e+00, 9.82021733e-01, -1.27106067e+00,
1.04205112e+00, -9.81722356e-01],
[-4.51941705e-01, -9.77999368e-01, -6.60036396e-01,
-7.85059912e-01, -6.37874047e-01, -1.67659655e-01,
6.56550727e-01, -4.81603248e-01],
[-1.51030074e-01, -1.22837348e+00, -1.08754302e+00,
-1.91007911e+00, -9.44378032e-01, -1.34707339e+00,
-8.45197425e-01, -9.81722356e-01],
[-1.05376497e+00, 2.16174861e+00, 5.47825560e-02,
4.35973282e-01, 8.25479658e-01, 7.46789236e-01,
5.28443906e-01, -3.81579426e-01],
[ 2.85808623e+00, 1.81931042e-01, -8.55783104e-02,
5.77602572e-02, -4.52175775e-01, 2.97934159e-01,
-1.15618953e+00, 1.01875408e+00],
[-7.52853335e-01, -1.22837348e+00, 5.82398162e-01,
-9.55295600e-02, -7.55178055e-01, -2.52490215e-01,
-5.43975344e-01, -2.81555605e-01],
[-7.52853335e-01, -3.25285899e-01, 1.41736875e+00,
-5.99571103e-01, -1.61778127e-01, 1.27573389e-01,
-4.27174945e-01, 8.18706433e-01],
[-1.05376497e+00, -1.07266872e+00, -9.20548901e-01,
-4.98762794e-01, -6.17578072e-01, -7.54174172e-01,
-3.06683282e-02, -9.81722356e-01],
[-1.51030074e-01, -8.54682064e-01, -2.52572429e-01,
-7.00379411e-01, -7.12178060e-01, -3.13300392e-01,
1.25106236e+00, -3.81579426e-01],
[ 2.55717460e+00, 7.33506433e-01, -8.55783104e-02,
1.01336183e+00, 9.21821741e-01, 1.23735911e+00,
6.14807560e-01, 6.18658790e-01],
[-4.51941705e-01, 4.53237874e-01, 9.16386399e-01,
-1.20442095e+00, -8.58378042e-01, -1.36227594e+00,
6.73207759e-01, -1.08174618e+00],
[-7.52853335e-01, -5.74413506e-01, -9.20548901e-01,
-2.21250404e+00, 1.56421834e-01, -1.25585813e+00,
1.24491497e+00, -1.08174618e+00],
[-1.51030074e-01, 1.72645615e-02, 8.14158078e-02,
-4.98762794e-01, 2.25221826e-01, -1.91680038e-01,
2.15844818e-02, -4.81603248e-01],
[-7.52853335e-01, -8.54682064e-01, -4.19566547e-01,
-9.55295600e-02, 2.76821819e-01, -2.52490215e-01,
-3.01153462e-01, 1.01875408e+00],
[ 7.51704817e-01, 5.15519776e-01, 8.14158078e-02,
-2.97146177e-01, 5.52021786e-01, 3.63581237e-02,
-8.82081761e-01, 8.18706433e-01],
[-7.52853335e-01, -1.25951443e+00, 5.82398162e-01,
-2.97146177e-01, -1.13357801e+00, 1.42775933e-01,
-1.35543075e+00, -9.81722356e-01],
[ 7.51704817e-01, -5.74413506e-01, -1.34144309e-02,
9.86514154e-01, -3.81056003e-01, -4.35221769e-01,
1.59219664e-01, -3.81579426e-01],
[ 4.50793187e-01, -1.38440193e-01, 2.48409926e-01,
1.01336183e+00, -7.46578056e-01, 6.67632120e-02,
-8.39050035e-01, -3.81579426e-01]]),
240 0
152 1
428 1
445 1
372 0
..
75 0
466 1
299 0
493 1
168 0
Name: class, Length: 419, dtype: int64,
55 1
331 1
160 0
221 0
439 1
..
199 0
41 0
109 0
485 1
335 0
Name: class, Length: 105, dtype: int64]
# Store the four splits in variables
X_train, X_test, y_train, y_test = train_test_split( X, y , test_size= 0.2, random_state=2 )
# Create the model used for prediction
from sklearn.linear_model import LogisticRegression
# Create an untrained LogisticRegression classifier
classifier = LogisticRegression()
# Train on the training data only (.fit)
classifier.fit(X_train, y_train)
# Predict on the held-out X_test (.predict)
classifier.predict( X_test )
array([1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0], dtype=int64)
# Look at the class probabilities instead of the hard labels (.predict_proba)
classifier.predict_proba(X_test)
array([[0.05222541, 0.94777459],
[0.11605631, 0.88394369],
[0.63700128, 0.36299872],
[0.83052528, 0.16947472],
[0.59207874, 0.40792126],
[0.42025592, 0.57974408],
[0.90931087, 0.09068913],
[0.81084452, 0.18915548],
[0.35558493, 0.64441507],
[0.44843601, 0.55156399],
[0.39815681, 0.60184319],
[0.92707468, 0.07292532],
[0.11132818, 0.88867182],
[0.88910844, 0.11089156],
[0.85013329, 0.14986671],
[0.12538946, 0.87461054],
[0.17940585, 0.82059415],
[0.50770797, 0.49229203],
[0.85703651, 0.14296349],
[0.53347169, 0.46652831],
[0.43599634, 0.56400366],
[0.97117043, 0.02882957],
[0.78015126, 0.21984874],
[0.53529107, 0.46470893],
[0.9006661 , 0.0993339 ],
[0.95881574, 0.04118426],
[0.58447275, 0.41552725],
[0.52381334, 0.47618666],
[0.41482745, 0.58517255],
[0.57627107, 0.42372893],
[0.87272256, 0.12727744],
[0.14814786, 0.85185214],
[0.51649563, 0.48350437],
[0.47313965, 0.52686035],
[0.42611047, 0.57388953],
[0.5575374 , 0.4424626 ],
[0.03603706, 0.96396294],
[0.86972787, 0.13027213],
[0.16185158, 0.83814842],
[0.6975837 , 0.3024163 ],
[0.16247656, 0.83752344],
[0.66723503, 0.33276497],
[0.53690716, 0.46309284],
[0.11724695, 0.88275305],
[0.76838179, 0.23161821],
[0.93975628, 0.06024372],
[0.02570856, 0.97429144],
[0.53847368, 0.46152632],
[0.86447721, 0.13552279],
[0.48291809, 0.51708191],
[0.79448456, 0.20551544],
[0.28831743, 0.71168257],
[0.20759439, 0.79240561],
[0.21777607, 0.78222393],
[0.34080765, 0.65919235],
[0.28658669, 0.71341331],
[0.15100284, 0.84899716],
[0.21902747, 0.78097253],
[0.50242181, 0.49757819],
[0.66480633, 0.33519367],
[0.95087155, 0.04912845],
[0.10503121, 0.89496879],
[0.1937405 , 0.8062595 ],
[0.38877472, 0.61122528],
[0.14896837, 0.85103163],
[0.40624769, 0.59375231],
[0.83190782, 0.16809218],
[0.57135648, 0.42864352],
[0.71870423, 0.28129577],
[0.10205759, 0.89794241],
[0.34353325, 0.65646675],
[0.64469831, 0.35530169],
[0.52147373, 0.47852627],
[0.16248731, 0.83751269],
[0.12033401, 0.87966599],
[0.39117792, 0.60882208],
[0.49433721, 0.50566279],
[0.71563406, 0.28436594],
[0.61285689, 0.38714311],
[0.96545934, 0.03454066],
[0.30554451, 0.69445549],
[0.7257715 , 0.2742285 ],
[0.56977791, 0.43022209],
[0.52689497, 0.47310503],
[0.07670217, 0.92329783],
[0.10641807, 0.89358193],
[0.79014557, 0.20985443],
[0.85495167, 0.14504833],
[0.76739926, 0.23260074],
[0.93914653, 0.06085347],
[0.062172 , 0.937828 ],
[0.27151662, 0.72848338],
[0.88906408, 0.11093592],
[0.65047503, 0.34952497],
[0.88853887, 0.11146113],
[0.71506858, 0.28493142],
[0.08330045, 0.91669955],
[0.67634886, 0.32365114],
[0.83441148, 0.16558852],
[0.60488491, 0.39511509],
[0.72362065, 0.27637935],
[0.33912939, 0.66087061],
[0.91664381, 0.08335619],
[0.68747058, 0.31252942],
[0.58689496, 0.41310504]])
# Store the predictions in a variable
y_pred = classifier.predict( X_test )
# To compare predictions with reality, convert the true test labels
# (y_test) to a DataFrame
df_test = y_test.to_frame()
# Add the model's predictions as a new column next to the true values
df_test['y_pred'] = y_pred
df_test
## A Confusion Matrix is needed to summarise the classification results ##
from sklearn.metrics import confusion_matrix, accuracy_score
# Final evaluation: compare held-out test labels against the predictions
confusion_matrix(y_test, y_pred)
array([[40, 8],
[20, 37]], dtype=int64)
# Keep the confusion matrix for the heatmap below
cm = confusion_matrix(y_test, y_pred)
cm
array([[40, 8],
[20, 37]], dtype=int64)
# Compute the accuracy
accuracy_score(y_test, y_pred)
0.7333333333333333
ㄴ 대략 73% 높은 퍼센트율은 아님 (실무에서는 90% 이상이어야 그나마 사용 가능)
# Full per-class report (precision / recall / f1-score / support)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.67 0.83 0.74 48
1 0.82 0.65 0.73 57
accuracy 0.73 105
macro avg 0.74 0.74 0.73 105
weighted avg 0.75 0.73 0.73 105
# Visualize the confusion matrix as a heatmap
import seaborn as sb
# fmt='d' renders the cell counts as plain integers; the default
# fmt='.2g' can switch to scientific notation for larger counts.
sb.heatmap(data = cm, cmap = 'RdPu' , annot= True, fmt='d')
plt.show()
# Coefficients of the fitted logistic-regression equation (used for classification)
classifier.coef_
array([[ 0.17179715, 1.1977546 , -0.21976933, 0.09019753, -0.14669847,
0.55918106, 0.31306116, 0.40475482]])
# Intercept (bias) term of the fitted model
classifier.intercept_
array([-0.0293159])