[AI 과제] 회귀 및 분류 모델을 통한 데이터 분석 및 시각화

2024. 11. 27. 10:11 · 컴퓨터비전&AI

반응형

 

<회귀 모델 학습 및 비교를 통한 데이터 분석과 시각화>

 

import sklearn

import numpy as np

import numpy.random as rnd

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

from sklearn.linear_model import SGDRegressor

from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG to make this notebook's output stable across runs.
np.random.seed(42)

# To plot pretty figures.
# NOTE: the next line is an IPython/Jupyter magic, not Python syntax; it only
# works when this cell is executed inside a notebook.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and for x/y tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

def plot_generated_data(X, y):
    """Scatter-plot the samples ``(X, y)`` as blue dots and show the figure.

    ``X`` goes on the horizontal axis (labelled ``$x_1$``) and ``y`` on the
    vertical axis (labelled ``$y$``, drawn unrotated).
    """
    axis_label_size = 18
    plt.plot(X, y, "b.")
    plt.xlabel("$x_1$", fontsize=axis_label_size)
    plt.ylabel("$y$", rotation=0, fontsize=axis_label_size)
    plt.show()

 

def plot_model(model_class, polynomial, alphas, **model_kargs):
    """Fit one model per regularization strength and overlay the fitted curves.

    Reads the module-level ``X`` and ``y`` (training data) and ``X_new`` (the
    x-values at which each fitted model is evaluated). Draws on the current
    matplotlib axes and does not call ``plt.show()``.

    Args:
        model_class: Regressor class, instantiated as
            ``model_class(alpha, **model_kargs)`` when ``alpha > 0``.
        polynomial: If true, the regressor is wrapped in a pipeline of
            degree-10 polynomial features (no bias column) and a
            ``StandardScaler``.
        alphas: Regularization strengths. They are zipped with three line
            styles, so at most the first three are drawn. A value that is not
            greater than 0 selects a plain ``LinearRegression`` instead.
        **model_kargs: Extra keyword arguments forwarded to ``model_class``.
    """
    line_styles = ("b-", "g--", "r:")
    for alpha, style in zip(alphas, line_styles):
        if alpha > 0:
            regressor = model_class(alpha, **model_kargs)
            width = 2
        else:
            # No regularization requested: fall back to ordinary least squares.
            regressor = LinearRegression()
            width = 1

        if polynomial:
            steps = [
                ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
                ("std_scaler", StandardScaler()),
                ("regul_reg", regressor),
            ]
            regressor = Pipeline(steps)

        regressor.fit(X, y)
        curve = regressor.predict(X_new)
        plt.plot(X_new, curve, style, linewidth=width,
                 label=r"$\alpha = {}$".format(alpha))

    # Training points drawn once, on top of all fitted curves.
    plt.plot(X, y, "b.", linewidth=3)
    plt.legend(loc="upper left", fontsize=15)
    plt.xlabel("$x_1$", fontsize=18)
    plt.axis([0, 3, 0, 4])

def plot_logistic(_new, y_proba):
    """Plot both class-probability curves and mark the decision boundary.

    The training samples are taken from the module-level ``X`` and ``y`` and
    drawn as blue squares (``y == 0``) and green triangles (``y == 1``).

    Args:
        _new: x-values at which ``y_proba`` was evaluated, shape ``(n,)`` or
            ``(n, 1)``. (The parameter name is kept as-is for compatibility;
            it was previously ignored in favour of the global ``X_new``.)
        y_proba: Array of shape ``(n, 2)``; column 0 is plotted as
            "Not Iris virginica" and column 1 as "Iris virginica".

    Raises:
        IndexError: If no entry of ``y_proba[:, 1]`` reaches 0.5.
    """
    # Use the grid that was actually passed in, flattened to 1-D so that the
    # boundary below is a plain scalar. Indexing an (n, 1) grid with [0]
    # yields a length-1 array, and handing that to plt.arrow makes matplotlib
    # build a ragged vertex list (NumPy VisibleDeprecationWarning; newer
    # NumPy releases reject ragged sequences outright).
    x_grid = np.ravel(_new)
    # First grid point whose positive-class probability is at least 0.5.
    decision_boundary = x_grid[y_proba[:, 1] >= 0.5][0]

    plt.figure(figsize=(8, 3))
    plt.plot(X[y==0], y[y==0], "bs")
    plt.plot(X[y==1], y[y==1], "g^")
    # Vertical dotted line at the boundary.
    plt.plot([decision_boundary, decision_boundary], [-1, 2], "k:", linewidth=2)
    plt.plot(x_grid, y_proba[:, 1], "g-", linewidth=2, label="Iris virginica")
    plt.plot(x_grid, y_proba[:, 0], "b--", linewidth=2, label="Not Iris virginica")
    plt.text(decision_boundary+0.02, 0.15, "Decision boundary", fontsize=14, color="k", ha="center")
    # Arrows pointing away from the boundary toward each class's side.
    plt.arrow(decision_boundary, 0.08, -0.3, 0, head_width=0.05, head_length=0.1, fc='b', ec='b')
    plt.arrow(decision_boundary, 0.92, 0.3, 0, head_width=0.05, head_length=0.1, fc='g', ec='g')
    plt.xlabel("Petal width (cm)", fontsize=14)
    plt.ylabel("Probability", fontsize=14)
    plt.legend(loc="center left", fontsize=14)
    plt.axis([0, 3, -0.02, 1.02])
    plt.show()

# Re-seed so the synthetic dataset below is identical on every run.
np.random.seed(42)
m = 20  # number of samples
# m feature values drawn uniformly from [0, 3), as an (m, 1) column.
X = 3 * np.random.rand(m,1)
# Linear target 1 + 0.5 * x plus Gaussian noise divided by 1.5.
y = 1 + 0.5 * X + np.random.randn(m,1)/1.5
# 100 evenly spaced x-values over [0, 3], as a column, for drawing fitted curves.
X_new = np.linspace(0,3,100).reshape(100,1)
plot_generated_data(X,y)

=> Ridge Predict value: [[1.55071465]]

SGD Predict value: [1.47012588]

 

<방법 2>

from sklearn.linear_model import Ridge
# Ridge (L2-regularized) linear regression using the 'cholesky' solver.
ridge_reg = Ridge(alpha = 1, solver = 'cholesky', random_state = 42)
ridge_reg.fit(X,y)
# Predict the target at x = 1.5.
print('Ridge Predict value : ', ridge_reg.predict([[1.5]]))
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
# The same L2 penalty, fitted with stochastic gradient descent instead.
sgd_reg = SGDRegressor(penalty = "l2", max_iter = 1000, tol=1e-3, random_state=42)
# Earlier attempt, kept for reference: it used a classifier and spelled the
# penalty '12' (digits one-two) instead of 'l2' (letter L, digit two).
#sgd_reg = make_pipeline(StandardScaler(), SGDClassifier(max_iter = 1000, tol=1e-3, penalty = '12', random_state = 42))
# Background on the "penalty is not supported" error:
#https://fixexception.com/scikit-learn/penalty-s-is-not-supported/
# SGDRegressor reference (Korean):
#https://runebook.dev/ko/docs/scikit_learn/modules/generated/sklearn.linear_model.sgdregressor
# ravel() flattens y to 1-D before fitting (y is built as an (m, 1) column
# where the synthetic data is generated).
sgd_reg.fit(X, y.ravel())
print('SGD Predict value : ', sgd_reg.predict([[1.5]]))

 

<방법 1>

def plot_model(model_class, polynomial, alphas, **model_kargs):
    """Fit one model per regularization strength and overlay the fitted curves.

    Reads the module-level ``X`` and ``y`` (training data) and ``X_new`` (the
    x-values at which each fitted model is evaluated). Draws on the current
    matplotlib axes and does not call ``plt.show()``.

    Args:
        model_class: Regressor class, instantiated as
            ``model_class(alpha, **model_kargs)`` when ``alpha > 0``.
        polynomial: If true, the regressor is wrapped in a pipeline of
            degree-10 polynomial features (no bias column) and a
            ``StandardScaler``.
        alphas: Regularization strengths. They are zipped with three line
            styles, so at most the first three are drawn. A value that is not
            greater than 0 selects a plain ``LinearRegression`` instead.
        **model_kargs: Extra keyword arguments forwarded to ``model_class``.
    """
    for alpha, style in zip(alphas, ("b-", "g--", "r:")):
        model = model_class(alpha, **model_kargs) if alpha > 0 else LinearRegression()
        if polynomial:
            model = Pipeline([
                ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
                ("std_scaler", StandardScaler()),
                ("regul_reg", model),
            ])
        model.fit(X, y)
        y_new_regul = model.predict(X_new)
        # Thinner line for the unregularized baseline.
        lw = 2 if alpha > 0 else 1
        # "\alpha" (not "|alpha") so the legend renders the Greek letter.
        plt.plot(X_new, y_new_regul, style, linewidth=lw, label=r"$\alpha = {}$".format(alpha))
    # Training points drawn once, on top of all fitted curves.
    plt.plot(X, y, "b.", linewidth=3)
    plt.legend(loc="upper left", fontsize=15)
    plt.xlabel("$x_1$", fontsize=18)
    plt.axis([0, 3, 0, 4])

# Ridge regularization side by side: plain linear model on the left,
# polynomial pipeline on the right, each for the same three alpha values.
ridge_alphas = (0, 10, 100)
plt.figure(figsize=(8, 4))

plt.subplot(1, 2, 1)
plot_model(Ridge, polynomial=False, alphas=ridge_alphas, random_state=42)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.title('Linear Regression')

plt.subplot(1, 2, 2)
plot_model(Ridge, polynomial=True, alphas=ridge_alphas, random_state=42)
plt.title('Polynomial Regression')

plt.show()

 

<방법 2>

plt.figure(figsize=(8, 4))
# One entry per panel: (subplot position, use polynomial features?, title).
panels = (
    (121, False, 'Linear Regression'),      # linear regression with Ridge
    (122, True, 'Polynomial Regression'),   # polynomial regression with Ridge
)
for position, use_polynomial, panel_title in panels:
    plt.subplot(position)
    plot_model(Ridge, polynomial=use_polynomial, alphas=(0, 10, 100), random_state=42)
    plt.title(panel_title)
    if position == 121:
        # Only the left panel carries the y-axis label.
        plt.ylabel("$y$", rotation=0, fontsize=18)
plt.show()

Lab 2

from sklearn.linear_model import Lasso

# Lasso (L1-regularized) linear regression on the synthetic data,
# queried at the single point x = 1.5.
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X, y)
lasso_query = [[1.5]]
print('Lasso Predict value:', lasso_reg.predict(lasso_query))

=> Lasso Predict value: [1.53788174]

# Lasso regularization side by side: plain linear model on the left,
# polynomial pipeline on the right, each for alpha = 0, 0.1 and 1.
plt.figure(figsize=(8, 4))
plt.subplot(121)
plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1), random_state=42)
plt.title('Linear Regression')
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.subplot(122)
plot_model(Lasso, polynomial=True, alphas=(0, 0.1, 1), random_state=42)
# Title spelling fixed ("Ploynomial" -> "Polynomial") to match the Ridge figures.
plt.title('Polynomial Regression')
plt.show()

 

 

 

Lab 3

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
iris = load_iris()
# Get familiar with the iris dataset.
# NOTE(review): "kets" below is a typo for "keys"; the string is left untouched
# so that it still matches the recorded output.
print('Dataset kets: ', iris.keys())
# 4 features: sepal length, sepal width, petal length, petal width
print('Data sample 0: ', iris["data"][0])
# Target values
# 0: Iris-setosa, 1: Iris-versicolor, 2: Iris-Virginica
print('Target : ', iris["target"][0])

Dataset kets: dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

Data sample 0: [5.1 3.5 1.4 0.2]

Target : 0

# Single feature: column 3 (petal width), kept as a 1-D array.
X = iris["data"][:, 3]
# Binary target: 1 where the class is 2 (Iris-Virginica), 0 otherwise.
y = (iris["target"] == 2).astype(int)
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
# reshape(-1, 1) turns the 1-D feature into a single-column 2-D matrix for fit().
log_reg.fit(X.reshape(-1,1),y)
# Create new samples for the model to predict
X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
# Predict target of new samples
y_proba = log_reg.predict_proba(X_new)
# plot_logistic also reads the module-level X and y assigned above.
plot_logistic(X_new, y_proba)

=> /usr/local/lib/python3.7/dist-packages/matplotlib/patches.py:1327: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray. verts = np.dot(coords, M) + (x + dx, y + dy)

 

 

 

## Question 1

Build a logistic regression model to recognize Iris-Versicolor with the following requirements:

* Use 2 features >> sepal width, petal length

* Randomly split Iris data into training - test sets with ratio 8:2 using train_test_split with random_state = 10.

* Report the accuracies of model on training and test set.

**< Tip >**

model.score(data,target) 을 사용해 정확도를 구하시오.

정답 : 학습 정확도는 0.7167이고 Test 정확도는 0.7333입니다.

### 1. 데이터 로드

### 1.1) Iris 데이터 로드

iris = load_iris()

#type(iris["data"][0])

### 1.2) x와 y값 설정.

# Columns 1 and 2 of the data matrix: sepal width and petal length.
X = iris["data"][:, 1:3]
# Binary target: 1 where the class is 1 (Iris-versicolor), 0 otherwise.
y = (iris["target"] == 1).astype(int) # versicolor

### 1.3) Train과 Test 데이터 구별

* train_test_split(feature, target, test_size, random_state)를 사용해서 학습과 테스트 데이터셋을 구별합니다.

from sklearn.model_selection import train_test_split
# 80/20 train/test split; random_state=10 is the seed the assignment asks for.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 10)

#print(X_train)

### 2. 모델 빌드

### 2.1) LogisticRegression모델 빌드

from sklearn.linear_model import LogisticRegression

### 2.2) LogisticRegression 훈련

log_reg = LogisticRegression()
# X_train already has two columns (one per selected feature), so it is passed
# to fit() as-is.
log_reg.fit(X_train, y_train)
# Hard 0/1 predictions on both splits; they are scored further below.
train_pred, test_pred = (log_reg.predict(split) for split in (X_train, X_test))

#print(test_pred)

### 3. 성능 평가

### 3.1) 학습 데이터로 스코어 평가(정확도)

* 학습 정확도는 0.7167이다.

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Unused alternative, kept for reference: 3-fold cross-validated accuracy of an SGD classifier.
#sgd_clf = SGDClassifier(max_iter = 1000, tol = 1e-3, random_state =42)
#cross_val_score(sgd_clf, X_train, y_train, cv= 3, scoring="accuracy")
# Fraction of training samples predicted correctly (displayed as the cell's value in a notebook).
accuracy_score(y_train, train_pred)

### 3.2) 테스트 데이터로 스코어 평가(정확도)

* Test 정확도는 0.7333이다.

# Fraction of held-out test samples predicted correctly.
accuracy_score(y_test, test_pred)

=> 0.7333333333333333

## Question 2

Build a logistic regression model to recognize Iris-Versicolor with the following requirements:

* Use all 4 features

* Randomly split Iris data into training - test sets with ratio 8:2 using train_test_split with random_state = 10.

* Report the accuracies of model on training and test set.

**< Tip >**

model.score(data,target) 을 사용해 정확도를 구하시오.

정답 : 학습 정확도는 0.7083이고 Test 정확도는 0.7667입니다.

# Question 2: the same versicolor-vs-rest task, now using all four features.
iris = load_iris()
X2 = iris["data"][:, :4]
y2 = (iris["target"] == 1).astype(int)

# 80/20 split with the seed required by the assignment.
X_train, X_test, y_train, y_test = train_test_split(
    X2, y2, test_size=0.2, random_state=10
)

log_reg = LogisticRegression()
# X_train already has four columns, so it is passed to fit() as-is.
log_reg.fit(X_train, y_train)

train_pred = log_reg.predict(X_train)
test_pred = log_reg.predict(X_test)

train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)
print('학습정확도 :', train_accuracy, ', 테스트 정확도 : ', test_accuracy)

=>학습정확도 : 0.7083333333333334 , 테스트 정확도 : 0.7666666666666667

### Question **3**

from sklearn.datasets import fetch_openml
# 1) Load MNIST data
# as_frame=False requests array-style data rather than a DataFrame.
mnist = fetch_openml('mnist_784', version = 1, as_frame = False)
# List the fields available on the returned dataset object.
mnist.keys()

=> dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

# Feature matrix and labels as loaded.
X, y = mnist["data"], mnist["target"]
print(X.shape)
print(y.shape)
# Take the first sample and view its flat pixel vector as a 28x28 image.
some_digit= X[0]
some_digit_image = some_digit.reshape(28, 28)
# reshape(rows, cols) changes the layout of the data: https://supermemi.tistory.com/12
plt.imshow(some_digit_image, cmap=mpl.cm.binary)
# imshow fills a grid of pixels of the desired size with the desired colors: https://pyvisuall.tistory.com/78
# Colormap reference: https://matplotlib.org/tutorials/colors/colormaps.html
plt.axis("off") # hide the axes; see https://kongdols-room.tistory.com/83
plt.show()
print('Label of this digit is : ', y[0])
# Cast the labels to unsigned 8-bit integers (the next step does arithmetic on them).
y = y.astype(np.uint8)

(70000, 784) (70000,)

Label of this digit is : 5

 

 

 

 

# Binary target: True for even digits, False for odd ones.
y_even = (y % 2 == 0 )
# First 60000 samples for training, the remainder for testing.
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y_even[:60000], y_even[60000:]
# Divide the pixel values by 255.
X_train = X_train / 255
X_test = X_test / 255
from sklearn.linear_model import SGDClassifier
sgd_mnist = SGDClassifier(max_iter = 1000, tol =1e-3, random_state = 42)
log_mnist = LogisticRegression()
# NOTE(review): sgd_mnist is fitted here but never evaluated in the visible
# code; only log_mnist is scored below.
sgd_mnist.fit(X_train, y_train) # fit on the training data
log_mnist.fit(X_train, y_train)
mnist_test_pred = log_mnist.predict(X_test) # predictions on the test set
# Test-set accuracy of the logistic regression model.
accuracy_score(y_test, mnist_test_pred)

=>0.9009

# NOTE(review): this repeats the log_mnist evaluation of the preceding cell
# verbatim; presumably one of the two was meant to score sgd_mnist -- confirm.
mnist_test_pred = log_mnist.predict(X_test) # predictions on the test set
accuracy_score(y_test, mnist_test_pred)

=>0.9009

 

 

 

 

 

 

반응형