# 공통
from warnings import simplefilter
import numpy as np
import os
import pandas as pd
import sklearn.linear_model
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import Imputer
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Silence FutureWarning messages for the rest of the session.
simplefilter(action='ignore', category=FutureWarning)

# Seed NumPy's global pseudo-random generator so repeated runs give the same output.
np.random.seed(42)

# "%matplotlib inline" is IPython-only syntax and stops this file from parsing
# as plain Python. Invoking the magic through get_ipython() keeps the inline
# backend inside a notebook and is skipped when no IPython shell exists.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass  # not running under IPython

# Register the font stored at the Windows path below (presumably so Korean
# text renders in figures -- confirm). Guarded so a machine without that
# file keeps matplotlib's default font instead of crashing.
_MALGUN_FONT_PATH = "c:/Windows/Fonts/malgun.ttf"
if os.path.exists(_MALGUN_FONT_PATH):
    font_name = font_manager.FontProperties(fname=_MALGUN_FONT_PATH).get_name()
    rc('font', family=font_name)
else:
    font_name = None  # font file missing: default font stays in effect

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['axes.unicode_minus'] = False

# Folder where figures are saved.
PROJECT_ROOT_DIR = "C:/Users/Admin/Desktop/ML/"
# PROJECT_ROOT_DIR = "C:/Users/User/Desktop/ML/"
# PROJECT_ROOT_DIR = "C:/Users/sally/Dropbox/2019-Fall-Semester/ML"
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as ``<IMAGES_PATH>/<fig_id>.png``.

    Args:
        fig_id: File name (without extension) of the image to write.
        tight_layout: When true, ``plt.tight_layout()`` is applied before saving.
    """
    # savefig() does not create missing directories, so make sure the target exists.
    os.makedirs(IMAGES_PATH, exist_ok=True)
    path = os.path.join(IMAGES_PATH, fig_id + ".png")
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
# Directory holding the dataset; the empty last component leaves a trailing separator.
housing_path = os.path.join("datasets", "housing", "")
housing_csv_file = os.path.join(housing_path, "housing.csv")
housing = pd.read_csv(housing_csv_file)

# First look at the data: sample rows, dtypes, category counts, summary statistics.
housing.head()
housing.info()
housing["ocean_proximity"].value_counts()
housing.describe()

# One histogram per numeric attribute, written to disk.
housing.hist(figsize=(20, 15), bins=50)
save_fig("attribute_histogram_plots")
# Split arrays or matrices into random train and test subsets.
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
# Purely random split: 20% of the rows go to the test set, with a fixed seed.
train_set, test_set = train_test_split(housing, random_state=42, test_size=0.2)
test_set.head()

# Divide the median income by 1.5 to limit the number of income categories,
# then round up to obtain the category number.
scaled_median_income = housing["median_income"] / 1.5
housing["income_cat"] = np.ceil(scaled_median_income)
housing[["median_income", "income_cat"]]
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.where.html
# Label every category of 5 or above as 5.
# The result is assigned back to the column: calling where(..., inplace=True)
# on housing["income_cat"] works on an intermediate object and is not
# guaranteed to update `housing` (it does not under pandas copy-on-write).
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)
housing[["median_income", "income_cat"]]
housing["income_cat"].value_counts()
housing["income_cat"].hist()
save_fig('income_category_hist')
# Single 80/20 split that preserves the income_cat proportions in both parts.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    print(train_index)
    # split() yields positional row numbers, so rows are selected with iloc;
    # loc only gives the same rows while the index is the default 0..n-1 range.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
strat_train_set.head()
strat_test_set["income_cat"].value_counts(normalize=True)
def income_cat_proportions(data):
    """Return the relative frequency of each ``income_cat`` value in *data*.

    Args:
        data: Object indexable by ``"income_cat"`` whose result supports
            ``value_counts`` (a DataFrame in this file).

    Returns:
        The result of ``value_counts(normalize=True)`` on that column.
    """
    return data["income_cat"].value_counts(normalize=True)
# Redo the random split now that `housing` carries the income_cat column.
train_set, test_set = train_test_split(housing, random_state=42, test_size=0.2)

# Income-category proportions in the full data and in each kind of test set.
proportions_by_source = {
    "Overall": income_cat_proportions(housing),
    "Stratified": income_cat_proportions(strat_test_set),
    "Random": income_cat_proportions(test_set),
}
compare_props = pd.DataFrame(proportions_by_source).sort_index()
compare_props

# Relative deviation (in percent) of each test set from the overall proportions.
overall_props = compare_props["Overall"]
compare_props["Rand. %error"] = 100 * (compare_props["Random"] - overall_props) / overall_props
compare_props["Strat. %error"] = 100 * (compare_props["Stratified"] - overall_props) / overall_props
compare_props
# Drop specified labels from rows or columns.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html
# Remove the helper column income_cat from both stratified sets.
# inplace=True is required here: rebinding the loop variable would not
# change strat_train_set / strat_test_set themselves.
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)
strat_train_set.info()

# Continue on a copy so the stratified training set itself stays untouched.
housing = strat_train_set.copy()
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html
# Plain longitude/latitude scatter plot of every row.
housing.plot(x="longitude", y="latitude", kind="scatter")
save_fig("bad_visualization_plot")

# Same axes with transparency; marker size comes from population / 100 and
# marker colour from median_house_value through the "jet" colour map.
population_marker_size = housing["population"] / 100
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=population_marker_size,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    sharex=False,
)
save_fig("better_visualization_plot")
# Correlate the numeric columns only: `housing` still contains the text
# column ocean_proximity, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 (older versions silently skipped such columns, so the result
# is unchanged there).
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix
corr_matrix["median_house_value"].sort_values(ascending=False)
# Attributes shown against one another in the scatter matrix.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
selected_attributes = housing[attributes]
scatter_matrix(selected_attributes, figsize=(12, 8))
save_fig("scatter_matrix_plot")

# Close-up of median income against median house value.
housing.plot(x="median_income", y="median_house_value", kind="scatter", alpha=0.1)
save_fig("income_vs_house_value_scatterplot")
# Ratio features derived from the raw count columns.
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

# Recompute the correlations with the new columns included. Only numeric
# columns are used: DataFrame.corr() raises on the text column
# ocean_proximity in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

housing.plot(kind="scatter", x="rooms_per_household", y="median_house_value",
             alpha=0.2)
# plt.axis([0, 5, 0, 520000])
plt.show()
housing.describe()
# Keep the target values as the labels ...
housing_labels = strat_train_set["median_house_value"].copy()
# ... and drop that column from the training predictors (drop returns a new frame).
housing = strat_train_set.drop(columns="median_house_value")
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.any.html
# Remove missing values.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html
# Fill NA/NaN values using the specified method.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html
# A few rows that contain at least one missing value. The explicit copy makes
# this an independent frame, so option 3 below can modify it safely.
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head().copy()
sample_incomplete_rows
sample_incomplete_rows.dropna(subset=["total_bedrooms"])  # option 1: remove the affected rows
# sample_incomplete_rows.dropna()  # option 1: remove the affected rows
sample_incomplete_rows.drop("total_bedrooms", axis=1)  # option 2: drop the whole attribute
median = housing["total_bedrooms"].median()
# Option 3: fill with the median. The filled column is assigned back:
# fillna(inplace=True) on a selected column works on an intermediate object
# and is not guaranteed to update the frame (it does not under pandas
# copy-on-write).
sample_incomplete_rows["total_bedrooms"] = sample_incomplete_rows["total_bedrooms"].fillna(median)
sample_incomplete_rows
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
# Copy of the predictors without the ocean_proximity column, used to fit the median imputer.
housing_num = housing.drop(columns=['ocean_proximity'])
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)
imputer.statistics_
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.median.html
# https://pandas.pydata.org/docs/reference/api/pandas.Series.values.html
# Column medians computed with pandas, for comparison with imputer.statistics_.
housing_num.median().values

X = imputer.transform(housing_num)  # NumPy array containing the transformed features
X

# Wrap the array in a DataFrame again, restoring the column names and row labels.
housing_tr = pd.DataFrame(
    X,
    index=housing.index.tolist(),
    columns=housing_num.columns,
)
# https://pandas.pydata.org/pandas-docs/version/0.23/generated/pandas.DataFrame.loc.html
# Look up the previously incomplete rows in the imputed frame.
incomplete_row_labels = sample_incomplete_rows.index.values
housing_tr.loc[incomplete_row_labels]

# The categorical attribute, first as text ...
housing_cat = housing['ocean_proximity']
housing_cat.head(10)

# ... then as integer codes together with the list of distinct categories.
housing_cat_encoded, housing_categories = pd.factorize(housing_cat)
housing_cat_encoded[:10]
housing_categories
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
encoder = OneHotEncoder(categories='auto')

# One-hot encode the integer codes. reshape(-1, 1): the row count is
# inferred, the column count is 1.
integer_code_column = housing_cat_encoded.reshape(-1, 1)
housing_cat_1hot = encoder.fit_transform(integer_code_column)
housing_cat_1hot
housing_cat_1hot.toarray()

# Same encoding applied directly to the text categories.
housing_cat_reshaped = housing_cat.values.reshape(-1, 1)  # text categories
housing_cat_reshaped
housing_cat_1hot = encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot
housing_cat_1hot.toarray()
encoder.categories_
# Column positions, in the array handed to CombinedAttributesAdder.transform,
# of the attributes used to build the ratio features.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Appended columns, in order: rooms per household, population per
    household and, when ``add_bedrooms_per_room`` is true, bedrooms per room.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is indexed as ``X[:, i]`` with the module-level ``*_ix``
        positions, so it must be a 2-D array with at least 7 columns.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
# Apply the transformer without the bedrooms_per_room column to the raw values.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
housing_extra_attribs

# Put the result back into a DataFrame, naming the two appended columns.
extended_column_names = list(housing.columns) + ["rooms_per_household", "population_per_household"]
housing_extra_attribs = pd.DataFrame(housing_extra_attribs, columns=extended_column_names)
housing_extra_attribs.head()
# Numeric preprocessing steps: median imputation, ratio features, standardisation.
numeric_steps = [
    # ('imputer', Imputer(strategy="median")),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects the given columns and returns their ``.values``."""

    def __init__(self, attribute_names):
        # Column labels used to index the input in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names].values``."""
        return X[self.attribute_names].values
# Small example frame, used to inspect the type returned by DataFrame.values.
df = pd.DataFrame(
    [[3, 94, 31], [29, 170, 115]],
    columns=['age', 'height', 'weight'],
)
type(df.values)
# Column names handled by the numeric and by the categorical pipeline.
num_attribs = housing_num.columns.tolist()
num_attribs
cat_attribs = ["ocean_proximity"]

# Numeric branch: select columns, impute medians, add ratio features, standardise.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    # ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select the text column and one-hot encode it with the
# module-level `encoder` instance.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', encoder),
    # ('cat_encoder', OneHotEncoder(categories = 'auto')),
    # ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
])
# Run both branches and concatenate their outputs column-wise.
branch_pipelines = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
]
full_pipeline = FeatureUnion(transformer_list=branch_pipelines)

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
housing_prepared.shape
housing_prepared.toarray()
# Train a linear regression model on the prepared training data.
lin_reg = LinearRegression()
lin_reg.fit(X=housing_prepared, y=housing_labels)
# https://pandas.pydata.org/pandas-docs/version/0.23/generated/pandas.DataFrame.iloc.html
# Apply the full pipeline to a few training samples and compare predictions with labels.
some_data, some_labels = housing.iloc[:5], housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("예측:", lin_reg.predict(some_data_prepared))
print("레이블:", list(some_labels))
some_data_prepared

# Root mean squared error of the linear model on the whole training set.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
# Decision tree regressor, fitted and evaluated on the same training data.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X=housing_prepared, y=housing_labels)

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
# 10-fold cross-validation of the linear model. The scorer reports negated
# MSE, so the sign is flipped before taking the square root.
lin_scores = cross_val_score(
    lin_reg,
    housing_prepared,
    housing_labels,
    cv=10,
    scoring="neg_mean_squared_error",
)
lin_rmse_scores = np.sqrt(np.negative(lin_scores))
def display_scores(scores):
    """Print *scores* followed by their mean and standard deviation.

    Args:
        scores: Object providing ``mean()`` and ``std()`` (a NumPy array in
            this file).
    """
    print("점수:", scores)
    print("평균:", scores.mean())
    print("표준편차:", scores.std())
display_scores(lin_rmse_scores)

# Same 10-fold cross-validation for the decision tree.
scores = cross_val_score(
    tree_reg,
    housing_prepared,
    housing_labels,
    cv=10,
    scoring="neg_mean_squared_error",
)
tree_rmse_scores = np.sqrt(np.negative(scores))
display_scores(tree_rmse_scores)
# Random forest with 10 trees: training-set RMSE first ...
forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(X=housing_prepared, y=housing_labels)

housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

# ... then 10-fold cross-validated RMSE.
forest_scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_labels,
    cv=10,
    scoring="neg_mean_squared_error",
)
forest_rmse_scores = np.sqrt(np.negative(forest_scores))
display_scores(forest_rmse_scores)
# First grid: 12 (=3x4) hyperparameter combinations.
default_bootstrap_grid = {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}
# Second grid: bootstrap set to False, 6 (=2x3) combinations.
no_bootstrap_grid = {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
param_grid = [default_bootstrap_grid, no_bootstrap_grid]

forest_reg = RandomForestRegressor(random_state=42)

# With five folds this amounts to (12+6)*5=90 rounds of training in total.
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
)
grid_search.fit(housing_prepared, housing_labels)
grid_search.best_params_
grid_search.best_estimator_
# Print the RMSE of every tested combination; the stored mean test score is
# the negated MSE, hence the sign flip before the square root.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
best_forest = grid_search.best_estimator_
feature_importances = best_forest.feature_importances_
feature_importances

# Names for the prepared columns: numeric attributes, the three ratio
# features, then the one-hot categories read from the fitted encoder.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
# cat_one_hot_attribs = list(cat_encoder.categories_[0])
cat_one_hot_attribs = list(encoder.categories_[0])
cat_one_hot_attribs
attributes = [*num_attribs, *extra_attribs, *cat_one_hot_attribs]

# (importance, name) pairs, most important first.
importance_name_pairs = zip(feature_importances, attributes)
sorted(importance_name_pairs, reverse=True)
# Evaluate the best estimator found by the grid search on the stratified test set.
final_model = grid_search.best_estimator_

y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")

# transform() only: the pipeline is not re-fitted on the test data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse