# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Pearson correlation (scipy.stats.stats is a deprecated alias of scipy.stats)
from scipy.stats import pearsonr

# Disable warnings for a cleaner notebook-style run
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn building blocks
# NOTE: the original line ended with a dangling comma
# ("import r2_score, mean_squared_error,") which is a SyntaxError.
from sklearn.base import TransformerMixin
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
# Tips dataset hosted in the pandas repository; the bill total is the target.
csv_url = 'https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/tips.csv'
df = pd.read_csv(csv_url)

# Regression target and feature frame
y = df['total_bill']
X = df.drop('total_bill', axis=1)

# Quick notebook-style peek at the data (no effect when run as a script)
df.head(3)
df.info()
class CategoricalTransformer(TransformerMixin):
    """Cast a fixed set of DataFrame columns to pandas categoricals.

    The categories (and orderedness) seen during ``fit`` are reapplied
    on ``transform`` so unseen frames share the same category mapping.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Record each configured column's categorical accessor."""
        self.cat_map_ = {}
        for col in self.columns:
            self.cat_map_[col] = X[col].astype('category').cat
        return self

    def transform(self, X, y=None):
        """Return a copy of X with the configured columns as categoricals."""
        out = X.copy()
        for col in self.columns:
            meta = self.cat_map_[col]
            out[col] = pd.Categorical(out[col],
                                      categories=meta.categories,
                                      ordered=meta.ordered)
        return out

    def inverse_transform(self, trn, y=None):
        """Undo transform: configured columns go back to plain object dtype."""
        restored = trn.copy()
        restored[self.columns] = restored[self.columns].apply(
            lambda s: s.astype(object))
        return restored
class DummyEncoder(TransformerMixin):
    """One-hot encode categorical columns via pd.get_dummies, invertibly.

    ``fit`` records where each categorical's dummy columns will land in the
    encoded matrix, so ``inverse_transform`` can rebuild the original frame.
    """

    def fit(self, X, y=None):
        """Record column layout and per-column category metadata."""
        self.columns_ = X.columns
        self.cat_cols_ = X.select_dtypes(include=['category']).columns
        self.non_cat_cols_ = X.columns.drop(self.cat_cols_)
        self.cat_map_ = {c: X[c].cat for c in self.cat_cols_}
        # pd.get_dummies lays out the non-categorical columns first, then one
        # dummy column per category; record each categorical's column span.
        self.cat_blocks_ = {}  # {categorical column: slice into the matrix}
        start = len(self.non_cat_cols_)
        for c in self.cat_cols_:
            stop = start + len(self.cat_map_[c].categories)
            self.cat_blocks_[c] = slice(start, stop)
            start = stop
        return self

    def transform(self, X, y=None):
        """Encode to a plain ndarray (numeric columns, then dummies)."""
        return np.asarray(pd.get_dummies(X))

    def inverse_transform(self, trn, y=None):
        """Rebuild a DataFrame from an encoded matrix produced by transform."""
        n_numeric = len(self.non_cat_cols_)
        numeric = pd.DataFrame(trn[:, :n_numeric], columns=self.non_cat_cols_)
        cat_series = []
        for col, span in self.cat_blocks_.items():
            meta = self.cat_map_[col]
            # argmax over a dummy block recovers the category code
            codes = trn[:, span].argmax(1)
            recovered = pd.Categorical.from_codes(codes,
                                                  meta.categories,
                                                  meta.ordered)
            cat_series.append(pd.Series(recovered, name=col))
        # Reassemble and restore the original column order
        return pd.concat([numeric] + cat_series, axis='columns')[self.columns_]
# Notebook-style inspection of the feature columns (no effect as a script)
X.columns.tolist()

# Feature columns that should be treated as categoricals
columns = ['sex', 'smoker', 'day', 'time']

# Pipeline: categorize -> one-hot -> scale -> pairwise interactions -> OLS
pipe = make_pipeline(
    CategoricalTransformer(columns),
    DummyEncoder(),
    StandardScaler(),
    PolynomialFeatures(interaction_only=True),
    LinearRegression(),
)
# Alternative: swap the final estimator for an L1-penalized model:
# pipe = make_pipeline(CategoricalTransformer(columns), DummyEncoder(),
#                      StandardScaler(), PolynomialFeatures(interaction_only=True),
#                      Lasso(alpha=.5))

pipe.fit(X, y)
yhat = pipe.predict(X)
def RegressionPerf(y, yhat):
    """Print regression quality metrics and return them for programmatic use.

    Parameters
    ----------
    y : array-like
        True target values.
    yhat : array-like
        Predicted values, same length as ``y``.

    Returns
    -------
    dict
        Keys ``'mse'``, ``'r2'``, ``'pearson'``, ``'p_value'``.
        (Backward compatible: the function previously returned None and
        its caller ignores the return value.)
    """
    # pearsonr yields (correlation coefficient, two-sided p-value);
    # the original bound the coefficient to a misspelled name "Person".
    pearson, p_value = pearsonr(y, yhat)
    # Compute each metric once instead of inside the f-strings.
    mse = mean_squared_error(y, yhat)
    r2 = r2_score(y, yhat)
    print('=================================')
    print(f' MSE = {mse}')
    print(f' R-Squared = {r2}')
    print(f' Pearson = {pearson}')
    print(f' P-value = {p_value}')
    print('=================================')
    return {'mse': mse, 'r2': r2, 'pearson': pearson, 'p_value': p_value}
# Report fit quality of the regression pipeline
RegressionPerf(y, yhat)

# seaborn >= 0.11 made jointplot's x/y keyword-only (positional args bind to
# `data`), and `size` was renamed `height` in 0.9 — pass keywords explicitly.
sns.jointplot(x=y, y=yhat, kind='reg', height=10)

# Project the one-hot design matrix onto 3 principal components
pipe = make_pipeline(CategoricalTransformer(columns), DummyEncoder(),
                     PCA(n_components=3))
trn = pipe.fit_transform(X)

# Pairwise views of the three components
sns.jointplot(x=trn[:, 0], y=trn[:, 1], kind='kde', height=10)
sns.jointplot(x=trn[:, 0], y=trn[:, 2], height=10)
sns.jointplot(x=trn[:, 1], y=trn[:, 2], kind='hex', height=10)

# Every pipeline step implements inverse_transform, so the original frame
# can be recovered (numeric columns only up to PCA reconstruction error)
original_DF = pipe.inverse_transform(trn)
original_DF.head(5)