Ibrahim AbuAlhaol, PhD, P.Eng., SMIEEE, Data Scientist, Larus Technologies, Ottawa, Canada — August 2018

Import libraries

In [181]:
# Pandas, NumPy, and plotting libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# SciPy: Pearson correlation coefficient + p-value.
# Fixed import path: `scipy.stats.stats` is deprecated/removed in modern SciPy.
from scipy.stats import pearsonr

# Disable warnings
# NOTE(review): blanket-silencing all warnings also hides deprecation notices;
# consider filtering specific categories instead.
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn methods and functions
from sklearn.base import TransformerMixin
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import r2_score, mean_squared_error  # fixed: trailing comma was a SyntaxError
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

Loading Data

In [171]:
# Load the "tips" dataset straight from the pandas GitHub repository.
# NOTE(review): the URL points at the moving `master` branch — pin a tag or
# commit hash to keep this notebook reproducible over time.
Path='https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/tips.csv'
df = pd.read_csv(Path)
# Regression target is the bill total; all remaining columns are features.
y = df['total_bill']
X = df.drop('total_bill', axis=1)
In [172]:
df.head(3)
Out[172]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
In [173]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null object
smoker        244 non-null object
day           244 non-null object
time          244 non-null object
size          244 non-null int64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.4+ KB

A transformer class that converts pandas object columns to categorical columns.

In [174]:
class CategoricalTransformer(TransformerMixin):
    """Convert a chosen set of DataFrame columns to pandas Categorical dtype.

    The category levels observed during ``fit`` are remembered, so later
    calls to ``transform`` encode frames with exactly the same levels.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Record each target column's categories (and orderedness)."""
        self.cat_map_ = {}
        for col in self.columns:
            self.cat_map_[col] = X[col].astype('category').cat
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the target columns as Categoricals."""
        out = X.copy()
        for col in self.columns:
            accessor = self.cat_map_[col]
            out[col] = pd.Categorical(out[col],
                                      categories=accessor.categories,
                                      ordered=accessor.ordered)
        return out

    def inverse_transform(self, trn, y=None):
        """Cast the categorical columns back to plain object dtype."""
        result = trn.copy()
        result[self.columns] = result[self.columns].apply(
            lambda s: s.astype(object))
        return result

DummyEncoder Class

In [175]:
class DummyEncoder(TransformerMixin):
    """One-hot encodes the categorical columns of a DataFrame into a numpy
    array, and can invert that encoding back to the original frame."""

    def fit(self, X, y=None):
        """Record column layout so ``inverse_transform`` can undo the encoding.

        Precomputes, for every categorical column, the slice of output columns
        its dummy indicators will occupy — assuming the encoded array lays out
        the non-categorical columns first, followed by one indicator column per
        category for each categorical column in order (this should match what
        ``pd.get_dummies`` produces in ``transform`` — verify on upgrade).
        """
        self.columns_ = X.columns
        self.cat_cols_ = X.select_dtypes(include=['category']).columns
        self.non_cat_cols_ = X.columns.drop(self.cat_cols_)
        # Keep each column's .cat accessor to recover categories/orderedness later.
        self.cat_map_ = {col: X[col].cat for col in self.cat_cols_}

        self.cat_blocks_ = {}  # {cat col: slice}
        left = len(self.non_cat_cols_)
        for col in self.cat_cols_:
            # Each categorical column expands to one output column per category.
            right = left + len(self.cat_map_[col].categories)
            self.cat_blocks_[col] = slice(left, right)
            left = right
        return self

    def transform(self, X, y=None):
        """One-hot encode and return a plain numpy array (loses column labels)."""
        return np.asarray(pd.get_dummies(X))

    def inverse_transform(self, trn, y=None):
        """Rebuild a DataFrame from the encoded array ``trn``.

        Numeric columns are taken verbatim from the leading columns; each
        categorical column is recovered by argmax over its indicator slice.
        """
        numeric = pd.DataFrame(trn[:, :len(self.non_cat_cols_)],
                               columns=self.non_cat_cols_)
        series = []
        for col, slice_ in self.cat_blocks_.items():
            # argmax over the dummy block picks the (single) active category code.
            codes = trn[:, slice_].argmax(1)
            cat = self.cat_map_[col]
            cat = pd.Categorical.from_codes(codes,
                                            cat.categories,
                                            cat.ordered)
            series.append(pd.Series(cat, name=col))
        # Reassemble and restore the original column order.
        return pd.concat([numeric] + series, axis='columns')[self.columns_]
In [176]:
X.columns.tolist()
Out[176]:
['tip', 'sex', 'smoker', 'day', 'time', 'size']

Building pipeline:

  • Convert pandas object columns to Categorical columns
  • Convert Categorical columns to numerical via one-hot encoding
  • standardize each column (subtract the mean, normalize the variance to 1)
  • compute all the interaction terms
  • fitting a Lasso/Linear regression
In [227]:
columns = ['sex', 'smoker', 'day', 'time']

# Build the modelling pipeline:
#   object cols -> Categorical -> one-hot -> standardize -> interactions -> OLS
# (Backslash continuations removed — they are redundant inside parentheses.)
pipe = make_pipeline(CategoricalTransformer(columns),
                     DummyEncoder(),
                     StandardScaler(),
                     PolynomialFeatures(interaction_only=True),
                     LinearRegression())

# Alternative: swap the final LinearRegression() for Lasso(alpha=.5)
# to obtain a sparse, regularized fit instead.

Fitting the model

In [228]:
pipe.fit(X, y)
Out[228]:
Pipeline(memory=None,
     steps=[('categoricaltransformer', <__main__.CategoricalTransformer object at 0x7f2ba0939320>), ('dummyencoder', <__main__.DummyEncoder object at 0x7f2ba0939278>), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('polynomialfeatures', PolynomialFeatures(degree=2, include_bias=True, interaction_only=True)), ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

Using the Model to Predict

In [182]:
yhat = pipe.predict(X)

Checking Model Performance

  • R2-score
  • MSE
  • Pearson
  • P-Value
In [213]:
def RegressionPerf(y,yhat):
    """Print regression diagnostics for predictions ``yhat`` against truth ``y``:
    MSE, R-squared, Pearson correlation, and its p-value."""
    pearson_r, p_value = pearsonr(y, yhat)
    separator = '================================='
    print(separator)
    print(f' MSE  = {mean_squared_error(y,yhat)}')
    print(f' R-Squared = {r2_score(y,yhat)}')
    print(f' Pearson = {pearson_r}')
    print(f' P-value = {p_value}')
    print(separator)
In [214]:
RegressionPerf(y,yhat)
=================================
 MSE  = 29.003491212713126
 R-Squared = 0.6325329047358115
 Pearson = 0.7953195914924319
 P-value = 1.582696082651608e-54
=================================

Visualize the regression in a jointplot

In [255]:
# seaborn >= 0.9 renamed `size` to `height`; seaborn >= 0.11 requires x=/y= keywords.
sns.jointplot(x=y, y=yhat, kind='reg', height=10);
Out[255]:
<seaborn.axisgrid.JointGrid at 0x7f2b9c289cf8>

PCA Unsupervised transformation

  • Categorical encoding
  • DummyEncoding (one-hot)
  • Transform the features using PCA
In [232]:
# Unsupervised pipeline: categorical encode -> one-hot -> project onto 3 PCs.
# NOTE(review): this rebinds `pipe`, shadowing the earlier regression pipeline.
pipe = make_pipeline(CategoricalTransformer(columns), DummyEncoder(), PCA(n_components=3))
trn = pipe.fit_transform(X)

Visualize PCA

In [248]:
# `size` -> `height` (renamed in seaborn 0.9); x=/y= keywords required in seaborn >= 0.11.
sns.jointplot(x=trn[:, 0], y=trn[:, 1], kind='kde', height=10);
In [249]:
# `size` -> `height` (renamed in seaborn 0.9); x=/y= keywords required in seaborn >= 0.11.
sns.jointplot(x=trn[:, 0], y=trn[:, 2], height=10);
In [250]:
# `size` -> `height` (renamed in seaborn 0.9); x=/y= keywords required in seaborn >= 0.11.
sns.jointplot(x=trn[:, 1], y=trn[:, 2], kind='hex', height=10);

Inverse transform to recover the original feature set

In [251]:
# Map the 3-D PCA scores back through the pipeline to reconstruct the features;
# the reconstruction is approximate since PCA with n_components=3 is lossy.
original_DF=pipe.inverse_transform(trn)
original_DF.head(5)
Out[251]:
tip sex smoker day time size
0 0.926659 Male No Sun Dinner 2.236633
1 1.613839 Male No Sun Dinner 3.166677
2 3.383382 Male No Sun Dinner 3.348111
3 3.102052 Male No Sat Dinner 2.592446
4 3.624311 Male No Sun Dinner 3.966866

The End