DataFrame_Categorical_Imputer

PHOTO

Thu Aug 24 2023 15:45:50 GMT+0000 (Coordinated Universal Time)

Saved by @sumikk ##partialdependencyplot #info.column_information(df)info.agg_tabulation(df)info.num_count_summary(df) info.statistical_summary(df)

class DataFrame_Categorical_Imputer():
    """Most-frequent-value imputer for the columns of a pandas DataFrame.

    ``fit`` stores one fill value per column in ``self.fill`` (a ``pd.Series``
    indexed by column name); ``transform`` uses it to fill missing entries.
    """

    def __init__(self):
        print("Imputation object created")

    @staticmethod
    def _most_frequent(series):
        """Return the most frequent non-missing value of ``series`` (NaN if none)."""
        if series.dtype == np.dtype('O'):
            counts = series.value_counts()
            # An all-missing column has no counts; index[0] would raise.
            return counts.index[0] if len(counts) else np.nan
        modes = series.mode()
        # mode() returns a Series (possibly several tied values, possibly
        # empty); keep a single scalar so self.fill holds one value per column.
        return modes.iloc[0] if len(modes) else np.nan

    def fit(self, data):
        """
        Learn the most frequent value of every column of ``data``.

        Parameters
        ----------
        data : pd.DataFrame
            Frame whose columns are scanned.

        Returns
        -------
        self, with ``self.fill`` set to a Series of one scalar per column
        (NaN for a column that has no non-missing value).
        """
        self.fill = pd.Series(
            [self._most_frequent(data[column]) for column in data],
            index=data.columns)

        return self

    def transform(self, data):
        """
        Return a copy of ``data`` whose missing entries are replaced, column
        by column, with the values learned by ``fit``. ``data`` itself is not
        modified. ``fit`` must have been called first.
        """
        return data.fillna(self.fill)

COPY

This method is applicable to categorical variables, i.e. variables that take a finite list of values: each missing entry is imputed with the most frequent value. It works for both nominal and ordinal categorical values. Unfortunately, this method does not take correlations between features into account, and it can introduce bias into the data. If the category values are not balanced, then you are likely to introduce bias into the data. So make sure that the independent variables are balanced; if they are, we can impute with the most frequent value.

Save snippets that work from anywhere online with our extensions

Comments

Data Science

@sumikk

Exploratory Data Analysis DataFrame_Categorical_Imputer Imputing Numerical Variable Model Selection using KFold CrossValidation Model Building Feature Selection Hyper Parameter Tuning Fitted_DataPoints_vs_ActualPoints

##partialdependencyplot

Explainable AI

# ------------------------------- Partial Dependency Plot--------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from pdpbox import pdp, get_dataset, info_plots

# Assuming 'data' is your DataFrame with the features and target variable.
# 'target' is the name of the column you want to predict and 'features' is the
# list of feature names. None of the three is defined in this snippet.

# Split the data into training and testing sets (seeded, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[target], test_size=0.2, random_state=42)

# Train a machine learning model (e.g., Random Forest Regressor).
# The forest is seeded with the same value as the split; without a seed the
# fitted model, and therefore the plot below, changes from run to run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Create the PDP plot for a specific feature (e.g., 'feature_name')
# NOTE(review): pdp_isolate / pdp_plot are pdpbox's function-style API; confirm
# they exist in the installed pdpbox release.
feature_to_plot = 'feature_name'
pdp_dist = pdp.pdp_isolate(model=model, dataset=X_test, model_features=features, feature=feature_to_plot)

# Plot the PDP
pdp.pdp_plot(pdp_dist, feature_to_plot)
plt.show()

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df) info.num_count_summary(df) info.statistical_summary(df)

Exploratory Data Analysis

class Attribute_Information():
    """Exploratory summaries (schema, missing values, numeric profile) of a DataFrame."""

    def __init__(self):
        print("Attribute Information object created")

    def Column_information(self, df):
        """
        Count the rows, the columns and the columns of each kind in ``df``.

        Returns a DataFrame with a single integer column ``'value'`` whose
        index holds the labels: number of observations, variables, numerical,
        factor (``category`` dtype), categorical (``object`` dtype), logical
        (``bool``), date (``datetime64``) and zero-variance (exactly one
        distinct non-missing value) variables.
        """
        counts = {
            'No of observation': df.shape[0],
            'No of Variables': df.shape[1],
            'No of Numerical Variables': df._get_numeric_data().shape[1],
            'No of Factor Variables': df.select_dtypes(include='category').shape[1],
            'No of Categorical Variables': df.select_dtypes(include='object').shape[1],
            'No of Logical Variables': df.select_dtypes(include='bool').shape[1],
            'No of Date Variables': df.select_dtypes(include='datetime64').shape[1],
            'No of zero variance variables': int((df.nunique() == 1).sum()),
        }

        data_info = pd.DataFrame({'value': list(counts.values())},
                                 index=list(counts.keys()))
        data_info['value'] = data_info['value'].astype(int)

        return data_info

    def __get_missing_values(self, data):
        """
        Private (name-mangled) helper: return the number of missing values
        per column of ``data``, sorted from the most to the fewest.
        """
        return data.isnull().sum().sort_values(ascending=False)

    def Agg_Tabulation(self, data):
        """
        Print a banner and return one row per column of ``data`` with: name,
        dtype, number and percentage of missing values, number of unique
        values, the first three observations, and the base-2 entropy of the
        column's value distribution (rounded to 2 decimals).

        The observations are taken by position, so any index works; when
        ``data`` has fewer than three rows the missing observations are NaN.
        """
        # Local import: the entropy helper lives in scipy.stats, which the
        # surrounding code never imports.
        from scipy import stats

        print("=" * 110)
        print("Aggregation of Table")
        print("=" * 110)

        missing = data.isnull().sum().values
        table = pd.DataFrame({'Name': data.columns,
                              'dtypes': data.dtypes.values})
        table['No of Missing'] = missing
        table['No of Uniques'] = data.nunique().values
        table['Percent of Missing'] = (missing / data.shape[0]) * 100

        labels = ('First Observation', 'Second Observation', 'Third Observation')
        for position, label in enumerate(labels):
            if position < len(data):
                table[label] = data.iloc[position].values
            else:
                table[label] = np.nan

        table['Entropy'] = [
            round(stats.entropy(data[name].value_counts(normalize=True), base=2), 2)
            for name in data.columns
        ]

        # Closing banner; must be printed before returning to be reachable.
        print("=" * 110)
        return table

    def __iqr(self, x):
        """
        Private helper: return the interquartile range (Q3 - Q1) of ``x``.
        """
        return x.quantile(q=0.75) - x.quantile(q=0.25)

    def __outlier_count(self, x):
        """
        Private helper: count the values of ``x`` lying more than 1.5 times
        the interquartile range above Q3 or below Q1.
        """
        q1 = x.quantile(q=0.25)
        q3 = x.quantile(q=0.75)
        reach = 1.5 * (q3 - q1)
        return len(x[x > q3 + reach]) + len(x[x < q1 - reach])

    def num_count_summary(self, df):
        """
        Profile every numeric column of ``df``.

        Returns a DataFrame indexed by column name with the counts of
        negative, positive and zero values, the number of unique values
        (a missing value counts as one), the counts of negative and positive
        infinity, the percentage (0-100) of missing values and the number of
        IQR outliers.
        """
        df_num = df._get_numeric_data()
        fields = ['Negative values count',
                  'Positive values count',
                  'Zero count',
                  'Unique count',
                  'Negative Infinity count',
                  'Positive Infinity count',
                  'Missing Percentage',
                  'Count of outliers']

        records = []
        for name in df_num.columns:
            col = df_num[name]
            records.append({
                'Negative values count': int((col < 0).sum()),
                'Positive values count': int((col > 0).sum()),
                'Zero count': int((col == 0).sum()),
                'Unique count': len(col.unique()),
                'Negative Infinity count': int((col == -np.inf).sum()),
                'Positive Infinity count': int((col == np.inf).sum()),
                # mean() of the missing mask is the missing fraction (NaN for
                # a zero-row frame instead of a ZeroDivisionError); scaled to
                # a real percentage to match the column label.
                'Missing Percentage': col.isnull().mean() * 100,
                'Count of outliers': self.__outlier_count(col),
            })

        return pd.DataFrame(records, index=list(df_num.columns), columns=fields)

    def statistical_summary(self, df):
        """
        Return, for every numeric column of ``df``, the ``describe()``
        statistics plus the 10th, 90th and 95th percentiles, one row per
        column. An empty DataFrame is returned when there is no numeric
        column.
        """
        df_num = df._get_numeric_data()

        data_stat_num = pd.DataFrame()
        if df_num.shape[1] == 0:
            return data_stat_num

        try:
            data_stat_num = pd.concat(
                [df_num.describe().transpose()]
                + [df_num.quantile(q=q).to_frame() for q in (0.10, 0.90, 0.95)],
                axis=1)
            data_stat_num.columns = ['count', 'mean', 'std', 'min', '25%', '50%',
                                     '75%', 'max', '10%', '90%', '95%']
        except (TypeError, ValueError) as err:
            # Still best effort (whatever was built so far is returned), but
            # the failure is reported instead of being silently swallowed.
            import warnings
            warnings.warn("statistical_summary could not be completed: %s" % err)

        return data_stat_num



# Build the summary helper and run each report on `df`, which must already be
# defined (it is not created anywhere in this snippet).
# NOTE: the returned tables are not assigned; outside an interactive session
# they are discarded, so bind them to names if they are needed later.
Info = Attribute_Information()
Info.Column_information(df)
Info.Agg_Tabulation(df)
Info.num_count_summary(df)
Info.statistical_summary(df)

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df)info.num_count_summary(df) info.statistical_summary(df)

DataFrame_Categorical_Imputer

class DataFrame_Categorical_Imputer():
    """Most-frequent-value imputer for the columns of a pandas DataFrame.

    ``fit`` stores one fill value per column in ``self.fill`` (a ``pd.Series``
    indexed by column name); ``transform`` uses it to fill missing entries.
    """

    def __init__(self):
        print("Imputation object created")

    @staticmethod
    def _most_frequent(series):
        """Return the most frequent non-missing value of ``series`` (NaN if none)."""
        if series.dtype == np.dtype('O'):
            counts = series.value_counts()
            # An all-missing column has no counts; index[0] would raise.
            return counts.index[0] if len(counts) else np.nan
        modes = series.mode()
        # mode() returns a Series (possibly several tied values, possibly
        # empty); keep a single scalar so self.fill holds one value per column.
        return modes.iloc[0] if len(modes) else np.nan

    def fit(self, data):
        """
        Learn the most frequent value of every column of ``data``.

        Parameters
        ----------
        data : pd.DataFrame
            Frame whose columns are scanned.

        Returns
        -------
        self, with ``self.fill`` set to a Series of one scalar per column
        (NaN for a column that has no non-missing value).
        """
        self.fill = pd.Series(
            [self._most_frequent(data[column]) for column in data],
            index=data.columns)

        return self

    def transform(self, data):
        """
        Return a copy of ``data`` whose missing entries are replaced, column
        by column, with the values learned by ``fit``. ``data`` itself is not
        modified. ``fit`` must have been called first.
        """
        return data.fillna(self.fill)

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df)info.num_count_summary(df)info.statistical_summary(df)

Imputing Numerical Variable

class DataFrame_numerical_Imputer():
    """K-nearest-neighbours imputation of missing values in a DataFrame."""

    def __init__(self):
        print("numerical_Imputer object created")

    def KNN_Imputer(self, df, n_neighbors=5):
        """
        Fill the missing values of ``df`` with scikit-learn's ``KNNImputer``.

        Parameters
        ----------
        df : pd.DataFrame
            Frame to impute. It is overwritten in place and also returned.
            NOTE(review): presumably every column must be numeric; confirm
            against the callers.
        n_neighbors : int, default 5
            Number of neighbours used for each imputed value.

        Returns
        -------
        The same ``df`` object, with the imputed values written into it.
        """
        # Imported here because the surrounding code never imports KNNImputer.
        from sklearn.impute import KNNImputer

        knn_imputer = KNNImputer(n_neighbors=n_neighbors)
        # NOTE(review): a column that is entirely missing may be dropped by
        # the imputer, which would make this assignment fail on a shape
        # mismatch; confirm such columns are removed beforehand.
        df.iloc[:, :] = knn_imputer.fit_transform(df)
        return df

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df)info.num_count_summary(df)info.statistical_summary(df)

Model Selection using KFold CrossValidation

from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
# Separate the regression target from the predictor columns of `df`.
y = df['Customer_Lifetime_Value']
x = df.drop(columns=['Customer_Lifetime_Value'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=42)

class Model_Selector():
    """Compare candidate regressors by cross-validated R^2."""

    def __init__(self):
        print("Model Selector object created")

    def Regression_Model_Selector(self, df):
        """
        Help select the machine learning model that best captures the
        relationship between the input and the dependent variable.

        Each candidate (linear regression, random forest, k-nearest
        neighbours, decision tree, XGBoost) is scored with 5-fold
        cross-validation on the module-level ``x_train`` / ``y_train``;
        the ``df`` argument is accepted but not used. For every model a
        ``(name, mean score, score std)`` tuple is printed, then a box plot
        of the fold scores is shown. Nothing is returned.
        """
        seed = 42
        scoring = 'r2'
        models = [
            ("LR", LinearRegression()),
            ("RF", RandomForestRegressor()),
            ("KNN", KNeighborsRegressor()),
            ("CART", DecisionTreeRegressor()),
            ("XGB", XGBRegressor()),
        ]

        # shuffle=True is what makes random_state meaningful; current
        # scikit-learn raises ValueError when a seed is given without it.
        # One splitter is shared so every model is scored on the same folds.
        kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

        result = []
        names = []
        for name, model in models:
            cv_results = cross_val_score(model, x_train, y_train,
                                         cv=kfold, scoring=scoring)
            result.append(cv_results)
            names.append(name)
            msg = (name, cv_results.mean(), cv_results.std())
            print(msg)

        fig = plt.figure(figsize=(8, 4))
        fig.suptitle('Algorithm Comparison')
        ax = fig.add_subplot(1, 1, 1)
        plt.boxplot(result)
        ax.set_xticklabels(names)
        plt.show()

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df)info.num_count_summary(df)info.statistical_summary(df)

Model Building

import statsmodels.api as sm

class Data_Modelling(object):
    """Fit baseline regressors and report their test RMSE and R^2.

    Every model method trains on the module-level ``x_train`` / ``y_train``
    and evaluates on ``x_test`` / ``y_test``; the ``df`` / ``data`` argument
    each method accepts is not used.
    """

    def __init__(self, n_estimators,
                    max_depth,
                    min_samples_split,
                    min_samples_leaf,
                    max_leaf_nodes,
                    min_impurity_split,
                    min_impurity_decrease,
                    bootstrap,
                    min_child_weight,
                    learning_rate,
                    Subsample,
                    Alpha,
                    Lamda,
                    random_state,
                    criterion):
        """
        Store the hyper-parameters used by the model methods.

        The arguments are kept as given (previously they were ignored and
        fixed values were stored instead). ``Subsample``, ``Alpha`` and
        ``Lamda`` are forwarded to XGBoost as ``subsample``, ``reg_alpha``
        and ``reg_lambda``. Pass ``min_impurity_split=None`` to leave that
        option out of the decision tree.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_split = min_impurity_split
        self.min_impurity_decrease = min_impurity_decrease
        self.bootstrap = bootstrap
        self.min_child_weight = min_child_weight
        self.learning_rate = learning_rate
        self.Subsample = Subsample
        self.Alpha = Alpha
        self.Lamda = Lamda
        self.random_state = random_state
        self.criterion = criterion

        print("Data Modelling object created")

    def _fit_and_score(self, regressor):
        """Fit ``regressor`` on the training split; return (test RMSE, test R^2)."""
        regressor.fit(x_train, y_train)
        predictions = regressor.predict(x_test)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
        return rmse, r2_score(y_test, predictions)

    def OLS_Summary(self, data):
        """
        Fit a statsmodels OLS of ``y_train`` on ``x_train`` and return its
        summary. No constant column is added here, so the model has an
        intercept only if ``x_train`` already contains one.
        """
        return sm.OLS(y_train, x_train).fit().summary()

    def Linear_Regression_Model(self, df):
        """Return (RMSE, R^2) on the test split for a plain linear regression."""
        return self._fit_and_score(LinearRegression())

    def Decision_Tree_Model(self, df):
        """Return (RMSE, R^2) on the test split for a decision tree regressor."""
        tree_params = dict(max_depth=self.max_depth,
                           min_samples_split=self.min_samples_split,
                           min_samples_leaf=self.min_samples_leaf,
                           max_leaf_nodes=self.max_leaf_nodes,
                           min_impurity_decrease=self.min_impurity_decrease,
                           random_state=self.random_state)
        # NOTE(review): min_impurity_split is not accepted by newer
        # scikit-learn releases; it is only forwarded when a value is set.
        if self.min_impurity_split is not None:
            tree_params['min_impurity_split'] = self.min_impurity_split

        return self._fit_and_score(DecisionTreeRegressor(**tree_params))

    def Random_Forest_Model(self, df):
        """Return (RMSE, R^2) on the test split for a random forest regressor."""
        # random_state is passed (as for the tree and XGBoost models) so that
        # repeated runs give the same scores.
        RF_Regressor = RandomForestRegressor(n_estimators=self.n_estimators,
                                             max_depth=self.max_depth,
                                             min_samples_split=self.min_samples_split,
                                             min_samples_leaf=self.min_samples_leaf,
                                             max_leaf_nodes=self.max_leaf_nodes,
                                             bootstrap=self.bootstrap,
                                             criterion=self.criterion,
                                             random_state=self.random_state)

        return self._fit_and_score(RF_Regressor)

    def Extreme_Gradient_Boosting_Model(self, df):
        """Return (RMSE, R^2) on the test split for an XGBoost regressor."""
        # XGBoost's keyword names are subsample / reg_alpha / reg_lambda; the
        # capitalised spellings used before are not recognised parameters.
        XGB_Regressor = XGBRegressor(n_estimators=self.n_estimators,
                                     learning_rate=self.learning_rate,
                                     max_depth=self.max_depth,
                                     min_child_weight=self.min_child_weight,
                                     random_state=self.random_state,
                                     subsample=self.Subsample,
                                     reg_alpha=self.Alpha,
                                     reg_lambda=self.Lamda)

        return self._fit_and_score(XGB_Regressor)

# base model: hyper-parameters spelled out by name
Basemodell = Data_Modelling(
    n_estimators=500,
    max_depth=5,
    min_samples_split=3,
    min_samples_leaf=3,
    max_leaf_nodes=None,
    min_impurity_split=1,
    min_impurity_decrease=0.1,
    bootstrap=True,
    min_child_weight=3,
    learning_rate=0.07,
    Subsample=0.7,
    Alpha=0,
    Lamda=1.5,
    random_state=29,
    criterion='mse',
)

# Run each baseline in turn on `df` (defined outside this snippet).
Basemodell.OLS_Summary(df)
Basemodell.Linear_Regression_Model(df)
Basemodell.Decision_Tree_Model(df)
Basemodell.Random_Forest_Model(df)
Basemodell.Extreme_Gradient_Boosting_Model(df)

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df)info.num_count_summary(df)info.statistical_summary(df)

Feature Selection

from sklearn.feature_selection import RFE
from catboost import CatBoostRegressor

class Feature_Selection(Data_Modelling):
    """Rank features by recursive feature elimination around a random forest."""

    def __init__(self, n_estimators,
                    max_depth,
                    min_samples_split,
                    min_samples_leaf,
                    max_leaf_nodes,
                    min_impurity_split,
                    min_impurity_decrease,
                    bootstrap,
                    min_child_weight,
                    learning_rate,
                    Subsample,
                    Alpha,
                    Lamda,
                    random_state,
                    criterion):
        """Forward every hyper-parameter, unchanged, to ``Data_Modelling``."""
        super().__init__(n_estimators,
                         max_depth,
                         min_samples_split,
                         min_samples_leaf,
                         max_leaf_nodes,
                         min_impurity_split,
                         min_impurity_decrease,
                         bootstrap,
                         min_child_weight,
                         learning_rate,
                         Subsample,
                         Alpha,
                         Lamda,
                         random_state,
                         criterion)
        print("Feature Selection object created")

    def Regression_Feature_Selector(self, data, n_features_to_select=6):
        """
        Run recursive feature elimination on the module-level ``x_train`` /
        ``y_train`` (``data`` is accepted but not used), removing one feature
        per step.

        Parameters
        ----------
        data : unused
        n_features_to_select : int, default 6
            Number of features RFE keeps.

        Returns
        -------
        DataFrame with columns ``'Importance'`` (the RFE ranking) and
        ``'Columns'`` (the feature name), sorted by ascending ranking.
        """
        # Seeded with the stored random_state so the ranking is repeatable.
        estimator = RandomForestRegressor(n_estimators=self.n_estimators,
                                          max_depth=self.max_depth,
                                          min_samples_split=self.min_samples_split,
                                          min_samples_leaf=self.min_samples_leaf,
                                          max_leaf_nodes=self.max_leaf_nodes,
                                          bootstrap=self.bootstrap,
                                          criterion=self.criterion,
                                          random_state=self.random_state)

        # n_features_to_select is passed by keyword: newer scikit-learn
        # releases no longer accept it positionally.
        selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
        selector = selector.fit(x_train, y_train)

        Var = pd.DataFrame({'Importance': selector.ranking_,
                            'Columns': x_train.columns})
        return Var.sort_values(["Importance"], axis=0, ascending=True)


# Feature selector built with named hyper-parameters.
FS = Feature_Selection(
    n_estimators=500,
    max_depth=5,
    min_samples_split=3,
    min_samples_leaf=3,
    max_leaf_nodes=None,
    min_impurity_split=1,
    min_impurity_decrease=0.1,
    bootstrap=True,
    min_child_weight=3,
    learning_rate=0.07,
    Subsample=0.7,
    Alpha=0,
    Lamda=1.5,
    random_state=29,
    criterion='mse',
)

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df)info.num_count_summary(df)info.statistical_summary(df)

Hyper Parameter Tuning

from sklearn.model_selection import RandomizedSearchCV
class Model_Regression_HyperParameter_Tuning():
    """Container for the randomized hyper-parameter searches defined below.

    The XGBoost tuner is nested in this class and the random-forest tuner is
    nested in the XGBoost tuner; that layout is kept so existing attribute
    paths keep working. All searches fit on the module-level ``x_train`` /
    ``y_train`` and evaluate on ``x_test``.
    """

    def __init__(self):
        print("HyperParameter_Tuning object created")

    class XGB_Regressor_HyperParameter_Tuning():
        """Randomized search over XGBoost regressor hyper-parameters."""

        def __init__(self):
            print("XGB HyperParameter_Tuning object created")

        def Fit_XGB_HyperParameter_Tuner(self, dftrain):
            """
            Run a seeded, 3-fold ``RandomizedSearchCV`` scored by R^2 and
            return the fitted search object. ``dftrain`` is accepted but not
            used.
            """
            xgb_Reg = XGBRegressor(tree_method="exact", predictor="cpu_predictor",
                                   objective="reg:squarederror")

            parameters = {"learning_rate": [0.1, 0.01, 0.001],
                          "gamma": [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
                          "max_depth": [2, 4, 7, 10],
                          "colsample_bytree": [0.3, 0.6, 0.8, 1.0],
                          "subsample": [0.2, 0.4, 0.5, 0.6, 0.7],
                          "reg_alpha": [0, 0.5, 1],
                          "reg_lambda": [1, 1.5, 2, 3, 4.5],
                          "min_child_weight": [1, 3, 5, 7],
                          "n_estimators": [100, 250, 500, 1000]}

            xgb_rscv = RandomizedSearchCV(xgb_Reg, param_distributions=parameters,
                                          scoring="r2", cv=3, random_state=29)

            # Fit the model
            model_xgboost = xgb_rscv.fit(x_train, y_train)
            return model_xgboost

        def XGB_Get_Best_Prams(self, model=None):
            """
            Print the tuned hyper-parameters of a fitted search.

            ``model`` is the fitted search object; when omitted, the
            module-level ``Xgb_model`` is used.
            """
            search = Xgb_model if model is None else model
            best = search.best_estimator_.get_params()

            print("Learning Rate: ", best["learning_rate"])
            print("Gamma: ", best["gamma"])
            print("Max Depth: ", best["max_depth"])
            print("Subsample: ", best["subsample"])
            print("Max Features at Split: ", best["colsample_bytree"])
            print("Alpha: ", best["reg_alpha"])
            print("Lamda: ", best["reg_lambda"])
            print("Minimum Sum of the Instance Weight Hessian to Make a Child: ", best["min_child_weight"])
            print("Number of Trees: ", best["n_estimators"])

        def get_Regressor_result(self, modelname, y_test):
            """
            Predict on the module-level ``x_test`` with ``modelname`` and
            return ``(RMSE, R^2)`` against ``y_test``.
            """
            predictions = modelname.predict(x_test)
            rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
            return rmse, r2_score(y_test, predictions)

        class RF_Regressor_HyperParameter_Tuning():
            """Randomized search over random-forest regressor hyper-parameters."""

            def __init__(self):
                print("RF HyperParameter_Tuning object created")

            def Fit_RF_HyperParameter_Tuner(self, dftrain):
                """
                Run a seeded, 3-fold ``RandomizedSearchCV`` scored by R^2 and
                return the fitted search object. ``dftrain`` is accepted but
                not used.
                """
                # min_samples_split starts at 2: an integer value of 1 is not
                # a valid setting, so sampling it only wasted search
                # iterations on failing fits.
                # NOTE(review): "mse" / "mae" are the older criterion
                # spellings; confirm they match the installed scikit-learn.
                param_grid = {"max_depth": [1, 3, 5, 7, 9, 10],
                              "min_samples_split": [2, 3, 10, 15, 20],
                              "min_samples_leaf": [1, 3, 5, 10],
                              "bootstrap": [True, False],
                              "criterion": ["mse", "mae"],
                              "n_estimators": [100, 250, 500, 1000]}

                Reg = RandomForestRegressor(random_state=29, n_jobs=-1)
                # Seeded like the XGBoost search so the sampled candidates
                # are the same on every run.
                model = RandomizedSearchCV(Reg, param_grid, scoring='r2', cv=3,
                                           random_state=29)

                model.fit(x_train, y_train)

                return model

            def RF_Get_Best_Prams(self, model=None):
                """
                Print the tuned hyper-parameters of a fitted search.

                ``model`` is the fitted search object; when omitted, the
                module-level ``RF_model`` is used.
                """
                search = RF_model if model is None else model
                best = search.best_estimator_.get_params()

                print("n_estimators: ", best["n_estimators"])
                print("Max Depth: ", best["max_depth"])
                print("min_samples_split: ", best["min_samples_split"])
                print("min_samples_leaf: ", best["min_samples_leaf"])
                print("max_leaf_nodes: ", best["max_leaf_nodes"])
                print("bootstrap: ", best["bootstrap"])
                print("criterion: ", best["criterion"])

            def Evaluation_Result(self, modelname, y_test):
                """
                Predict on the module-level ``x_test`` with ``modelname`` and
                return ``(RMSE, R^2)`` against ``y_test``.
                """
                predictions = modelname.predict(x_test)
                rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
                return rmse, r2_score(y_test, predictions)



# Build one tuner instance per model family. The RF tuner class is nested
# inside the XGB tuner, and it must be instantiated (trailing parentheses):
# binding the class itself makes every method call below fail, because the
# first argument is then taken as `self`.
HP_XGB = Model_Regression_HyperParameter_Tuning().XGB_Regressor_HyperParameter_Tuning()
HP_RF = Model_Regression_HyperParameter_Tuning().XGB_Regressor_HyperParameter_Tuning().RF_Regressor_HyperParameter_Tuning()

# XGBoost: search, report the best parameters, evaluate on the test split.
Xgb_model = HP_XGB.Fit_XGB_HyperParameter_Tuner(df)


HP_XGB.XGB_Get_Best_Prams()


HP_XGB.get_Regressor_result(Xgb_model,y_test)

# Random forest: same three steps.
RF_model = HP_RF.Fit_RF_HyperParameter_Tuner(df)

HP_RF.RF_Get_Best_Prams()

HP_RF.Evaluation_Result(RF_model,y_test)

DataFrame_Categorical_Imputer

Save snippets that work from anywhere online with our extensions

Comments

More like this

Data Science

Browse more snippets >>

DataFrame_Categorical_Imputer

Save snippets that work from anywhere online with our extensions

Comments

More like this

Data Science

Browse more snippets >>

Embed code snippet