Feature Selection

PHOTO

Thu Aug 24 2023 15:53:13 GMT+0000 (Coordinated Universal Time)

Saved by @sumikk ##partialdependencyplot #info.column_information(df) info.agg_tabulation(df) info.num_count_summary(df) info.statistical_summary(df)

from sklearn.feature_selection import RFE
from catboost import CatBoostRegressor

class Feature_Selection(Data_Modelling):
    """Rank features with recursive feature elimination (RFE).

    The hyper-parameter attributes set up by ``Data_Modelling.__init__``
    configure the random-forest estimator that drives the elimination.
    """

    def __init__(self, n_estimators,
                 max_depth,
                 min_samples_split,
                 min_samples_leaf,
                 max_leaf_nodes,
                 min_impurity_split,
                 min_impurity_decrease,
                 bootstrap,
                 min_child_weight,
                 learning_rate,
                 Subsample,
                 Alpha,
                 Lamda,
                 random_state,
                 criterion):
        """Forward every hyper-parameter, unchanged, to ``Data_Modelling``."""
        super().__init__(n_estimators,
                         max_depth,
                         min_samples_split,
                         min_samples_leaf,
                         max_leaf_nodes,
                         min_impurity_split,
                         min_impurity_decrease,
                         bootstrap,
                         min_child_weight,
                         learning_rate,
                         Subsample,
                         Alpha,
                         Lamda,
                         random_state,
                         criterion)
        print("Feature Selection object created")

    def Regression_Feature_Selector(self, data, n_features_to_select=6, step=1):
        """Return the RFE ranking of every training column.

        Parameters
        ----------
        data : unused
            Kept for interface compatibility; the selector is fitted on the
            module-level ``x_train`` / ``y_train``.
        n_features_to_select : int, default 6
            Number of features RFE keeps (these receive rank 1).
        step : int, default 1
            Number of features removed per elimination round.

        Returns
        -------
        pandas.DataFrame
            Columns ``'Importance'`` (the RFE rank, 1 = selected) and
            ``'Columns'`` (the feature name), sorted by ascending rank.
        """
        estimator = RandomForestRegressor(n_estimators=self.n_estimators,
                                          max_depth=self.max_depth,
                                          min_samples_split=self.min_samples_split,
                                          min_samples_leaf=self.min_samples_leaf,
                                          max_leaf_nodes=self.max_leaf_nodes,
                                          bootstrap=self.bootstrap,
                                          criterion=self.criterion)

        # Pass the feature count by keyword: recent scikit-learn releases make
        # every RFE argument after the estimator keyword-only, so the old
        # positional form raises TypeError there.
        selector = RFE(estimator,
                       n_features_to_select=n_features_to_select,
                       step=step)
        selector = selector.fit(x_train, y_train)

        Var = pd.DataFrame({'Importance': selector.ranking_,
                            'Columns': list(x_train.columns)})
        Var.sort_values(["Importance"], axis=0,
                        ascending=True, inplace=True)
        return Var


# Build the feature selector. The positional arguments follow the
# Feature_Selection.__init__ parameter order: n_estimators, max_depth,
# min_samples_split, min_samples_leaf, max_leaf_nodes, min_impurity_split,
# min_impurity_decrease, bootstrap, min_child_weight, learning_rate,
# Subsample, Alpha, Lamda, random_state, criterion.
FS = Feature_Selection(500,5,3,3,None,1,0.1,True,3,0.07,0.7,0,1.5,29,'mse')

COPY

Save snippets that work from anywhere online with our extensions

Comments

Data Science

@sumikk

Exploratory Data Analysis | DataFrame_Categorical_Imputer | Imputing Numerical Variable | Model Selection using KFold CrossValidation | Model Building | Feature Selection | Hyper Parameter Tuning | Fitted_DataPoints_vs_ActualPoints

##partialdependencyplot

Explainable AI

# ------------------------------- Partial Dependency Plot--------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from pdpbox import pdp, get_dataset, info_plots

# Assuming 'data' is your DataFrame with the features and target variable.
# Let's say 'target' is the column you want to predict and 'features' is the list of feature names.
# None of `data`, `features` or `target` is defined in this snippet; all three
# must already exist in the session before it runs.

# Split the data into training and testing sets
# (20% of the rows are held out; random_state=42 makes the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)

# Train a machine learning model (e.g., Random Forest Regressor)
# The regressor is created with its default hyper-parameters.
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Create the PDP plot for a specific feature (e.g., 'feature_name')
# 'feature_name' is a placeholder; replace it with a real column from `features`.
# NOTE(review): pdp_isolate / pdp_plot look like the older pdpbox API - confirm
# they exist in the installed pdpbox version.
feature_to_plot = 'feature_name'
pdp_dist = pdp.pdp_isolate(model=model, dataset=X_test, model_features=features, feature=feature_to_plot)

# Plot the PDP
pdp.pdp_plot(pdp_dist, feature_to_plot)
plt.show()

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df) info.num_count_summary(df) info.statistical_summary(df)

Exploratory Data Analysis

class Attribute_Information():
    """Quick exploratory summaries of a pandas DataFrame.

    The object holds no state; every public method takes the frame to
    describe and returns a new DataFrame.
    """

    def __init__(self):
        """Announce that the helper object exists."""
        print("Attribute Information object created")

    def Column_information(self, df):
        """Count the rows, the columns, and the columns of each broad dtype.

        Returns
        -------
        pandas.DataFrame
            One row per metric and a single integer column ``'value'``.
        """
        counts = {
            'No of observation': df.shape[0],
            'No of Variables': df.shape[1],
            'No of Numerical Variables': df._get_numeric_data().shape[1],
            'No of Factor Variables': df.select_dtypes(include='category').shape[1],
            'No of Categorical Variables': df.select_dtypes(include='object').shape[1],
            'No of Logical Variables': df.select_dtypes(include='bool').shape[1],
            'No of Date Variables': df.select_dtypes(include='datetime64').shape[1],
            # A column with exactly one distinct non-null value has zero variance.
            'No of zero variance variables': int((df.nunique() == 1).sum()),
        }

        data_info = pd.DataFrame({'value': pd.Series(counts)})
        data_info['value'] = data_info['value'].astype(int)
        return data_info

    def __get_missing_values(self, data):
        """Return the missing-value count of each column, largest first.

        The double underscore makes the name private to the class through
        name mangling.
        """
        return data.isnull().sum().sort_values(ascending=False)

    def Agg_Tabulation(self, data):
        """Tabulate dtype, missing values, uniques, samples and entropy per column.

        Returns
        -------
        pandas.DataFrame
            One row per column of ``data`` with the columns ``Name``,
            ``dtypes``, ``No of Missing``, ``No of Uniques``,
            ``Percent of Missing``, ``First/Second/Third Observation`` and
            ``Entropy`` (base-2 entropy of the value frequencies, rounded
            to two decimals).
        """
        # Local import: no ``stats`` name is imported at module level in this file.
        from scipy import stats

        print("=" * 110)
        print("Aggregation of Table")
        print("=" * 110)

        table = pd.DataFrame(data.dtypes, columns=['dtypes'])
        table = table.reset_index()
        table = table.rename(columns={'index': 'Name'})

        missing = data.isnull().sum().values
        table['No of Missing'] = missing
        table['No of Uniques'] = data.nunique().values
        table['Percent of Missing'] = (missing / data.shape[0]) * 100

        # Positional access: label-based ``loc[0]`` raises KeyError whenever
        # the index is not the default 0..n-1 range. Frames shorter than
        # three rows get NaN for the rows that do not exist.
        for label, position in (('First Observation', 0),
                                ('Second Observation', 1),
                                ('Third Observation', 2)):
            if position < len(data):
                table[label] = data.iloc[position].values
            else:
                table[label] = np.nan

        table['Entropy'] = [
            round(stats.entropy(data[name].value_counts(normalize=True), base=2), 2)
            for name in table['Name']
        ]
        return table

    def __iqr(self, x):
        """Return the interquartile range (75th minus 25th percentile) of ``x``."""
        return x.quantile(q=0.75) - x.quantile(q=0.25)

    def __outlier_count(self, x):
        """Count the values of ``x`` lying beyond 1.5 * IQR from the quartiles."""
        whisker = 1.5 * self.__iqr(x)
        upper_out = x.quantile(q=0.75) + whisker
        lower_out = x.quantile(q=0.25) - whisker
        return int(((x > upper_out) | (x < lower_out)).sum())

    def num_count_summary(self, df):
        """Summarise sign, zero, infinity, missing and outlier counts.

        Only the numeric columns of ``df`` are described.

        Returns
        -------
        pandas.DataFrame
            One row per numeric column. ``'Missing Percentage'`` is on a
            0-100 scale, matching ``'Percent of Missing'`` produced by
            ``Agg_Tabulation``.
        """
        df_num = df._get_numeric_data()
        data_info_num = pd.DataFrame()
        n_rows = df_num.shape[0]

        for c in df_num.columns:
            column = df_num[c]
            data_info_num.loc[c, 'Negative values count'] = int((column < 0).sum())
            data_info_num.loc[c, 'Positive values count'] = int((column > 0).sum())
            data_info_num.loc[c, 'Zero count'] = int((column == 0).sum())
            # ``unique()`` keeps NaN, so a missing value counts as one entry.
            data_info_num.loc[c, 'Unique count'] = len(column.unique())
            data_info_num.loc[c, 'Negative Infinity count'] = int((column == -np.inf).sum())
            data_info_num.loc[c, 'Positive Infinity count'] = int((column == np.inf).sum())
            n_missing = int(column.isnull().sum())
            data_info_num.loc[c, 'Missing Percentage'] = (
                n_missing / n_rows * 100 if n_rows else 0.0
            )
            data_info_num.loc[c, 'Count of outliers'] = self.__outlier_count(column)
        return data_info_num

    def statistical_summary(self, df):
        """Return ``describe()`` statistics plus the 10th/90th/95th percentiles.

        Returns
        -------
        pandas.DataFrame
            One row per numeric column. An empty frame is returned when
            ``df`` has no numeric columns or the statistics cannot be
            computed; in the latter case a warning names the cause.
        """
        import warnings

        df_num = df._get_numeric_data()
        if df_num.shape[1] == 0:
            # ``describe()`` raises on a frame without columns.
            return pd.DataFrame()

        try:
            data_stat_num = pd.concat([df_num.describe().transpose(),
                                       pd.DataFrame(df_num.quantile(q=0.10)),
                                       pd.DataFrame(df_num.quantile(q=0.90)),
                                       pd.DataFrame(df_num.quantile(q=0.95))], axis=1)
            data_stat_num.columns = ['count', 'mean', 'std', 'min', '25%', '50%',
                                     '75%', 'max', '10%', '90%', '95%']
        except (ValueError, TypeError) as exc:
            # Stay best-effort, but report the failure instead of hiding it.
            warnings.warn("statistical_summary could not be computed: %s" % exc)
            return pd.DataFrame()

        return data_stat_num



# Run every summary on `df`. `df` is not defined in this snippet and must
# already exist. The returned frames are not assigned, so they are only
# displayed when this runs as the last expression of a notebook cell.
Info = Attribute_Information()
Info.Column_information(df)
Info.Agg_Tabulation(df)
Info.num_count_summary(df)
Info.statistical_summary(df)

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df)info.num_count_summary(df) info.statistical_summary(df)

DataFrame_Categorical_Imputer

class DataFrame_Categorical_Imputer():
    """Learn a most-frequent-value fill for every column of a DataFrame."""

    def __init__(self):
        """Announce that the imputer object exists."""
        print("Imputation object created")

    def fit(self, data):
        """Compute the most frequent value of each column.

        The result is stored in ``self.fill``, a Series indexed by column
        name that holds one scalar per column. A column with no non-null
        values gets NaN.

        Returns
        -------
        DataFrame_Categorical_Imputer
            ``self``, so calls can be chained.
        """
        fill_values = []
        for column in data:
            series = data[column]
            if series.dtype == np.dtype('O'):
                # value_counts() is ordered by descending frequency.
                candidates = series.value_counts().index
            else:
                # mode() returns a Series (possibly several tied values);
                # keep only its first entry so the fill is a scalar.
                candidates = series.mode()
            fill_values.append(candidates[0] if len(candidates) else np.nan)

        self.fill = pd.Series(fill_values, index=data.columns)
        return self

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df)info.num_count_summary(df)info.statistical_summary(df)

Imputing Numerical Variable

class DataFrame_numerical_Imputer():
    """Fill missing numeric values with scikit-learn's ``KNNImputer``."""

    def __init__(self):
        """Announce that the imputer object exists."""
        print("numerical_Imputer object created")

    def KNN_Imputer(self, df, n_neighbors=5):
        """Impute the missing values of ``df`` in place and return it.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to impute. It is overwritten cell by cell with the
            imputed values.
        n_neighbors : int, default 5
            Number of neighbours handed to ``KNNImputer``.

        Returns
        -------
        pandas.DataFrame
            The same ``df`` object, after imputation.
        """
        # Local import: KNNImputer is not imported at module level in this file.
        from sklearn.impute import KNNImputer

        knn_imputer = KNNImputer(n_neighbors=n_neighbors)
        # Assigning through iloc keeps the original index and column labels.
        df.iloc[:, :] = knn_imputer.fit_transform(df)
        return df

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df)info.num_count_summary(df)info.statistical_summary(df)

Model Selection using KFold CrossValidation

from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
# Separate the predictors from the target column 'Customer_Lifetime_Value'.
# `df` is not defined in this snippet and must already exist.
x = df.drop(['Customer_Lifetime_Value'],axis=1)
y = df['Customer_Lifetime_Value']
# Hold out 30% of the rows for testing; random_state=42 keeps the split
# repeatable. The classes in this file read these four names as globals.
x_train,x_test,y_train,y_test=train_test_split(x\
                ,y,test_size=0.30,random_state=42)

class Model_Selector():
    """Compare candidate regressors with k-fold cross-validation."""

    def __init__(self):
        """Announce that the selector object exists."""
        print("Model Selector object created")

    def Regression_Model_Selector(self, df):
        """Cross-validate five regressors and box-plot their R^2 scores.

        Each model is scored with 5-fold cross-validation on the
        module-level ``x_train`` / ``y_train``. For every model a tuple
        ``(name, mean score, score std)`` is printed, and the fold scores
        are then drawn as a box plot.

        Parameters
        ----------
        df : unused
            Kept for interface compatibility.
        """
        seed = 42
        scoring = 'r2'
        models = [
            ("LR", LinearRegression()),
            ("RF", RandomForestRegressor()),
            ("KNN", KNeighborsRegressor()),
            ("CART", DecisionTreeRegressor()),
            ("XGB", XGBRegressor()),
        ]

        # random_state only takes effect when shuffling is enabled; current
        # scikit-learn raises ValueError for a seed given with shuffle=False.
        kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

        result = []
        names = []
        for name, model in models:
            cv_results = cross_val_score(model, x_train, y_train,
                                         cv=kfold, scoring=scoring)
            result.append(cv_results)
            names.append(name)
            msg = (name, cv_results.mean(), cv_results.std())
            print(msg)

        fig = plt.figure(figsize=(8, 4))
        fig.suptitle('Algorithm Comparison')
        ax = fig.add_subplot(1, 1, 1)
        plt.boxplot(result)
        ax.set_xticklabels(names)
        plt.show()

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df)info.num_count_summary(df)info.statistical_summary(df)

Model Building

import statsmodels.api as sm

class Data_Modelling(object):
    """Fit and score the baseline regression models with shared hyper-parameters.

    Every model method trains on the module-level ``x_train`` / ``y_train``
    and evaluates on ``x_test`` / ``y_test``. The ``df`` / ``data`` argument
    each method accepts is kept for interface compatibility and is not read.
    """

    def __init__(self, n_estimators,
                 max_depth,
                 min_samples_split,
                 min_samples_leaf,
                 max_leaf_nodes,
                 min_impurity_split,
                 min_impurity_decrease,
                 bootstrap,
                 min_child_weight,
                 learning_rate,
                 Subsample,
                 Alpha,
                 Lamda,
                 random_state,
                 criterion):
        """Store the caller's hyper-parameters on the instance.

        Each argument is kept under an attribute of the same name, so the
        values passed in are the ones the model methods use.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_split = min_impurity_split
        self.min_impurity_decrease = min_impurity_decrease
        self.bootstrap = bootstrap
        self.min_child_weight = min_child_weight
        self.learning_rate = learning_rate
        self.Subsample = Subsample
        self.Alpha = Alpha
        self.Lamda = Lamda
        self.random_state = random_state
        # NOTE: scikit-learn 1.0 renamed the 'mse' criterion to
        # 'squared_error' and later releases reject the old spelling.
        self.criterion = criterion

        print("Data Modelling object created")

    def _fit_and_score(self, model):
        """Fit ``model`` on the training split; return ``(RMSE, R^2)`` on the test split."""
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
        return rmse, r2_score(y_test, predictions)

    def OLS_Summary(self, data):
        """Fit a statsmodels OLS of ``y_train`` on ``x_train``; return its summary."""
        model2 = sm.OLS(y_train, x_train).fit()
        return model2.summary()

    def Linear_Regression_Model(self, df):
        """Return ``(RMSE, R^2)`` of an ordinary linear regression."""
        return self._fit_and_score(LinearRegression())

    def Decision_Tree_Model(self, df):
        """Return ``(RMSE, R^2)`` of a decision tree built from the stored settings."""
        tree_params = dict(max_depth=self.max_depth,
                           min_samples_split=self.min_samples_split,
                           min_samples_leaf=self.min_samples_leaf,
                           max_leaf_nodes=self.max_leaf_nodes,
                           min_impurity_decrease=self.min_impurity_decrease,
                           random_state=self.random_state)
        # min_impurity_split was removed from scikit-learn; pass it only
        # when the installed estimator still accepts it.
        if 'min_impurity_split' in DecisionTreeRegressor().get_params():
            tree_params['min_impurity_split'] = self.min_impurity_split

        return self._fit_and_score(DecisionTreeRegressor(**tree_params))

    def Random_Forest_Model(self, df):
        """Return ``(RMSE, R^2)`` of a random forest built from the stored settings."""
        RF_Regressor = RandomForestRegressor(n_estimators=self.n_estimators,
                                             max_depth=self.max_depth,
                                             min_samples_split=self.min_samples_split,
                                             min_samples_leaf=self.min_samples_leaf,
                                             max_leaf_nodes=self.max_leaf_nodes,
                                             bootstrap=self.bootstrap,
                                             criterion=self.criterion)
        return self._fit_and_score(RF_Regressor)

    def Extreme_Gradient_Boosting_Model(self, df):
        """Return ``(RMSE, R^2)`` of an XGBoost regressor built from the stored settings."""
        # XGBoost spells these parameters subsample / reg_alpha / reg_lambda;
        # the capitalised names are not recognised, so the values were not applied.
        XGB_Regressor = XGBRegressor(n_estimators=self.n_estimators,
                                     learning_rate=self.learning_rate,
                                     max_depth=self.max_depth,
                                     min_child_weight=self.min_child_weight,
                                     random_state=self.random_state,
                                     subsample=self.Subsample,
                                     reg_alpha=self.Alpha,
                                     reg_lambda=self.Lamda)
        return self._fit_and_score(XGB_Regressor)

# base model
# The positional arguments follow the Data_Modelling.__init__ parameter order
# (n_estimators first, criterion last).
Basemodell = Data_Modelling(500,5,3,3,None,1,0.1,True,3,0.07,0.7,0,1.5,29,'mse')

# Each call below fits on the module-level x_train / y_train; the `df`
# argument is accepted by the methods but not read by them. Return values
# are not assigned.
Basemodell.OLS_Summary(df)
Basemodell.Linear_Regression_Model(df)
Basemodell.Decision_Tree_Model(df)
Basemodell.Random_Forest_Model(df)
Basemodell.Extreme_Gradient_Boosting_Model(df)

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df)info.num_count_summary(df)info.statistical_summary(df)

Feature Selection

from sklearn.feature_selection import RFE
from catboost import CatBoostRegressor

class Feature_Selection(Data_Modelling):
    """Rank features with recursive feature elimination (RFE).

    The hyper-parameter attributes set up by ``Data_Modelling.__init__``
    configure the random-forest estimator that drives the elimination.
    """

    def __init__(self, n_estimators,
                 max_depth,
                 min_samples_split,
                 min_samples_leaf,
                 max_leaf_nodes,
                 min_impurity_split,
                 min_impurity_decrease,
                 bootstrap,
                 min_child_weight,
                 learning_rate,
                 Subsample,
                 Alpha,
                 Lamda,
                 random_state,
                 criterion):
        """Forward every hyper-parameter, unchanged, to ``Data_Modelling``."""
        super().__init__(n_estimators,
                         max_depth,
                         min_samples_split,
                         min_samples_leaf,
                         max_leaf_nodes,
                         min_impurity_split,
                         min_impurity_decrease,
                         bootstrap,
                         min_child_weight,
                         learning_rate,
                         Subsample,
                         Alpha,
                         Lamda,
                         random_state,
                         criterion)
        print("Feature Selection object created")

    def Regression_Feature_Selector(self, data, n_features_to_select=6, step=1):
        """Return the RFE ranking of every training column.

        Parameters
        ----------
        data : unused
            Kept for interface compatibility; the selector is fitted on the
            module-level ``x_train`` / ``y_train``.
        n_features_to_select : int, default 6
            Number of features RFE keeps (these receive rank 1).
        step : int, default 1
            Number of features removed per elimination round.

        Returns
        -------
        pandas.DataFrame
            Columns ``'Importance'`` (the RFE rank, 1 = selected) and
            ``'Columns'`` (the feature name), sorted by ascending rank.
        """
        estimator = RandomForestRegressor(n_estimators=self.n_estimators,
                                          max_depth=self.max_depth,
                                          min_samples_split=self.min_samples_split,
                                          min_samples_leaf=self.min_samples_leaf,
                                          max_leaf_nodes=self.max_leaf_nodes,
                                          bootstrap=self.bootstrap,
                                          criterion=self.criterion)

        # Pass the feature count by keyword: recent scikit-learn releases make
        # every RFE argument after the estimator keyword-only, so the old
        # positional form raises TypeError there.
        selector = RFE(estimator,
                       n_features_to_select=n_features_to_select,
                       step=step)
        selector = selector.fit(x_train, y_train)

        Var = pd.DataFrame({'Importance': selector.ranking_,
                            'Columns': list(x_train.columns)})
        Var.sort_values(["Importance"], axis=0,
                        ascending=True, inplace=True)
        return Var


# Build the feature selector. The positional arguments follow the
# Feature_Selection.__init__ parameter order: n_estimators, max_depth,
# min_samples_split, min_samples_leaf, max_leaf_nodes, min_impurity_split,
# min_impurity_decrease, bootstrap, min_child_weight, learning_rate,
# Subsample, Alpha, Lamda, random_state, criterion.
FS = Feature_Selection(500,5,3,3,None,1,0.1,True,3,0.07,0.7,0,1.5,29,'mse')

##partialdependencyplot #info.column_information(df)info.agg_tabulation(df)info.num_count_summary(df)info.statistical_summary(df)

Hyper Parameter Tuning

from sklearn.model_selection import RandomizedSearchCV
class Model_Regression_HyperParameter_Tuning():
    """Namespace for the randomized-search tuners of the regression models.

    The XGBoost tuner is nested in this class and the random-forest tuner
    is nested inside the XGBoost tuner; that nesting is part of the
    interface and is kept as is.
    """

    def __init__(self):
        """Announce that the tuning object exists."""
        print("HyperParameter_Tuning object created")

    class XGB_Regressor_HyperParameter_Tuning():
        """Randomized hyper-parameter search for ``XGBRegressor``."""

        def __init__(self):
            """Announce that the XGB tuner exists."""
            print("XGB HyperParameter_Tuning object created")

        def Fit_XGB_HyperParameter_Tuner(self, dftrain):
            """Run the randomized search and return the fitted search object.

            The search is fitted on the module-level ``x_train`` /
            ``y_train``; ``dftrain`` is kept for interface compatibility
            and is not read.
            """
            xgb_Reg = XGBRegressor(tree_method="exact", predictor="cpu_predictor",
                                   objective="reg:squarederror")

            parameters = {"learning_rate": [0.1, 0.01, 0.001],
                          "gamma": [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
                          "max_depth": [2, 4, 7, 10],
                          "colsample_bytree": [0.3, 0.6, 0.8, 1.0],
                          "subsample": [0.2, 0.4, 0.5, 0.6, 0.7],
                          "reg_alpha": [0, 0.5, 1],
                          "reg_lambda": [1, 1.5, 2, 3, 4.5],
                          "min_child_weight": [1, 3, 5, 7],
                          "n_estimators": [100, 250, 500, 1000]}

            xgb_rscv = RandomizedSearchCV(xgb_Reg, param_distributions=parameters,
                                          scoring="r2", cv=3, random_state=29)

            # Fit the model
            model_xgboost = xgb_rscv.fit(x_train, y_train)
            return model_xgboost

        def XGB_Get_Best_Prams(self, model=None):
            """Print the tuned hyper-parameters of the best XGBoost estimator.

            Parameters
            ----------
            model : fitted search object, optional
                Anything exposing ``best_estimator_.get_params()``. When
                omitted, the module-level ``Xgb_model`` is used.
            """
            if model is None:
                model = Xgb_model
            best = model.best_estimator_.get_params()

            for label, key in (
                    ("Learning Rate: ", "learning_rate"),
                    ("Gamma: ", "gamma"),
                    ("Max Depth: ", "max_depth"),
                    ("Subsample: ", "subsample"),
                    ("Max Features at Split: ", "colsample_bytree"),
                    ("Alpha: ", "reg_alpha"),
                    ("Lamda: ", "reg_lambda"),
                    ("Minimum Sum of the Instance Weight Hessian to Make a Child: ",
                     "min_child_weight"),
                    ("Number of Trees: ", "n_estimators")):
                print(label, best[key])

        def get_Regressor_result(self, modelname, y_test):
            """Return ``(RMSE, R^2)`` of ``modelname`` on the module-level ``x_test``.

            ``y_test`` holds the true targets matching ``x_test``.
            """
            predictions = modelname.predict(x_test)
            rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
            r2 = r2_score(y_test, predictions)
            return rmse, r2

        class RF_Regressor_HyperParameter_Tuning():
            """Randomized hyper-parameter search for ``RandomForestRegressor``."""

            def __init__(self):
                """Announce that the RF tuner exists."""
                print("RF HyperParameter_Tuning object created")

            def Fit_RF_HyperParameter_Tuner(self, dftrain):
                """Run the randomized search and return the fitted search object.

                The search is fitted on the module-level ``x_train`` /
                ``y_train``; ``dftrain`` is kept for interface
                compatibility and is not read.
                """
                # min_samples_split must be at least 2 when given as an
                # integer, so the smallest candidate is 2 rather than 1.
                # NOTE: scikit-learn 1.0 renamed the "mse" / "mae" criteria to
                # "squared_error" / "absolute_error"; adjust the list below
                # to match the installed version.
                param_grid = {"max_depth": [1, 3, 5, 7, 9, 10],
                              "min_samples_split": [2, 3, 10, 15, 20],
                              "min_samples_leaf": [1, 3, 5, 10],
                              "bootstrap": [True, False],
                              "criterion": ["mse", "mae"],
                              "n_estimators": [100, 250, 500, 1000]}

                Reg = RandomForestRegressor(random_state=29, n_jobs=-1)
                # Seed the sampler, as the XGB search does, so repeated runs
                # try the same candidates.
                model = RandomizedSearchCV(Reg, param_grid, scoring='r2',
                                           cv=3, random_state=29)

                model.fit(x_train, y_train)

                return model

            def RF_Get_Best_Prams(self, model=None):
                """Print the tuned hyper-parameters of the best random forest.

                Parameters
                ----------
                model : fitted search object, optional
                    Anything exposing ``best_estimator_.get_params()``.
                    When omitted, the module-level ``RF_model`` is used.
                """
                if model is None:
                    model = RF_model
                best = model.best_estimator_.get_params()

                for label, key in (
                        ("n_estimators: ", "n_estimators"),
                        ("Max Depth: ", "max_depth"),
                        ("min_samples_split: ", "min_samples_split"),
                        ("min_samples_leaf: ", "min_samples_leaf"),
                        ("max_leaf_nodes: ", "max_leaf_nodes"),
                        ("bootstrap: ", "bootstrap"),
                        ("criterion: ", "criterion")):
                    print(label, best[key])

            def Evaluation_Result(self, modelname, y_test):
                """Return ``(RMSE, R^2)`` of ``modelname`` on the module-level ``x_test``.

                ``y_test`` holds the true targets matching ``x_test``.
                """
                predictions = modelname.predict(x_test)
                rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
                r2 = r2_score(y_test, predictions)
                return rmse, r2



# Build one tuner object per model family. The random-forest tuner class is
# nested inside the XGB tuner class, hence the chained attribute access.
HP_XGB = Model_Regression_HyperParameter_Tuning().XGB_Regressor_HyperParameter_Tuning()
# The trailing () matters: without it HP_RF is the class itself, and every
# HP_RF.<method>(...) call below is missing its `self` argument.
HP_RF = Model_Regression_HyperParameter_Tuning().XGB_Regressor_HyperParameter_Tuning().RF_Regressor_HyperParameter_Tuning()

# XGBoost: search, report the best parameters, score on the test split.
Xgb_model = HP_XGB.Fit_XGB_HyperParameter_Tuner(df)


HP_XGB.XGB_Get_Best_Prams()


HP_XGB.get_Regressor_result(Xgb_model,y_test)

# Random forest: same three steps.
RF_model = HP_RF.Fit_RF_HyperParameter_Tuner(df)

HP_RF.RF_Get_Best_Prams()

HP_RF.Evaluation_Result(RF_model,y_test)

Feature Selection

Save snippets that work from anywhere online with our extensions

Comments

More like this

Data Science

Browse more snippets >>

Feature Selection

Save snippets that work from anywhere online with our extensions

Comments

More like this

Data Science

Browse more snippets >>

Embed code snippet