Snippets Collections
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
import numpy as np


class Model_Regression_HyperParameter_Tuning():

    def __init__(self):
        print("HyperParameter_Tuning object created")
        
    class XGB_Regressor_HyperParameter_Tuning():

        def __init__(self):
            print("XGB HyperParameter_Tuning object created")


        def Fit_XGB_HyperParameter_Tuner(self, dftrain):

            # Base estimator for the search; the deprecated predictor
            # argument (removed in xgboost 2.0) is no longer passed
            xgb_Reg = XGBRegressor(tree_method="exact",
                                   objective="reg:squarederror")


            parameters = {"learning_rate": [0.1, 0.01, 0.001],
                           "gamma" : [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
                           "max_depth": [2, 4, 7, 10],
                           "colsample_bytree": [0.3, 0.6, 0.8, 1.0],
                           "subsample": [0.2, 0.4, 0.5, 0.6, 0.7],
                           "reg_alpha": [0, 0.5, 1],
                           "reg_lambda": [1, 1.5, 2, 3, 4.5],
                           "min_child_weight": [1, 3, 5, 7],
                           "n_estimators": [100, 250, 500, 1000]}


            xgb_rscv = RandomizedSearchCV(xgb_Reg, param_distributions=parameters,
                                          scoring="r2", cv=3, random_state=29)

            # Fit the search on the (global) training split
            model_xgboost = xgb_rscv.fit(x_train, y_train)
            return model_xgboost
        
        
        def XGB_Get_Best_Params(self):

            # Reads from the global Xgb_model fitted below
            best = Xgb_model.best_estimator_.get_params()
            print("Learning Rate: ", best["learning_rate"])
            print("Gamma: ", best["gamma"])
            print("Max Depth: ", best["max_depth"])
            print("Subsample: ", best["subsample"])
            print("Max Features at Split: ", best["colsample_bytree"])
            print("Alpha: ", best["reg_alpha"])
            print("Lambda: ", best["reg_lambda"])
            print("Minimum Sum of the Instance Weight Hessian to Make a Child: ", best["min_child_weight"])
            print("Number of Trees: ", best["n_estimators"])


        
        def get_Regressor_result(self, modelname, y_test):

            pred = modelname.predict(x_test)

            RMSE = np.sqrt(metrics.mean_squared_error(y_test, pred))

            r2 = r2_score(y_test, pred)

            return RMSE, r2

        
        class RF_Regressor_HyperParameter_Tuning():

            def __init__(self):
                print("RF HyperParameter_Tuning object created")


            def Fit_RF_HyperParameter_Tuner(self, dftrain):

                # min_samples_split must be >= 2; criterion names follow scikit-learn >= 1.0
                param_grid = {"max_depth": [1, 3, 5, 7, 9, 10],
                              "min_samples_split": [2, 3, 10, 15, 20],
                              "min_samples_leaf": [1, 3, 5, 10],
                              "bootstrap": [True, False],
                              "criterion": ["squared_error", "absolute_error"],
                              "n_estimators": [100, 250, 500, 1000]}

                Reg = RandomForestRegressor(random_state=29, n_jobs=-1)
                model = RandomizedSearchCV(Reg, param_grid, scoring='r2', cv=3)

                model.fit(x_train, y_train)

                return model

            def RF_Get_Best_Params(self):

                # Reads from the global RF_model fitted below
                best = RF_model.best_estimator_.get_params()
                print("n_estimators: ", best["n_estimators"])
                print("Max Depth: ", best["max_depth"])
                print("min_samples_split: ", best["min_samples_split"])
                print("min_samples_leaf: ", best["min_samples_leaf"])
                print("max_leaf_nodes: ", best["max_leaf_nodes"])
                print("bootstrap: ", best["bootstrap"])
                print("criterion: ", best["criterion"])

            def Evaluation_Result(self, modelname, y_test):

                pred = modelname.predict(x_test)

                RMSE = np.sqrt(metrics.mean_squared_error(y_test, pred))

                r2 = r2_score(y_test, pred)

                return RMSE, r2



HP_XGB = Model_Regression_HyperParameter_Tuning().XGB_Regressor_HyperParameter_Tuning()
HP_RF = Model_Regression_HyperParameter_Tuning().XGB_Regressor_HyperParameter_Tuning().RF_Regressor_HyperParameter_Tuning()

Xgb_model = HP_XGB.Fit_XGB_HyperParameter_Tuner(df)


HP_XGB.XGB_Get_Best_Params()


HP_XGB.get_Regressor_result(Xgb_model,y_test)

RF_model = HP_RF.Fit_RF_HyperParameter_Tuner(df)

HP_RF.RF_Get_Best_Params()

HP_RF.Evaluation_Result(RF_model,y_test)
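
Beyond the print-based getters above, the fitted RandomizedSearchCV objects expose the winning configuration directly; a minimal sketch using the Xgb_model fitted above:

print(Xgb_model.best_params_)   # dict of the sampled hyperparameters that won
print(Xgb_model.best_score_)    # mean cross-validated r2 of that candidate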
from sklearn.feature_selection import RFE

class Feature_Selection(Data_Modelling):

    def __init__(self,n_estimators,
                    max_depth,
                    min_samples_split,
                    min_samples_leaf,
                    max_leaf_nodes,
                    min_impurity_split,
                    min_impurity_decrease,
                    bootstrap,
                    min_child_weight,
                    learning_rate,
                    Subsample,
                    Alpha,
                    Lamda,
                    random_state,
                    criterion):
        
        Data_Modelling.__init__(self,n_estimators,
                    max_depth,
                    min_samples_split,
                    min_samples_leaf,
                    max_leaf_nodes,
                    min_impurity_split,
                    min_impurity_decrease,
                    bootstrap,
                    min_child_weight,
                    learning_rate,
                    Subsample,
                    Alpha,
                    Lamda,
                    random_state,
                    criterion)
        print("Feature Selection object created")
        
    def Regression_Feature_Selector(self,data):
        estimator = RandomForestRegressor(n_estimators = self.n_estimators,
                    max_depth = self.max_depth,
                    min_samples_split = self.min_samples_split,
                    min_samples_leaf = self.min_samples_leaf,
                    max_leaf_nodes = self.max_leaf_nodes,
                    bootstrap = self.bootstrap,
                    criterion = self.criterion)
        
        # n_features_to_select must be passed by keyword in recent scikit-learn
        selector = RFE(estimator, n_features_to_select=6, step=1)
        selector = selector.fit(x_train,y_train)
        rank = pd.DataFrame(selector.ranking_, columns=['Importance'])
        Columns = pd.DataFrame(x_train.columns, columns=['Columns'])
        Var = pd.concat([rank, Columns], axis=1)
        Var.sort_values(["Importance"], axis=0, ascending=True, inplace=True)
        return Var


FS = Feature_Selection(500, 5, 3, 3, None, 1, 0.1, True, 3, 0.07, 0.7, 0, 1.5, 29, 'squared_error')
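
A quick usage sketch for the selector, assuming the df and x_train/y_train split defined in the modelling snippet below; in RFE's ranking_, the selected features are ranked 1:

ranked = FS.Regression_Feature_Selector(df)
print(ranked.head(6))   # the six rows with Importance == 1 are the selected features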
import statsmodels.api as sm
import numpy as np
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

class Data_Modelling(object):
    

    def __init__(self,n_estimators,
                    max_depth,
                    min_samples_split,
                    min_samples_leaf,
                    max_leaf_nodes,
                    min_impurity_split,
                    min_impurity_decrease,
                    bootstrap,
                    min_child_weight,
                    learning_rate,
                    Subsample,
                    Alpha,
                    Lamda,
                    random_state,
                    criterion):
        
        # Store the supplied hyperparameters on the instance
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_split = min_impurity_split
        self.min_impurity_decrease = min_impurity_decrease
        self.bootstrap = bootstrap
        self.min_child_weight = min_child_weight
        self.learning_rate = learning_rate
        self.Subsample = Subsample
        self.Alpha = Alpha
        self.Lamda = Lamda
        self.random_state = random_state
        self.criterion = criterion

        print("Data Modelling object created")
        
        
    def OLS_Summary(self,data):

        # add_constant supplies the intercept term for the OLS fit
        model2 = sm.OLS(y_train, sm.add_constant(x_train)).fit()

        return model2.summary()
        
        
    def Linear_Regression_Model(self,df):
        
        regressor = LinearRegression()
        
        reg=regressor.fit(x_train,y_train)
        
        LR_pred=regressor.predict(x_test)
        
        LR_RMSE = np.sqrt(metrics.mean_squared_error(y_test,LR_pred))
        
        LR_r2_score = r2_score(y_test,LR_pred)
        
        return LR_RMSE,LR_r2_score
        
        
    def Decision_Tree_Model(self,df):
        
        # min_impurity_split was removed in scikit-learn 1.0 and is not passed
        DT_Regressor = DecisionTreeRegressor(max_depth = self.max_depth,
                    min_samples_split = self.min_samples_split,
                    min_samples_leaf = self.min_samples_leaf,
                    max_leaf_nodes = self.max_leaf_nodes,
                    min_impurity_decrease = self.min_impurity_decrease,
                    random_state = self.random_state)
        
        DT_Regressor.fit(x_train,y_train)
        
        DT_pred=DT_Regressor.predict(x_test)
        
        DT_RMSE = np.sqrt(metrics.mean_squared_error(y_test,DT_pred))
        
        DT_r2_score = r2_score(y_test,DT_pred)
        
        return DT_RMSE,DT_r2_score
        
        
    def Random_Forest_Model(self,df):
        
        RF_Regressor = RandomForestRegressor(n_estimators = self.n_estimators,
                    max_depth = self.max_depth,
                    min_samples_split = self.min_samples_split,
                    min_samples_leaf = self.min_samples_leaf,
                    max_leaf_nodes = self.max_leaf_nodes,
                    bootstrap = self.bootstrap,
                    criterion = self.criterion)
        
        RF_Regressor.fit(x_train,y_train)
        
        RF_pred=RF_Regressor.predict(x_test)
        
        RF_RMSE = np.sqrt(metrics.mean_squared_error(y_test,RF_pred))
        
        RF_r2_score = r2_score(y_test,RF_pred)
        
        return RF_RMSE,RF_r2_score
    
    
    def Extreme_Gradient_Boosting_Model(self,df):
        
        XGB_Regressor = XGBRegressor(n_estimators = self.n_estimators,
                    learning_rate = self.learning_rate,
                    max_depth = self.max_depth,
                    min_child_weight = self.min_child_weight,
                    random_state = self.random_state,
                    subsample = self.Subsample,    # XGBRegressor keywords are lowercase:
                    reg_alpha = self.Alpha,        # subsample, reg_alpha and reg_lambda
                    reg_lambda = self.Lamda)
        
        XGB_Regressor.fit(x_train,y_train)
        
        XGB_pred=XGB_Regressor.predict(x_test)
        
        XGB_RMSE = np.sqrt(metrics.mean_squared_error(y_test,XGB_pred))
        
        XGB_r2_score = r2_score(y_test,XGB_pred)
        
        return XGB_RMSE,XGB_r2_score

# base model
Base_model = Data_Modelling(500, 5, 3, 3, None, 1, 0.1, True, 3, 0.07, 0.7, 0, 1.5, 29, 'squared_error')

Base_model.OLS_Summary(df)
Base_model.Linear_Regression_Model(df)
Base_model.Decision_Tree_Model(df)
Base_model.Random_Forest_Model(df)
Base_model.Extreme_Gradient_Boosting_Model(df)
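
Since each model method returns an (RMSE, r2) pair, the baselines can be tabulated side by side; a minimal sketch, assuming pandas is available as pd:

results = {"LR": Base_model.Linear_Regression_Model(df),
           "DT": Base_model.Decision_Tree_Model(df),
           "RF": Base_model.Random_Forest_Model(df),
           "XGB": Base_model.Extreme_Gradient_Boosting_Model(df)}
print(pd.DataFrame(results, index=["RMSE", "r2"]).T)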
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor

x = df.drop(['Customer_Lifetime_Value'], axis=1)
y = df['Customer_Lifetime_Value']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

class Model_Selector():

    def __init__(self):
        print("Model Selector object created")

    def Regression_Model_Selector(self, df):
        """
        Cross-validates several candidate regressors
        and reports the mean and standard deviation
        of their r2 scores, to help select the best
        model of the relationship between the input
        and dependent variables.
        """
        seed = 42
        models = []
        models.append(("LR", LinearRegression()))
        models.append(("RF", RandomForestRegressor()))
        models.append(("KNN", KNeighborsRegressor()))
        models.append(("CART", DecisionTreeRegressor()))
        models.append(("XGB", XGBRegressor()))
        result = []
        names = []
        scoring = 'r2'

        for name, model in models:
            # random_state requires shuffle=True in recent scikit-learn
            kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
            cv_results = cross_val_score(model, x_train, y_train,
                                         cv=kfold, scoring=scoring)
            result.append(cv_results)
            names.append(name)
            print((name, cv_results.mean(), cv_results.std()))
            
            
            
        fig = plt.figure(figsize = (8,4))
        fig.suptitle('Algorithm Comparison')
        ax = fig.add_subplot(1,1,1)
        plt.boxplot(result)
        ax.set_xticklabels(names)
        plt.show()
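
A quick usage sketch, assuming the df and train/test split defined above:

MS = Model_Selector()
MS.Regression_Model_Selector(df)   # prints (name, mean r2, std) per model and shows the boxplot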
from sklearn.impute import KNNImputer

class DataFrame_numerical_Imputer():

    def __init__(self):
        print("numerical_Imputer object created")

    def KNN_Imputer(self,df):
        """
        Imputes missing values in the
        dataset. KNN imputation tends
        to perform well relative to
        simpler methods, which is why
        it is used here.
        """
        knn_imputer = KNNImputer(n_neighbors=5)
        df.iloc[:, :] = knn_imputer.fit_transform(df)
        return df
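
A minimal usage sketch, assuming df holds only numeric columns (KNNImputer does not accept strings):

imputer = DataFrame_numerical_Imputer()
df = imputer.KNN_Imputer(df)   # each NaN is replaced using the 5 nearest rows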
