%load_ext autoreload
%autoreload 2

# Import modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load df
df = pd.read_csv('filepath.csv')
df

# Exploratory Data Analysis with SweetViz
import sweetviz as sv

analyze_report = sv.analyze(df)
analyze_report.show_html('report.html', open_browser=False)

# Generate a correlation heat map; this could also be re-run after standardizing the variables.
%config InlineBackend.figure_format = 'retina'
plt.figure(figsize=(10, 10))
sns.heatmap(
    df.corr(numeric_only=True),
    cmap='crest',
    annot=True,
    annot_kws={"size": 10}
)

# Univariate regression
# Use statsmodels via statsmodels.formula.api.
# model1: an OLS regression of the observed dependent variable on a linear
# function of the independent variable.
# y = dep_variable, x = indep_variable
import statsmodels.formula.api as smf

model1 = smf.ols('dep_variable ~ indep_variable', data=df).fit()
print(model1.summary())

# Multivariate linear regression
# Standardize the features (z-scores) so the partial regression coefficients
# can be compared with one another.

# Variables to compare
features = [
    "wait_time",
    "delay_vs_expected",
    "number_of_products",
    "number_of_sellers",
    "price",
    "freight_value",
    "distance_seller_customer",
]

# Standardize the independent variables.
df_standardized = df.copy()
for f in features:
    mu = df[f].mean()
    sigma = df[f].std()
    df_standardized[f] = (df[f] - mu) / sigma
df_standardized

# Regression formula
formula = "dep_variable ~ " + ' + '.join(features)
formula

# Perform the regression.
# Review the output to check which coefficients are statistically significant.
model2 = smf.ols(formula=formula, data=df_standardized).fit()
print(model2.summary())

# Visualize the most important features with a bar plot (intercept excluded).
model2.params.iloc[1:].sort_values().plot(kind='barh')

# Check the model's performance.
# Regression performance should not be measured only by its R-squared!
# Always visualize the distribution of the predictions, and especially the residuals.
predicted_dep_variable = model2.predict(df_standardized[features])
residuals = df_standardized.dep_variable - predicted_dep_variable  # actual minus predicted

# Plot the residuals.
sns.histplot(residuals, kde=True, stat='density')
plt.title('Residuals Density Plot')

# Compute the associated RMSE.
# RMSE is on the same scale as the dependent variable, so judge it relative to
# that scale (e.g., against the spread of dep_variable) rather than against
# fixed thresholds. An adjusted R-squared above ~0.75 is often read as a strong
# fit, but this too is context-dependent.
rmse = (residuals**2).mean()**0.5
rmse

# Plot, on the same graph, both the distribution (KDE) of the actual dep_variable
# and the distribution of the predicted dep_variable.
sns.kdeplot(predicted_dep_variable, label='Predicted dep_variable distribution')
sns.kdeplot(df.dep_variable, label='Actual dep_variable distribution')
plt.title('Distributions of Predicted and Actual Dependent Variable')
plt.legend()

# Using seaborn, plot a regression line of dep_variable vs. an independent variable,
# once for a random sample and once for the whole population.
sample = df.sample(1000, random_state=42)

plt.figure(figsize=(13, 5))
plt.suptitle('Regression of dep_variable, 95% Confidence Interval')

plt.subplot(1, 2, 1)
sns.regplot(x=sample.indep_variable, y=sample.dep_variable, y_jitter=.1, ci=95)
plt.xlim(right=70)
plt.ylim(bottom=0)

plt.subplot(1, 2, 2)
sns.regplot(x=df.indep_variable, y=df.dep_variable, y_jitter=.1, ci=95)
plt.xlim(right=70)
plt.ylim(bottom=0)
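
# Optional extra check on the multivariate model: variance inflation factors (VIFs).
# A minimal sketch assuming `df_standardized` and `features` from the cells above;
# a VIF well above ~10 is a common warning sign that a coefficient is inflated by
# multicollinearity, which the correlation heat map can only hint at.
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

X = add_constant(df_standardized[features])
vifs = pd.Series(
    [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    index=X.columns,
)
vifs.drop('const')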
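
# Optional residual diagnostic: a Q-Q plot complements the density plot above
# when judging how close the residuals are to normal. A minimal sketch assuming
# `residuals` from the performance-check cell above.
import statsmodels.api as sm

sm.qqplot(residuals, line='s')
plt.title('Residuals Q-Q Plot')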
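
# To make the RMSE concrete: an intercept-only baseline predicts the mean of
# dep_variable, so its RMSE is (up to a degrees-of-freedom factor) the standard
# deviation of dep_variable; a useful model should beat it. A minimal sketch
# assuming `rmse` and `df` from the cells above.
baseline_rmse = ((df.dep_variable - df.dep_variable.mean())**2).mean()**0.5
print(f"model RMSE: {rmse:.3f}  vs  mean-only baseline RMSE: {baseline_rmse:.3f}")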
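
# Optional generalization check: in-sample R-squared can flatter the model, so a
# simple holdout split is worth a look. A minimal sketch that re-fits model2's
# formula on a random 80% of `df_standardized` and scores the held-out 20%.
train = df_standardized.sample(frac=0.8, random_state=42)
test = df_standardized.drop(train.index)
model_holdout = smf.ols(formula=formula, data=train).fit()
test_pred = model_holdout.predict(test[features])
ss_res = ((test.dep_variable - test_pred) ** 2).sum()
ss_tot = ((test.dep_variable - test.dep_variable.mean()) ** 2).sum()
holdout_r2 = 1 - ss_res / ss_tot
holdout_r2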