import statsmodels.formula.api as smf

# Multivariate OLS of `review_score` on standardized order-level features,
# followed by diagnostic plots of the fit.
# Relies on `orders` (DataFrame), `sns` (seaborn) and `plt` (matplotlib.pyplot)
# being defined earlier in the file.

# Columns of `orders` used as regressors.
features = [
    "wait_time",
    "delay_vs_expected",
    "number_of_products",
    "number_of_sellers",
    "price",
    "freight_value",
    "distance_seller_customer",
]

# Standardize each feature (z-score) so the fitted coefficients are directly
# comparable. Vectorized arithmetic replaces a per-element `map`.
orders_standardized = orders.copy()
for f in features:
    mu = orders[f].mean()
    sigma = orders[f].std()
    orders_standardized[f] = (orders[f] - mu) / sigma
orders_standardized  # bare expression: only displays when run as a notebook cell

formula = "review_score ~ " + " + ".join(features)
formula  # bare expression: only displays when run as a notebook cell

model4 = smf.ols(formula=formula, data=orders_standardized).fit()
print(model4.summary())

# Plot the fitted coefficients, skipping the first entry (the intercept for a
# formula-built model). A dedicated figure keeps this chart from sharing axes
# with any other plot.
plt.figure()
model4.params.iloc[1:].sort_values().plot(kind="barh")

# Check model performance
# ⚠️ Regression performance should not be measured only by its R-squared!
# 👀 Always visualize the distribution of the predictions, and especially the
# residuals.

# ❓ Compute the residuals.
# You should see that the mean of the residuals is equal to 0 (always verified
# in a linear regression context).
predicted_review_score = model4.predict(orders_standardized[features])
# NOTE: computed as predicted - actual, i.e. the opposite sign of the usual
# (actual - predicted) convention. The RMSE below is unaffected by the sign.
residuals = predicted_review_score - orders_standardized.review_score
rmse = (residuals**2).mean() ** 0.5
rmse  # bare expression: only displays when run as a notebook cell

# 📊 Plot the `residuals` in a histogram.
plt.figure()
sns.histplot(residuals, kde=True, stat="density", discrete=True)
plt.title("residuals density plot")

# ❓ Can you guess why the residual distplot has such a strange shape?
# Both densities are drawn on the same axes on purpose, for comparison.
plt.figure()
sns.kdeplot(predicted_review_score, label="predicted review score")
sns.kdeplot(orders.review_score, label="actual review score")
plt.title("distribution of predicted and actual values")

# 📈 Actually, using `seaborn`, we could have already plotted a `regression
# line` of the `review_score` vs. the `delay_vs_expected`.
# Work on a fixed-seed sample of at most 10,000 rows; the cap avoids a
# ValueError when `orders` is smaller than that.
sample = orders.sample(min(10000, len(orders)), random_state=42)
plt.figure(figsize=(13, 5))
plt.suptitle("Regression of review_score, 95% confidence interval")

plt.subplot(1, 2, 1)
sns.regplot(x=sample.wait_time, y=sample.review_score, y_jitter=0.1, ci=95)
plt.xlim(right=70)
plt.ylim(bottom=0)

# Use the same sample as the left panel so both regressions are drawn from
# identical rows.
plt.subplot(1, 2, 2)
sns.regplot(x=sample.delay_vs_expected, y=sample.review_score, y_jitter=0.1, ci=95)
plt.xlim(right=70)
plt.ylim(bottom=0)