Linear Regression - Standardizing a DataFrame for multivariate regression

PHOTO EMBED

Sat Nov 19 2022 19:39:54 GMT+0000 (Coordinated Universal Time)

Saved by @janduplessis883

# Predictor columns that are standardized and then used in the regression.
features = [
    "wait_time",
    "delay_vs_expected",
    "number_of_products",
    "number_of_sellers",
    "price",
    "freight_value",
    "distance_seller_customer",
]

# Z-score every feature: subtract the column mean and divide by the column
# standard deviation, so the fitted coefficients share a common scale.
# The arithmetic is done column-wise in one vectorized expression instead of
# a per-element `map(lambda ...)`; pandas aligns the mean/std Series with the
# DataFrame on the column labels. `orders` itself is left unmodified.
orders_standardized = orders.copy()
feature_means = orders[features].mean()
feature_stds = orders[features].std()
orders_standardized[features] = (orders[features] - feature_means) / feature_stds
orders_standardized

# Build the formula string: review_score explained by every
# standardized feature, joined with " + ".
formula = f"review_score ~ {' + '.join(features)}"
formula

import statsmodels.formula.api as smf

# Fit ordinary least squares on the standardized data and print the report.
ols_spec = smf.ols(formula=formula, data=orders_standardized)
model4 = ols_spec.fit()
print(model4.summary())

# Horizontal bar chart of the fitted coefficients, sorted ascending.
# The first entry of `params` is skipped (presumably the intercept, which the
# formula API lists first - confirm against the printed summary).
feature_coefficients = model4.params.iloc[1:]
feature_coefficients.sort_values().plot(kind='barh')

# Check model performance.
# Regression performance should not be measured only by its R-squared:
# always visualize the distribution of the predictions, and especially of the
# residuals.
# Compute the residuals. Their mean should be equal to 0 (always verified in
# a linear regression context).
# NOTE(review): residuals are taken here as predicted - actual; the RMSE below
# does not depend on the sign.

predicted_review_score = model4.predict(orders_standardized[features])
actual_review_score = orders_standardized["review_score"]
residuals = predicted_review_score - actual_review_score

# Root mean squared error: square root of the mean squared residual.
mean_squared_error = residuals.pow(2).mean()
rmse = mean_squared_error ** 0.5
rmse

# Histogram of the residuals, normalized to a density, with a KDE curve
# overlaid on top of the bars.
sns.histplot(
    data=residuals,
    stat='density',
    discrete=True,
    kde=True,
)
plt.title('residuals density plot')

# Can you guess why the residual distribution plot has such a strange shape?
# Overlaying the density of the predictions on the density of the actual
# scores helps answer that (presumably the actual review_score takes only a
# few distinct values while the predictions are continuous - confirm on data).
sns.kdeplot(predicted_review_score, label='predicted review score')
sns.kdeplot(orders.review_score, label='actual review score')
plt.title('distribution of predicted and actual values')
# The `label=` arguments above are only displayed once a legend is drawn;
# without this call the two curves cannot be told apart.
plt.legend()

# Using `seaborn`, we can directly plot a regression line of `review_score`
# against `wait_time` (left panel) and against `delay_vs_expected` (right
# panel), each with a 95% confidence band.
#
# Both panels are drawn from the same reproducible random sample so they are
# comparable and the confidence-band computation stays fast. The sample size
# is capped at the number of rows, because `DataFrame.sample` raises when
# asked for more rows than exist (without replacement).
sample = orders.sample(min(10000, len(orders)), random_state=42)
plt.figure(figsize=(13, 5))
plt.suptitle('Regression of review_score, 95% confidence interval')

for panel_index, column in enumerate(["wait_time", "delay_vs_expected"], start=1):
    plt.subplot(1, 2, panel_index)
    # y_jitter spreads the review scores vertically so overlapping points
    # remain visible.
    sns.regplot(x=sample[column], y=sample.review_score, y_jitter=.1, ci=95)
    # Clip the axes: x stops at 70, y starts at 0.
    plt.xlim(right=70)
    plt.ylim(bottom=0)
content_copyCOPY

https://github.com/janduplessis883/data-orders-regression/blob/master/orders_regression.ipynb