# Log-odds is the logit intercept taken from the fitted logistic model.
log_odd = -0.473288
# BUG FIX: the original f-string printed a hard-coded literal instead of
# interpolating the variable; now the printout always matches `log_odd`.
print(f"log_odd = {log_odd}")

# Convert log-odds -> odds by exponentiating ("de-logarithmize" the logit).
odds = np.exp(log_odd)
print(f'odds = {odds}  Round:{round(odds, 2)}')

# Convert odds -> probability: divide the odds by one plus the odds.
prob = odds / (1 + odds)
print(f'prob = {prob} Round:{round(prob, 2)}')
# Set up a grid of probabilities from 1% up to (but excluding) 100%.
p = np.arange(0.01, 1, 0.005)
print('Probabilities: ', p[0:2],  p[-2:])

# Probability -> odds: odds = p / (1 - p), e.g. 0.01 -> 0.01 / 0.99.
odds = p / (1 - p)
print('Odds: ', odds[0:2], odds[-2:])

# Odds -> log-odds (the logit) via the natural log.
log_odds = np.log(odds)
print('Log Odds: ', log_odds[0:2], log_odds[-2:])
```%load_ext autoreload
# Import modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

df

#Exploratory Data Analysis with SweetViz
import sweetviz as sv
analyze_report = sv.analyze(df)
analyze_report.show_html('report.html', open_browser=False)

# Generate Heat Map, could be loaded after standardisation of variables.
%config InlineBackend.figure_format = 'retina'
plt.figure(figsize = (10, 10))

sns.heatmap(
df.corr(),
cmap = 'crest',
annot = True,
annot_kws = {"size": 10}
)

# Univariate regression with the statsmodels formula API.
# The formula 'dep_variable ~ indep_variable' fits an ordinary-least-squares
# model of the observed dependent variable (y) on a single independent
# variable (x) from the input dataset.

import statsmodels.formula.api as smf

univariate_spec = smf.ols('dep_variable ~ indep_variable', data=df)
model1 = univariate_spec.fit()
print(model1.summary())

# Multivariate Linear Regression
# Standardize each feature to its z-score so the partial regression
# coefficients can be compared with each other on a common scale.

# Variables to compare
features = [
    "wait_time",
    "delay_vs_expected",
    "number_of_products",
    "number_of_sellers",
    "price",
    "freight_value",
    "distance_seller_customer",
]

# Standardize the independent variables.
# Vectorized column-wise (x - mean) / std replaces the original per-column
# Python loop with .map(lambda ...), which evaluated the lambda once per row.
df_standardized = df.copy()
df_standardized[features] = (df[features] - df[features].mean()) / df[features].std()
df_standardized

# Regression formula: "dep_variable ~ f1 + f2 + ...".
formula = "dep_variable ~ " + ' + '.join(features)
formula

# Perform the regression.
# Review the summary output to check which coefficients are statistically
# significant.
import statsmodels.formula.api as smf

model2 = smf.ols(formula=formula, data=df_standardized).fit()
print(model2.summary())

# Visualize the most important features with a horizontal bar plot.
# Use .iloc to drop the intercept by position: plain [1:] slicing on a
# pandas Series is ambiguous (label vs position) and deprecated.
model2.params.iloc[1:].sort_values().plot(kind='barh')

# Check the model's performance.
# Regression performance should not be measured only by its R-squared!
# Always visualize the distribution of the predictions, and especially the
# residuals.
predicted_dep_variable = model2.predict(df_standardized[features])
# NOTE(review): defined as predicted - actual (sign-flipped vs the common
# actual - predicted convention); the RMSE below is unaffected by the sign.
residuals = predicted_dep_variable - df_standardized.dep_variable

# Plot the residuals' density. Residuals of a linear regression are
# continuous, so discrete=True (which bins at integer values) is dropped.
sns.histplot(residuals, kde=True, stat='density')
plt.title('Residuals Density Plot')

# Compute the associated RMSE.
# Rule of thumb: RMSE between 0.2 and 0.5 suggests the model predicts the
# data relatively accurately; adjusted R-squared above 0.75 is a very good
# value for showing accuracy.
rmse = (residuals ** 2).mean() ** 0.5
rmse

# Plot, on the same graph, the KDE of the actual dependent variable and the
# KDE of the predicted dependent variable.
sns.kdeplot(predicted_dep_variable, label='Predicted dep_variable Distribution')
sns.kdeplot(df.dep_variable, label='Actual dep_variable Distribution')
plt.title('Distribution of Predicted and Actual Dependent Variable')
# BUG FIX: the labels above are only shown once a legend is drawn.
plt.legend()

# Using seaborn, plot a regression line of the dependent variable vs. an
# independent variable -- once on a 1000-row sample, once on the whole data.
sample = df.sample(1000, random_state=42)
plt.figure(figsize=(13, 5))
plt.suptitle('Regression of dep_variable, 95% Confidence Interval')

plt.subplot(1, 2, 1)
# BUG FIX: x must be the independent variable; the original plotted the
# dependent variable against itself.
sns.regplot(x=sample.ind_variable, y=sample.dep_variable, y_jitter=.1, ci=95)
plt.xlim(right=70)
plt.ylim(bottom=0)

plt.subplot(1, 2, 2)
# BUG FIX: whole population uses df; the original referenced an undefined
# `orders` DataFrame.
sns.regplot(x=df.ind_variable, y=df.dep_variable, y_jitter=.1, ci=95)
plt.xlim(right=70)
plt.ylim(bottom=0)
star

Tue Nov 22 2022 20:52:35 GMT+0000 (UTC) https://github.com/janduplessis883/data-logit/blob/master/Logistic%20Regression.ipynb

#univariateregression #python
star

Tue Nov 22 2022 20:12:53 GMT+0000 (UTC)

#univariateregression #python
star

Tue Nov 22 2022 12:13:50 GMT+0000 (UTC)

#univariateregression #python