# Reusable helper for inspecting how the values of a variable are distributed
def show_distribution(var_data):
    '''
    Print summary statistics for var_data and display its distribution.

    The minimum, mean, median, mode and maximum are printed with two
    decimals. A two-row figure is then shown: a histogram on top, overlaid
    with a dashed vertical line at each of those five statistics, and a
    horizontal boxplot underneath.

    var_data must provide .min(), .max(), .mean(), .median() and .mode(),
    and is handed directly to the axes' hist() and boxplot() calls.
    '''
    # Summary statistics; entry 0 of mode() is the one reported
    lowest = var_data.min()
    highest = var_data.max()
    average = var_data.mean()
    middle = var_data.median()
    most_common = var_data.mode()[0]

    summary = 'Minimum:{:.2f}\nMean:{:.2f}\nMedian:{:.2f}\nMode:{:.2f}\nMaximum:{:.2f}\n'
    print(summary.format(lowest, average, middle, most_common, highest))

    # Two stacked axes: histogram above, boxplot below
    fig, (hist_ax, box_ax) = plt.subplots(2, 1, figsize=(10, 4))

    hist_ax.hist(var_data)
    hist_ax.set_ylabel('Frequency')

    # Dashed markers: gray for the extremes, cyan mean, red median, yellow mode
    markers = (
        (lowest, 'gray'),
        (average, 'cyan'),
        (middle, 'red'),
        (most_common, 'yellow'),
        (highest, 'gray'),
    )
    for position, colour in markers:
        hist_ax.axvline(x=position, color=colour, linestyle='dashed', linewidth=2)

    box_ax.boxplot(var_data, vert=False)
    box_ax.set_xlabel('Value')

    fig.suptitle('Data Distribution')
    fig.show()


# Extract the year from the datetime column 'date' into a new 'year' column.
# Use .dt.month for the month instead, and likewise for other components.
df['year'] = df['date'].dt.year
def find_boundaries(df, variable, distance=1.5):
    """Return the inter-quartile-range limits for one column of ``df``.

    The limits sit ``distance`` times the IQR below the 25th percentile and
    above the 75th percentile of ``df[variable]``.

    Args:
        df: Object indexable by ``variable`` whose result offers ``.quantile()``.
        variable: Key of the column to analyse.
        distance: Multiplier applied to the IQR (default 1.5).

    Returns:
        tuple: ``(upper_boundary, lower_boundary)`` -- the upper limit comes
        first.
    """
    column = df[variable]

    # Each quartile is computed once and reused for the IQR and both limits.
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    IQR = third_quartile - first_quartile

    lower_boundary = first_quartile - (IQR * distance)
    upper_boundary = third_quartile + (IQR * distance)

    return upper_boundary, lower_boundary
# Cardinality: bar chart of the number of distinct values in each column of `data`
data.nunique().plot.bar(figsize=(12, 6))
plt.title('Cardinality')
plt.xlabel('Variables')
plt.ylabel('Number of unique categories')

## Version with 5% threshold

# `fig` receives the object returned by plot.bar(); the reference line and the
# labels are attached to it. Bars are ordered from most to least frequent.
fig = label_freq.sort_values(ascending=False).plot.bar()
fig.axhline(y=0.05, color='red')
fig.set(
    ylabel='percentage of cars within each category',
    xlabel='Variable: class',
    title='Identifying Rare Categories',
)
plt.show()
# Share of missing values in each column of `data`.
# isnull().mean() yields a 0-1 fraction, so it is scaled by 100 to make the
# bar heights agree with the 'Percentage of missing values' axis label.
(data.isnull().mean() * 100).plot.bar(figsize=(12, 6))
plt.ylabel('Percentage of missing values')
plt.xlabel('Variables')
plt.title('Quantifying missing data')
# Split the columns of `df` into two lists used to choose distribution plots.
# A column is listed as categorical when its dtype compares equal to 'O';
# every other column is listed as numerical.

numerical = [name for name, kind in df.dtypes.items() if kind != 'O']
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :', numerical)

categorical = [name for name, kind in df.dtypes.items() if kind == 'O']
print('There are {} Categorical variables\n'.format(len(categorical)))
print('The Categorical variables are :', categorical)
import dtale
from pandas_profiling import ProfileReport
from dataprep.eda import create_report

class AutomateEDA:
    """Run several automated EDA report tools against one dataset.

    The object passed to the constructor is kept on ``self.df`` and handed
    unchanged to dtale, pandas-profiling and dataprep by the ``show_*``
    methods.
    """

    def __init__(self, df):
        """Store ``df`` so that every report is built from the same data."""
        self.df = df

    def show_dtale(self):
        """Start dtale on the stored data and open it in the browser."""
        d = dtale.show(self.df)
        d.open_browser()
        print('dtale opened in browser!')

    def show_pandas_profile_report(self):
        """Build a profiling report and write it to an HTML file.

        The output name "pandas-profiling-report.html" is a relative path.
        """
        profile = ProfileReport(self.df, title="Pandas Profiling Report", explorative=True)
        profile.to_file("pandas-profiling-report.html")
        print('pandas-profile-report created and saved in the project folder!')

    def show_dataprep(self):
        """Create the dataprep report and show it in the browser."""
        create_report(self.df).show_browser()
        print('dataprep opened in browser!')

    def show_all(self):
        """Run all three reports: dtale, then dataprep, then the profile report."""
        self.show_dtale()
        self.show_dataprep()
        self.show_pandas_profile_report()

# Example usage: wrap the DataFrame bound to `df` and produce every report
eda = AutomateEDA(df)
eda.show_all()
# Pair plot of the columns of `df`, coloured by the 'smoker' column.
# The string arguments use plain ASCII quotes; typographic quotes are not
# valid Python string delimiters.
sns.pairplot(df, hue='smoker', diag_kind='kde', kind='scatter', palette='husl')

# where:
# kind      : kind of plot for the non-identity relationships. {'scatter', 'reg'}
# diag_kind : kind of plot for the diagonal subplots. {'hist', 'kde'}
star

Sun Mar 26 2023 08:59:44 GMT+0000 (Coordinated Universal Time) https://learn.microsoft.com/en-us/training/modules/explore-analyze-data-with-python/7-exercise-real-world-data

#python #eda
star

Mon Sep 05 2022 09:56:58 GMT+0000 (Coordinated Universal Time)

#python #pandas #dataset #eda #extractyear #datetime
star

Mon Sep 05 2022 09:55:05 GMT+0000 (Coordinated Universal Time)

#python #pandas #dataset #eda #outliers #boundaries
star

Mon Sep 05 2022 09:53:01 GMT+0000 (Coordinated Universal Time)

#python #pandas #dataset #eda #cardinality
star

Mon Sep 05 2022 09:49:27 GMT+0000 (Coordinated Universal Time)

#python #pandas #dataset #eda #missingdata
star

Mon Sep 05 2022 09:46:40 GMT+0000 (Coordinated Universal Time)

#python #pandas #dataset #numerical #categorical #eda
star

Tue Jan 19 2021 17:32:21 GMT+0000 (Coordinated Universal Time)

#pairplot #eda #plot