thiscodeWorks - Organizing the best of code online

Plot distribution and boxplot for a variable

# Create a function that we can re-use
def show_distribution(var_data):
    """Print summary statistics for ``var_data`` and plot its distribution.

    The minimum, mean, median, mode and maximum are printed with two
    decimals. A figure with two stacked plots is then created and shown:
    a histogram on top, with a dashed vertical line at each statistic, and
    a horizontal boxplot underneath.

    ``var_data`` must provide ``min``, ``max``, ``mean``, ``median`` and
    ``mode`` methods (presumably a pandas Series -- confirm with callers).
    """
    # Summary statistics; the first entry of mode()'s result is used.
    lowest = var_data.min()
    highest = var_data.max()
    average = var_data.mean()
    middle = var_data.median()
    most_common = var_data.mode()[0]

    report = 'Minimum:{:.2f}\nMean:{:.2f}\nMedian:{:.2f}\nMode:{:.2f}\nMaximum:{:.2f}\n'
    print(report.format(lowest, average, middle, most_common, highest))

    # One figure, two rows: histogram above, boxplot below.
    fig, (hist_ax, box_ax) = plt.subplots(2, 1, figsize=(10, 4))

    hist_ax.hist(var_data)
    hist_ax.set_ylabel('Frequency')

    # Dashed markers for min, mean, median, mode and max (in that order).
    markers = (
        (lowest, 'gray'),
        (average, 'cyan'),
        (middle, 'red'),
        (most_common, 'yellow'),
        (highest, 'gray'),
    )
    for position, colour in markers:
        hist_ax.axvline(x=position, color=colour, linestyle='dashed', linewidth=2)

    box_ax.boxplot(var_data, vert=False)
    box_ax.set_xlabel('Value')

    fig.suptitle('Data Distribution')
    fig.show()


# Example call: summarize and plot the 'Grade' column of df_students.
# NOTE: df_students is not created in this snippet; it must already exist.
show_distribution(df_students['Grade'])

#python #pandas #dataset #eda #extractyear #datetime

Extract the year from a date column in pandas

# Store the year of each value in the 'date' column as a new 'year' column.
# Swap .dt.year for .dt.month to get the month instead, and so on.
# NOTE: assumes df['date'] already holds datetime values -- confirm, or
# convert it first (e.g. with pd.to_datetime).
df['year'] = df['date'].dt.year
df.head()

#python #pandas #dataset #eda #outliers #boundaries

Function to find boundaries for outliers

def find_boundaries(df, variable, distance=1.5):
    """Return IQR-based outlier limits for the column ``df[variable]``.

    The inter-quartile range (75th minus 25th percentile) is scaled by
    ``distance``; that margin is added to the 75th percentile and
    subtracted from the 25th percentile.

    Returns:
        A tuple ``(upper_boundary, lower_boundary)`` -- upper limit first.
    """
    column = df[variable]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)

    # Margin beyond each quartile within which values are not outliers.
    margin = (third_quartile - first_quartile) * distance

    return third_quartile + margin, first_quartile - margin

#python #pandas #dataset #eda #cardinality

Quantifying cardinality in datasets

# Bar chart of the number of unique values found in each column of `data`.
# The labels are set on the axes returned by the plot call.
cardinality_ax = data.nunique().plot.bar(figsize=(12, 6))
cardinality_ax.set_ylabel('Number of unique categories')
cardinality_ax.set_xlabel('Variables')
cardinality_ax.set_title('Cardinality')

## Version with 5% threshold

# Bar chart of label_freq sorted from largest to smallest, with a red
# horizontal line at y=0.05 marking the 5% threshold.
# NOTE: despite its name, `fig` holds whatever plot.bar() returns; the calls
# below use it as an axes object (axhline, labels, title).
sorted_freq = label_freq.sort_values(ascending=False)
fig = sorted_freq.plot.bar()
fig.axhline(y=0.05, color='red')
fig.set(
    ylabel='percentage of cars within each category',
    xlabel='Variable: class',
    title='Identifying Rare Categories',
)
plt.show()

#python #pandas #dataset #eda #missingdata

Quantifying Missing data with charts

# Bar chart of the mean of the isnull() mask for each column of `data`,
# i.e. the share of missing entries per column.
missing_ax = data.isnull().mean().plot.bar(figsize=(12, 6))
missing_ax.set_ylabel('Percentage of missing values')
missing_ax.set_xlabel('Variables')
missing_ax.set_title('Quantifying missing data')

#python #pandas #dataset #numerical #categorical #eda

Printing, listing and counting the numerical or categorical features in a dataset Pandas

# Collect the names of the columns to use for distribution plots:
# every column of df whose dtype is not object ('O') counts as numerical.

numerical = [
    column
    for column in df.columns
    if df[column].dtype != 'O'
]
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :', numerical)

# Collect the names of the categorical columns:
# every column of df whose dtype is object ('O') counts as categorical.

categorical = [
    column
    for column in df.columns
    if df[column].dtype == 'O'
]
print('There are {} Categorical variables\n'.format(len(categorical)))
print('The Categorical variables are :', categorical)

#eda #automate #automateeda #data #datapreprocessing #datavisualization #featureengineering

Automate EDA class

import dtale
from pandas_profiling import ProfileReport
from dataprep.eda import create_report

class AutomateEDA:
    """Run several automated EDA tools against one dataset.

    The object passed to the constructor is stored as ``self.df`` and
    handed unchanged to dtale, pandas-profiling and dataprep.
    """

    def __init__(self, df):
        self.df = df

    def show_dtale(self):
        """Start a dtale session for the data and open it in the browser."""
        session = dtale.show(self.df)
        session.open_browser()
        print('dtale opened in browser!')

    def show_pandas_profile_report(self):
        """Build a pandas-profiling report and write it to an HTML file."""
        report = ProfileReport(
            self.df,
            title="Pandas Profiling Report",
            explorative=True,
        )
        report.to_file("pandas-profiling-report.html")
        print('pandas-profile-report created and saved in the project folder!')

    def show_dataprep(self):
        """Create a dataprep report for the data and show it in the browser."""
        report = create_report(self.df)
        report.show_browser()
        print('dataprep opened in browser!')

    def show_all(self):
        """Run the dtale, dataprep and pandas-profiling steps, in that order."""
        steps = (
            self.show_dtale,
            self.show_dataprep,
            self.show_pandas_profile_report,
        )
        for step in steps:
            step()


# Example usage: wrap df and run every report in turn.
# NOTE: df is not created in this snippet; it must already exist.
eda = AutomateEDA(df)
eda.show_all()

#pairplot #eda #plot

PairPlot

# Pair plot of df, with hue taken from the 'smoker' column.
# String arguments use plain ASCII quotes; typographic quotes are a SyntaxError.
sns.pairplot(df, hue='smoker', diag_kind='kde', kind='scatter', palette='husl')

# where:
# kind: kind of plot for the non-identity relationships. {'scatter', 'reg'}
# diag_kind: kind of plot for the diagonal subplots. {'hist', 'kde'}

Plot distribution and boxplot for a variable

Extract the year from a date column in pandas

Function to find boundaries for outliers

Quantifying cardinality in datasets

Quantifying Missing data with charts

Printing, listing and counting the numerical or categorical features in a dataset Pandas

Automate EDA class

PairPlot

Save snippets that work with our extensions