Snippets Collections
# Derive a 'year' column from the 'date' column via the pandas .dt accessor.
# Swap .dt.year for .dt.month (and so on) to extract a different component.
df['year'] = df['date'].dt.year
df.head()
def find_boundaries(df, variable, distance=1.5):
    """Return the inter-quartile-range limits for one column.

    The limits lie ``distance`` inter-quartile ranges above the 0.75
    quantile and below the 0.25 quantile of ``df[variable]``.

    Args:
        df: Object indexed with ``variable`` to obtain the column; the
            column must provide ``quantile``.
        variable: Key of the column to inspect.
        distance: Multiplier applied to the inter-quartile range.

    Returns:
        A ``(upper_boundary, lower_boundary)`` tuple -- upper limit first.
    """
    column = df[variable]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)

    # How far beyond each quartile the limits are placed.
    reach = (third_quartile - first_quartile) * distance

    return third_quartile + reach, first_quartile - reach
# Cardinality check: count the distinct values of every column in `data`
# and draw one bar per column.
unique_counts = data.nunique()
unique_counts.plot.bar(figsize=(12, 6))
plt.title('Cardinality')
plt.xlabel('Variables')
plt.ylabel('Number of unique categories')

## Version with 5% threshold

# Bars run from the largest to the smallest value of `label_freq`; the red
# horizontal line is drawn at 0.05.
# NOTE(review): presumably label_freq holds per-category fractions in [0, 1]
# -- confirm where it is computed.
ordered_freq = label_freq.sort_values(ascending=False)
fig = ordered_freq.plot.bar()  # holds whatever plot.bar() returns; used below for the line and labels
fig.axhline(y=0.05, color='red')
fig.set_title('Identifying Rare Categories')
fig.set_xlabel('Variable: class')
fig.set_ylabel('percentage of cars within each category')
plt.show()
# Missing-data overview: one bar per column of `data`.
# isnull().mean() yields the fraction of null entries (0 to 1), even though
# the axis label below says "Percentage".
missing_share = data.isnull().mean()
missing_share.plot.bar(figsize=(12, 6))
plt.title('Quantifying missing data')
plt.xlabel('Variables')
plt.ylabel('Percentage of missing values')
# Split the columns of df into two lists by dtype, for use when choosing
# distribution plots: columns whose dtype equals 'O' (object) go to
# `categorical`, every other column goes to `numerical`.
is_object = {col: df[col].dtype == 'O' for col in df.columns}

numerical = [col for col, flag in is_object.items() if not flag]
categorical = [col for col, flag in is_object.items() if flag]

# Report the numerical columns first, then the categorical ones.
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :', numerical)

print('There are {} Categorical variables\n'.format(len(categorical)))
print('The Categorical variables are :', categorical)
import dtale
from pandas_profiling import ProfileReport
from dataprep.eda import create_report

class AutomateEDA:
    """Run several automated EDA tools against a single dataset.

    Each ``show_*`` method passes ``self.df`` to one tool (dtale,
    pandas-profiling or dataprep) and prints a short confirmation;
    ``show_all`` runs the three of them in sequence.
    """

    def __init__(self, df):
        """Keep ``df`` so the ``show_*`` methods can use it."""
        self.df = df

    def show_dtale(self):
        """Start dtale on the stored data and open it in the browser."""
        session = dtale.show(self.df)
        session.open_browser()
        print('dtale opened in browser!')

    def show_pandas_profile_report(self):
        """Build a profiling report and write it to an HTML file.

        The file name is the relative path ``pandas-profiling-report.html``.
        """
        report = ProfileReport(
            self.df,
            title="Pandas Profiling Report",
            explorative=True,
        )
        report.to_file("pandas-profiling-report.html")
        print('pandas-profile-report created and saved in the project folder!')

    def show_dataprep(self):
        """Create the dataprep report and show it in the browser."""
        report = create_report(self.df)
        report.show_browser()
        print('dataprep opened in browser!')

    def show_all(self):
        """Run dtale, then dataprep, then the profiling report."""
        steps = (
            self.show_dtale,
            self.show_dataprep,
            self.show_pandas_profile_report,
        )
        for run_step in steps:
            run_step()


# Run every automated EDA report on df.
eda = AutomateEDA(df)
eda.show_all()

# Pair plot of df with 'smoker' as the hue variable. The string arguments
# use plain ASCII quotes: typographic quotes are not valid Python string
# delimiters.
sns.pairplot(df, hue='smoker', diag_kind='kde', kind='scatter', palette='husl')

# where:
# kind: kind of plot for the non-identity relationships. {'scatter', 'reg'}
# diag_kind: kind of plot for the diagonal subplots. {'hist', 'kde'}

Save snippets that work with our extensions

Available in the Chrome Web Store. Get the Firefox add-on. Get the VS Code extension.