Exploratory Data Analysis
Thu Aug 24 2023 14:55:21 GMT+0000 (Coordinated Universal Time)
Saved by @sumikk ##partialdependencyplot #info.column_information(df)info.agg_tabulation(df) info.num_count_summary(df) info.statistical_summary(df)
class Attribute_Information():
    """Exploratory-data-analysis reports for a pandas DataFrame.

    Every public method takes a DataFrame and returns a new DataFrame
    describing it; the input is never modified.

    The methods use the names ``pd`` (pandas), ``np`` (numpy) and ``stats``
    (``scipy.stats``), which must be available in the enclosing module.
    """

    # Column order of the table returned by ``num_count_summary``.
    _NUM_COUNT_COLUMNS = [
        'Negative values count',
        'Positive values count',
        'Zero count',
        'Unique count',
        'Negative Infinity count',
        'Positive Infinity count',
        'Missing Percentage',
        'Count of outliers',
    ]

    # Column order of the table returned by ``statistical_summary``.
    _STAT_COLUMNS = [
        'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max',
        '10%', '90%', '95%',
    ]

    def __init__(self):
        print("Attribute Information object created")

    @staticmethod
    def _numeric_frame(df):
        """Return only the numeric columns of ``df`` (public pandas API).

        Boolean columns are not treated as numeric, and timedelta columns
        are excluded explicitly because they cannot be compared with 0 or
        infinity by the numeric reports.
        """
        return df.select_dtypes(include=['number'], exclude=['timedelta'])

    def Column_information(self, df):
        """Summarise the shape and column-type make-up of ``df``.

        Returns a one-column DataFrame (column ``'value'``, integer dtype)
        indexed by the metric name: number of rows, number of columns, and
        the count of numerical, factor (``category``), categorical
        (``object``), logical (``bool``), date (``datetime64``) and
        zero-variance columns.  A zero-variance column is one with exactly
        one distinct non-null value.  Boolean columns are reported under
        'No of Logical Variables' only.
        """
        counts = {
            'No of observation': df.shape[0],
            'No of Variables': df.shape[1],
            'No of Numerical Variables': self._numeric_frame(df).shape[1],
            'No of Factor Variables': df.select_dtypes(include='category').shape[1],
            'No of Categorical Variables': df.select_dtypes(include='object').shape[1],
            'No of Logical Variables': df.select_dtypes(include='bool').shape[1],
            'No of Date Variables': df.select_dtypes(include='datetime64').shape[1],
            'No of zero variance variables': int((df.nunique() == 1).sum()),
        }
        return pd.DataFrame({'value': pd.Series(counts, dtype=int)})

    def __get_missing_values(self, data):
        """Return the per-column count of missing values.

        Name-mangled (private) helper.  The result is a Series indexed by
        column name, sorted from the most missing values to the fewest.
        """
        return data.isnull().sum().sort_values(ascending=False)

    def Agg_Tabulation(self, data):
        """Build a per-column schema table for ``data``.

        Prints a banner to stdout and returns a DataFrame with one row per
        column of ``data`` and the columns ``Name``, ``dtypes``,
        ``No of Missing``, ``No of Uniques``, ``Percent of Missing``
        (0-100), ``First/Second/Third Observation`` and ``Entropy``
        (base-2 Shannon entropy of the column's value frequencies, rounded
        to 2 decimals).

        The observation columns are taken by position, so any index works;
        when ``data`` has fewer than three rows the missing observations
        are filled with NaN.
        """
        rule = "=" * 110
        print(rule)
        print("Aggregation of Table")
        print(rule)

        table = pd.DataFrame({
            'Name': list(data.columns),
            'dtypes': data.dtypes.values,
            'No of Missing': data.isnull().sum().values,
            'No of Uniques': data.nunique().values,
            'Percent of Missing': (data.isnull().mean() * 100).values,
        })

        observation_labels = (
            'First Observation',
            'Second Observation',
            'Third Observation',
        )
        n_rows = len(data)
        for position, label in enumerate(observation_labels):
            if position < n_rows:
                # Positional lookup: "first" means first row, whatever its label.
                table[label] = data.iloc[position].values
            else:
                table[label] = np.nan

        # ``items()`` walks the columns positionally, so the entropies line
        # up with the rows of ``table`` even if column labels repeat.
        table['Entropy'] = [
            round(stats.entropy(column.value_counts(normalize=True), base=2), 2)
            for _, column in data.items()
        ]

        print(rule)
        return table

    def __iqr(self, x):
        """Return the interquartile range (Q3 - Q1) of Series ``x``."""
        return x.quantile(q=0.75) - x.quantile(q=0.25)

    def __outlier_count(self, x):
        """Count the values of ``x`` outside the 1.5 * IQR fences.

        A value is an outlier when it is above ``Q3 + 1.5 * IQR`` or below
        ``Q1 - 1.5 * IQR``.  Missing values are never counted.
        """
        q1 = x.quantile(q=0.25)
        q3 = x.quantile(q=0.75)
        reach = 1.5 * (q3 - q1)
        return int(((x > q3 + reach) | (x < q1 - reach)).sum())

    def num_count_summary(self, df):
        """Count notable value classes in each numeric column of ``df``.

        Returns a DataFrame indexed by numeric column name with the number
        of negative, positive and zero values, the number of distinct
        values (NaN counts as one value), the number of -inf / +inf values,
        the percentage (0-100) of missing values and the number of
        1.5 * IQR outliers.  A frame without numeric columns yields an
        empty table that still carries these column names.
        """
        df_num = self._numeric_frame(df)
        names = []
        records = []
        for name, column in df_num.items():
            names.append(name)
            records.append({
                'Negative values count': int((column < 0).sum()),
                'Positive values count': int((column > 0).sum()),
                'Zero count': int((column == 0).sum()),
                'Unique count': len(column.unique()),
                'Negative Infinity count': int((column == -np.inf).sum()),
                'Positive Infinity count': int((column == np.inf).sum()),
                # mean() of the null mask is NaN (not an error) for 0 rows.
                'Missing Percentage': column.isnull().mean() * 100,
                'Count of outliers': self.__outlier_count(column),
            })
        return pd.DataFrame(records, index=names,
                            columns=self._NUM_COUNT_COLUMNS)

    def statistical_summary(self, df):
        """Return descriptive statistics for each numeric column of ``df``.

        The result is indexed by numeric column name and has the columns
        count, mean, std, min, 25%, 50%, 75%, max, 10%, 90% and 95%, in
        that order.  When ``df`` has no numeric columns an empty DataFrame
        with those column names is returned; any other failure propagates
        to the caller instead of being silently discarded.
        """
        df_num = self._numeric_frame(df)
        if df_num.shape[1] == 0:
            # describe() refuses to run on a frame without columns.
            return pd.DataFrame(columns=self._STAT_COLUMNS)
        described = df_num.describe(
            percentiles=[0.10, 0.25, 0.50, 0.75, 0.90, 0.95]
        ).transpose()
        return described[self._STAT_COLUMNS]
# Driver: build the report object and run each of its four reports on `df`.
# NOTE(review): `df` is not defined anywhere in this snippet; it must already
# exist (presumably a pandas DataFrame loaded earlier) -- confirm before running.
# The constructor prints a confirmation message.
Info = Attribute_Information()
# Each call returns a DataFrame; the return values are not stored here.
Info.Column_information(df)
# Also prints a banner to stdout.
Info.Agg_Tabulation(df)
Info.num_count_summary(df)
Info.statistical_summary(df)



Comments