Exploratory Data Analysis
Thu Aug 24 2023 14:55:21 GMT+0000 (Coordinated Universal Time)
Saved by @sumikk ##partialdependencyplot #info.column_information(df)info.agg_tabulation(df) info.num_count_summary(df) info.statistical_summary(df)
import warnings

import numpy as np
import pandas as pd
from scipy import stats


class Attribute_Information():
    """Exploratory-data-analysis summaries for a pandas DataFrame.

    Every public method takes a DataFrame and returns a new summary
    DataFrame; the instance itself stores no state.
    """

    # Row labels (and their order) of the table built by Column_information.
    _SCHEMA_LABELS = (
        'No of observation',
        'No of Variables',
        'No of Numerical Variables',
        'No of Factor Variables',
        'No of Categorical Variables',
        'No of Logical Variables',
        'No of Date Variables',
        'No of zero variance variables',
    )

    # Labels of the positional sample rows shown by Agg_Tabulation.
    _OBSERVATION_LABELS = (
        'First Observation',
        'Second Observation',
        'Third Observation',
    )

    # Column order of the table built by num_count_summary.
    _COUNT_COLUMNS = (
        'Negative values count',
        'Positive values count',
        'Zero count',
        'Unique count',
        'Negative Infinity count',
        'Positive Infinity count',
        'Missing Percentage',
        'Count of outliers',
    )

    # Column order of the table built by statistical_summary.
    _STAT_COLUMNS = (
        'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max',
        '10%', '90%', '95%',
    )

    def __init__(self):
        print("Attribute Information object created")

    def Column_information(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return basic schema counts for ``df``.

        Args:
            df: The DataFrame to inspect.

        Returns:
            A one-column (``'value'``) integer DataFrame indexed by the
            labels in ``_SCHEMA_LABELS``: row count, column count, the
            number of numeric / category / object / bool / datetime64
            columns, and the number of columns with exactly one distinct
            non-null value.
        """
        counts = {
            'No of observation': df.shape[0],
            'No of Variables': df.shape[1],
            'No of Numerical Variables': df._get_numeric_data().shape[1],
            'No of Factor Variables':
                df.select_dtypes(include='category').shape[1],
            'No of Categorical Variables':
                df.select_dtypes(include='object').shape[1],
            'No of Logical Variables':
                df.select_dtypes(include='bool').shape[1],
            'No of Date Variables':
                df.select_dtypes(include='datetime64').shape[1],
            # nunique() ignores NaN, so a column holding one value plus
            # missing entries also counts as zero variance.
            'No of zero variance variables': int((df.nunique() == 1).sum()),
        }
        ordered = pd.Series(
            [counts[label] for label in self._SCHEMA_LABELS],
            index=list(self._SCHEMA_LABELS),
            name='value',
        )
        data_info = ordered.to_frame()
        data_info['value'] = data_info['value'].astype(int)
        return data_info

    def __get_missing_values(self, data: pd.DataFrame) -> pd.Series:
        """Return the per-column count of missing values, largest first.

        The double-underscore name is mangled by Python, so this helper is
        intended for use inside the class only.
        """
        return data.isnull().sum().sort_values(ascending=False)

    def Agg_Tabulation(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a per-column overview of ``data`` and print a banner.

        Args:
            data: The DataFrame to describe.

        Returns:
            A DataFrame with one row per column of ``data`` containing its
            name, dtype, missing count, distinct count, missing percentage
            (0-100), the first three rows of ``data`` as sample values, and
            the base-2 Shannon entropy of the column's value distribution
            rounded to two decimals.
        """
        print("=" * 110)
        print("Aggregation of Table")
        print("=" * 110)

        null_mask = data.isnull()
        # Built directly from the columns so a named columns Index cannot
        # break the 'Name' label.
        table = pd.DataFrame({
            'Name': data.columns,
            'dtypes': data.dtypes.values,
        })
        table['No of Missing'] = null_mask.sum().values
        table['No of Uniques'] = data.nunique().values
        # mean() of the null mask is the missing fraction; it yields NaN
        # (not a division error) when the frame has no rows.
        table['Percent of Missing'] = null_mask.mean().values * 100

        # Sample rows are taken by position so that frames with a
        # non-default index work; rows that do not exist are left as NaN.
        n_rows = data.shape[0]
        for position, label in enumerate(self._OBSERVATION_LABELS):
            if position < n_rows:
                table[label] = data.iloc[position].values
            else:
                table[label] = np.nan

        table['Entropy'] = [
            round(
                stats.entropy(column.value_counts(normalize=True), base=2),
                2,
            )
            for _, column in data.items()
        ]
        return table

    def __iqr(self, x: pd.Series):
        """Return the interquartile range (Q3 - Q1) of ``x``."""
        return x.quantile(q=0.75) - x.quantile(q=0.25)

    def __outlier_count(self, x: pd.Series) -> int:
        """Return how many values of ``x`` lie outside the 1.5 * IQR fences."""
        spread = 1.5 * self.__iqr(x)
        upper_out = x.quantile(q=0.75) + spread
        lower_out = x.quantile(q=0.25) - spread
        return int((x > upper_out).sum()) + int((x < lower_out).sum())

    def num_count_summary(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return sign, zero, infinity, missing and outlier counts.

        Args:
            df: The DataFrame to inspect; only its numeric columns are used.

        Returns:
            A DataFrame indexed by numeric column name with the columns
            listed in ``_COUNT_COLUMNS``. 'Unique count' includes NaN as a
            distinct value. 'Missing Percentage' is on a 0-100 scale,
            matching 'Percent of Missing' from ``Agg_Tabulation``, and is
            NaN when ``df`` has no rows.
        """
        df_num = df._get_numeric_data()
        labels = []
        rows = []
        for label, column in df_num.items():
            labels.append(label)
            rows.append({
                'Negative values count': int((column < 0).sum()),
                'Positive values count': int((column > 0).sum()),
                'Zero count': int((column == 0).sum()),
                'Unique count': len(column.unique()),
                'Negative Infinity count': int((column == -np.inf).sum()),
                'Positive Infinity count': int((column == np.inf).sum()),
                'Missing Percentage': column.isnull().mean() * 100,
                'Count of outliers': self.__outlier_count(column),
            })
        return pd.DataFrame(
            rows, index=labels, columns=list(self._COUNT_COLUMNS)
        )

    def statistical_summary(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return count, mean, std, min, max and selected percentiles.

        Args:
            df: The DataFrame to inspect; only its numeric columns are used.

        Returns:
            A DataFrame indexed by numeric column name with the columns
            listed in ``_STAT_COLUMNS``. An empty DataFrame is returned
            when ``df`` has no numeric columns, or (with a RuntimeWarning)
            when pandas cannot compute the summary.
        """
        df_num = df._get_numeric_data()
        if df_num.shape[1] == 0:
            return pd.DataFrame()

        try:
            described = df_num.describe(
                percentiles=[0.10, 0.25, 0.50, 0.75, 0.90, 0.95]
            )
            # Selecting by label keeps each value under its own heading
            # instead of relying on positional relabelling.
            return described.transpose()[list(self._STAT_COLUMNS)]
        except (KeyError, TypeError, ValueError) as err:
            # Best-effort: keep returning an empty table, but do not hide
            # the reason from the caller.
            warnings.warn(
                f"statistical_summary could not be computed: {err}",
                RuntimeWarning,
                stacklevel=2,
            )
            return pd.DataFrame()


if __name__ == "__main__":
    # Example usage. ``df`` is not defined in this snippet: it must already
    # exist (e.g. loaded earlier in the notebook/session) before this runs.
    Info = Attribute_Information()
    Info.Column_information(df)
    Info.Agg_Tabulation(df)
    Info.num_count_summary(df)
    Info.statistical_summary(df)
Comments