# Feature-selection snippets: univariate scoring of features against a target.
#
# Inputs expected to exist before this script runs (not defined here):
#   X, y  - feature matrix and target vector for the SelectKBest section.
#   data  - pandas DataFrame holding the feature columns plus a 'target' column.

import pandas as pd
from sklearn.feature_selection import (
    SelectKBest,
    chi2,
    f_classif,
    f_regression,
    mutual_info_classif,
)

# ----------------------- Feature Selection -----------------------------------
# Keep the 5 features with the highest f_regression score against y.
selector = SelectKBest(score_func=f_regression, k=5)
X_selected = selector.fit_transform(X, y)

# ----------------------- Correlations ----------------------------------------
# Split the frame once; every section below reuses these two objects.
features = data.drop(columns=['target'])
target = data['target']

# Pearson correlation of each feature with the target, ranked by magnitude
# (the sign is discarded by abs()).
pearson_corr = features.corrwith(target, method='pearson')
pearson_corr = pearson_corr.abs().sort_values(ascending=False)

# Spearman rank correlation, ranked the same way.
spearman_corr = features.corrwith(target, method='spearman')
spearman_corr = spearman_corr.abs().sort_values(ascending=False)

# ----------------------- Mutual Info -----------------------------------------
# Mutual information between each feature and the target.
# NOTE: scikit-learn's estimator adds small random noise to the features, so
# scores can differ slightly between runs; pass random_state=<int> to
# mutual_info_classif if reproducible scores are needed.
mutual_info = mutual_info_classif(features, target)

# One row per feature, highest mutual information first.
mutual_info_df = pd.DataFrame(
    {'Feature': features.columns, 'Mutual_Info': mutual_info}
)
mutual_info_df = mutual_info_df.sort_values(by='Mutual_Info', ascending=False)

# ----------------------- Chi2 test -------------------------------------------
# chi2 returns (statistics, p-values); only the p-values are kept here.
# NOTE: chi2 requires non-negative feature values (e.g. counts or booleans).
_, p_values = chi2(features, target)

# One row per feature, smallest p-value first.
chi2_df = pd.DataFrame({'Feature': features.columns, 'p-value': p_values})
chi2_df = chi2_df.sort_values(by='p-value')

# ----------------------- ANOVA -----------------------------------------------
# f_classif lives in sklearn.feature_selection (it is not part of scipy.stats).
# It returns the ANOVA F-statistic and p-value for each feature.
# p_values is deliberately rebound here, replacing the chi2 p-values above.
f_statistic, p_values = f_classif(features, target)

# One row per feature, smallest p-value first.
anova_df = pd.DataFrame(
    {
        'Feature': features.columns,
        'F-Statistic': f_statistic,
        'p-value': p_values,
    }
)
anova_df = anova_df.sort_values(by='p-value')