#-----------------------Feature Selection-------------------------------------------
from sklearn.feature_selection import SelectKBest, f_regression

# Univariate selection: every column of the feature matrix X is scored against
# the target vector y with f_regression, and only the 5 best-scoring columns
# are kept. X and y must already be defined before this section runs.
selector = SelectKBest(score_func=f_regression, k=5)
selector.fit(X, y)
# X_selected holds just the columns the fitted selector retained.
X_selected = selector.transform(X)
# ---------------------- Correlations -----------------------------------------------
# Rank every feature by how strongly it correlates with the 'target' column.
# 'data' must already be a DataFrame that contains a 'target' column.
target = data['target']
features = data.drop(columns=['target'])

# Pearson first, then Spearman rank correlation. abs() discards the sign so the
# descending sort puts the strongest relationships (positive or negative) first.
pearson_corr, spearman_corr = (
    features.corrwith(target, method=corr_method)
    .abs()
    .sort_values(ascending=False)
    for corr_method in ('pearson', 'spearman')
)
# --------------------- Mutual Info -------------------------------------------
# pandas is imported here so this section runs on its own, like the other
# self-contained snippets in this file.
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

# Assuming 'data' is your DataFrame with features and 'target' is the target
# variable (a class label, since the classification estimator is used).
features = data.drop(columns=['target'])
target = data['target']

# Estimate the mutual information between each feature and the target.
# mutual_info_classif adds small random noise to continuous features, so the
# seed is fixed to keep the scores (and the ranking) reproducible across runs.
mutual_info = mutual_info_classif(features, target, random_state=0)

# One row per feature, highest mutual information first.
mutual_info_df = pd.DataFrame({'Feature': features.columns, 'Mutual_Info': mutual_info})
mutual_info_df = mutual_info_df.sort_values(by='Mutual_Info', ascending=False)
#---------------------- Chi2 test ----------------------------------------------
# pandas is imported here so this section runs on its own, like the other
# self-contained snippets in this file.
import pandas as pd
from sklearn.feature_selection import chi2

# Assuming 'data' is your DataFrame with features and 'target' is the target
# variable (class labels).
# NOTE: chi2 only accepts non-negative feature values (e.g. counts, booleans,
# frequencies) and raises ValueError otherwise -- scale or encode first.
features = data.drop(columns=['target'])
target = data['target']

# Apply the chi-square test; only the per-feature p-values are kept, the test
# statistics are discarded.
_, p_values = chi2(features, target)

# One row per feature, smallest p-value (strongest evidence of dependence) first.
chi2_df = pd.DataFrame({'Feature': features.columns, 'p-value': p_values})
chi2_df = chi2_df.sort_values(by='p-value')
# ------------------- ANOVA ----------------------------------------------------
# pandas is imported here so this section runs on its own, like the other
# self-contained snippets in this file.
import pandas as pd
# f_classif is provided by scikit-learn, not scipy.stats (importing it from
# scipy.stats raises ImportError).
from sklearn.feature_selection import f_classif

# Assuming 'data' is your DataFrame with features and 'target' is the target
# variable (class labels).
features = data.drop(columns=['target'])
target = data['target']

# Compute the ANOVA F-statistic and p-value of each feature against the target.
f_statistic, p_values = f_classif(features, target)

# One row per feature, smallest p-value first.
anova_df = pd.DataFrame({'Feature': features.columns, 'F-Statistic': f_statistic, 'p-value': p_values})
anova_df = anova_df.sort_values(by='p-value')
# Comments (stray page text, commented out so the script does not raise NameError)