Univariate Feature Selection

PHOTO EMBED

Tue Jul 25 2023 14:26:11 GMT+0000 (Coordinated Universal Time)

Saved by @sumikk #featureselection #selectkbest #correlation #mutualinfo

#-----------------------Feature Selection-------------------------------------------
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression

# X (feature matrix) and y (target vector) must already be defined.
# Score each feature against y with f_regression and keep the 5 best columns.
selector = SelectKBest(k=5, score_func=f_regression)
selector.fit(X, y)
X_selected = selector.transform(X)


# ---------------------- Correlations -----------------------------------------------
# Rank features by the strength of their correlation with the target.
# Assumes `data` is a DataFrame that contains a 'target' column.
target = data['target']
features = data.drop(columns=['target'])

# Pearson: absolute coefficient per feature, strongest first.
pearson_corr = (
    features.corrwith(target, method='pearson')
    .abs()
    .sort_values(ascending=False)
)

# Spearman: same ranking, using the rank-based coefficient.
spearman_corr = (
    features.corrwith(target, method='spearman')
    .abs()
    .sort_values(ascending=False)
)


# ---------------------  Mutual Info -------------------------------------------
from sklearn.feature_selection import mutual_info_classif

# Assumes `data` is a DataFrame whose 'target' column is the (categorical)
# target and whose remaining columns are the features.
features = data.drop(columns=['target'])
target = data['target']

# Estimate the mutual information between each feature and the target.
# The estimator adds a small amount of random noise to continuous features,
# so the seed is fixed to keep the scores (and the ranking) reproducible.
mutual_info = mutual_info_classif(features, target, random_state=0)

# One row per feature, most informative first.
mutual_info_df = pd.DataFrame({'Feature': features.columns, 'Mutual_Info': mutual_info})
mutual_info_df = mutual_info_df.sort_values(by='Mutual_Info', ascending=False)

#---------------------- Chi2 test ----------------------------------------------
from sklearn.feature_selection import chi2

# Assumes `data` is a DataFrame with the features plus a 'target' column.
# NOTE: scikit-learn's chi2 requires non-negative feature values.
target = data['target']
features = data.drop(columns=['target'])

# chi2 returns (statistics, p-values); only the p-values are kept.
_, p_values = chi2(features, target)

# One row per feature, smallest p-value (most significant) first.
chi2_df = pd.DataFrame(
    {'Feature': features.columns, 'p-value': p_values}
).sort_values(by='p-value')

# -------------------  ANOVA ----------------------------------------------------
# f_classif (the ANOVA F-test for classification) is provided by scikit-learn;
# scipy.stats has no such function, so importing it from there fails.
from sklearn.feature_selection import f_classif

# Assumes `data` is a DataFrame with the features plus a 'target' column
# holding the class labels.
features = data.drop(columns=['target'])
target = data['target']

# Compute the ANOVA F-statistic and p-value of every feature against the target.
f_statistic, p_values = f_classif(features, target)

# One row per feature, smallest p-value (most significant) first.
anova_df = pd.DataFrame({'Feature': features.columns, 'F-Statistic': f_statistic, 'p-value': p_values})
anova_df = anova_df.sort_values(by='p-value')




content_copyCOPY