FE: Imputation, Encoding, Scaling, PCA, Text Data


Saved by @sumikk #imputation #knnimputation #onehotencoding #labelencoding #textdata #tokenization #countvectorizer #tf-idf

# ------------------- KNN Imputer ------------------------------------------------
import pandas as pd
from sklearn.impute import KNNImputer

# Assuming 'data' is your DataFrame with missing values.
# 'k' is the number of neighbors to consider for imputation.
k = 5
knn_imputer = KNNImputer(n_neighbors=k)
data_filled_knn = pd.DataFrame(knn_imputer.fit_transform(data), columns=data.columns)
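# Quick sanity-check sketch on toy data (the toy DataFrame is an assumption, not part
# of the snippet above). Note that KNNImputer only handles numeric columns, so select
# them first when 'data' is mixed-type.
toy = pd.DataFrame({'a': [1.0, 2.0, None, 4.0], 'b': [10.0, None, 30.0, 40.0]})
toy_filled = pd.DataFrame(KNNImputer(n_neighbors=2).fit_transform(toy), columns=toy.columns)
print(toy_filled)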

# -------------------Interpolation Imputer------------------------------------------
import pandas as pd

# Assuming 'data' is your DataFrame with missing values.
data_filled_interpolation = data.interpolate(method='linear')
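# Optional sketch (assumes 'data' has a DatetimeIndex): time-aware interpolation,
# then forward/backward fill for any leading or trailing gaps interpolation cannot reach.
data_filled_time = data.interpolate(method='time').ffill().bfill()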

# -------------- Label Encoding -------------------------------------------------------
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Assuming 'data' is your DataFrame with a categorical column that you want to encode.
# Let's say 'categorical_column' is the column with categorical values.

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'categorical_column' using LabelEncoder
data['encoded_column'] = label_encoder.fit_transform(data['categorical_column'])
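# Sketch: inspect the learned mapping and recover the original labels if needed.
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
decoded_column = label_encoder.inverse_transform(data['encoded_column'])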
# ------------------- One Hot Encoding ------------------------------------------------
import pandas as pd

# Perform One-Hot Encoding
one_hot_encoded_data = pd.get_dummies(data, columns=['categorical_column'])

# The original 'categorical_column' is dropped, and new binary columns are added for each category.
print(one_hot_encoded_data.head())
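
# Sketch: drop_first=True keeps k-1 dummy columns per category, which avoids the
# dummy-variable trap for linear models.
one_hot_dropped_first = pd.get_dummies(data, columns=['categorical_column'], drop_first=True)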

# ------------------ Binning ------------------------------------------------------
import pandas as pd

# Define the number of bins you want to create.
num_bins = 5

# Equal-width binning - pd.cut
data['binned_equal_width'] = pd.cut(data['numerical_column'], bins=num_bins, labels=False)


# Equal-frequency binning - pd.qcut
data['binned_equal_freq'] = pd.qcut(data['numerical_column'], q=num_bins, labels=False)

# Custom bins
custom_bins = [0, 10, 20, 30, 40, 50, 100]
bin_labels = ['Bin 1', 'Bin 2', 'Bin 3', 'Bin 4', 'Bin 5', 'Bin 6']
data['custom_binned_column'] = pd.cut(data['numerical_column'], bins=custom_bins, labels=bin_labels)
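
# Sketch: check how many rows land in each bin (equal-frequency bins should be
# roughly balanced, equal-width bins usually are not).
print(data['binned_equal_freq'].value_counts().sort_index())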

# --------------------- Feature Scaling -----------------------------------------
# MinMaxScaler, StandardScaler
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# Perform Min-Max Scaling
data_min_max_scaled = pd.DataFrame(min_max_scaler.fit_transform(data), columns=data.columns)

# Perform Standardization
data_standard_scaled = pd.DataFrame(standard_scaler.fit_transform(data), columns=data.columns)
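
# Sketch (assumes 'X_train' / 'X_test' splits already exist): fit the scaler on the
# training data only, then reuse it on the test data to avoid leakage.
X_train_scaled = standard_scaler.fit_transform(X_train)
X_test_scaled = standard_scaler.transform(X_test)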
# -------------------- boxcox transformation ---------------------------------------
from scipy.stats import boxcox

# Assuming 'data' is your DataFrame with a skewed numerical column to transform.
# Let's say 'skewed_column' is the column with skewed data.

# Apply the Box-Cox transformation
transformed_data, best_lambda = boxcox(data['skewed_column'])
data['power_transformed'] = transformed_data
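
# Note: boxcox requires strictly positive values. Sketch of a common workaround:
# shift the column so its minimum becomes 1 before transforming.
shifted = data['skewed_column'] - data['skewed_column'].min() + 1
shifted_transformed, _ = boxcox(shifted)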
# ------------------- yeojohnson transformation -------------------------------------
import pandas as pd
from scipy.stats import yeojohnson

# Assuming 'data' is your DataFrame with a column to transform.
# Let's say 'column_to_transform' is the column with the data.

# Apply the Yeo-Johnson transformation
transformed_data, best_lambda = yeojohnson(data['column_to_transform'])

# Add the transformed data as a new column in the DataFrame
data['yeo_johnson_transformed'] = transformed_data
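
# Alternative sketch: scikit-learn's PowerTransformer applies a Yeo-Johnson transform
# (standardized by default) and fits neatly into pipelines.
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(method='yeo-johnson')
data['yeo_johnson_sklearn'] = pt.fit_transform(data[['column_to_transform']])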

#---------------------------- DATE TIME FEATURES --------------------------------
import pandas as pd

# Assuming 'data' is your DataFrame with a column representing date/time data.
# Let's say 'datetime_column' is the column containing the date/time information.

# Convert the 'datetime_column' to a pandas datetime format (if not already done).
data['datetime_column'] = pd.to_datetime(data['datetime_column'])

# Extract date and time features
data['year'] = data['datetime_column'].dt.year
data['month'] = data['datetime_column'].dt.month
data['day'] = data['datetime_column'].dt.day
data['weekday'] = data['datetime_column'].dt.weekday
data['hour'] = data['datetime_column'].dt.hour
data['minute'] = data['datetime_column'].dt.minute

# Example: Create a binary 'weekday/weekend' feature
data['is_weekend'] = data['weekday'].isin([5, 6]).astype(int)
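
# Sketch: cyclical (sin/cos) encoding so that month 12 and month 1 end up close
# together in feature space; the same idea works for hour or weekday.
import numpy as np
data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)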

# --------------------- PCA- Principal Component Analysis ---------------------
import numpy as np
from sklearn.decomposition import PCA

# Assuming 'X' is your data matrix with features.
# X should be a 2D array or DataFrame with numerical values.

# Initialize PCA with the desired number of components (e.g., 2 for 2D visualization).
pca = PCA(n_components=2)

# Fit and transform the data to get the principal components.
principal_components = pca.fit_transform(X)

# The 'principal_components' matrix contains the data projected onto the principal components.

# Access the explained variance ratio of each principal component.
explained_variance_ratio = pca.explained_variance_ratio_

# Access the principal components (eigenvectors).
components = pca.components_

# Access the mean of each feature (used in centering the data during transformation).
mean = pca.mean_

# Access the singular values.
singular_values = pca.singular_values_
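
# Sketch: instead of a fixed 2, keep enough components to explain ~95% of the
# variance (features should normally be standardized before PCA).
pca_95 = PCA(n_components=0.95)
X_reduced = pca_95.fit_transform(X)
print(pca_95.n_components_, pca_95.explained_variance_ratio_.sum())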
# ------------------------ Text Data ------------------------------------------------
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download once if the corpora are missing:
# nltk.download('punkt')
# nltk.download('stopwords')

# Sample text data
text_data = [
    "This is a sample sentence.",
    "Textual data transformation is important for NLP tasks.",
    "Machine learning algorithms analyze text data.",
    "Preprocessing text data involves tokenization and stopword removal."
]

# Tokenization
tokenized_data = [nltk.word_tokenize(text) for text in text_data]

# Stopword Removal
stop_words = set(stopwords.words("english"))
filtered_data = [[word for word in tokens if word.lower() not in stop_words] for tokens in tokenized_data]

# BoW Representation
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform([" ".join(tokens) for tokens in filtered_data])

# TF-IDF Representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([" ".join(tokens) for tokens in filtered_data])
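
# Sketch: inspect the learned vocabulary and the dense document-term matrices.
print(bow_vectorizer.get_feature_names_out())
print(bow_matrix.toarray())
print(tfidf_matrix.toarray().round(2))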

# --------------------- Another method: vectorizer helper ---------------------------
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
def create_embeddings(messages, vectorizer):
    # fit and transform our messages
    embeddings = vectorizer.fit_transform(messages)
    
    # create our dataframe
    df = pd.DataFrame(embeddings.toarray(), \
                      columns=vectorizer.get_feature_names_out())
    return df

messages = ['I like to play at the park',
            'I play baseball with friends the park']

# create with CountVectorizer
vectorizer = CountVectorizer()

# build the embeddings with our helper function
embeddings = create_embeddings(messages, vectorizer)

# display our embeddings
embeddings


# create with TfidfVectorizer
vectorizer = TfidfVectorizer()

# build the embeddings with our helper function
embeddings = create_embeddings(messages, vectorizer)

# display our embeddings
embeddings

Imputation (KNN and interpolation), label encoding, one-hot encoding, binning, scaling, power transforms, date/time features, PCA, and text features (BoW, TF-IDF).