# ------------------- KNN Imputer ------------------------------------------------
import pandas as pd
from sklearn.impute import KNNImputer
# Assuming 'data' is your DataFrame with missing values.
# Let's say 'k' is the number of neighbors to consider for imputation.
# Impute each missing value from the k nearest rows; `k` must be defined
# before this point.
knn_imputer = KNNImputer(n_neighbors=k)
# fit_transform returns a bare NumPy array, so rebuild the frame with the
# original column names AND the original index. Without index= the result gets
# a fresh 0..n-1 index and no longer aligns with `data` when rows are labeled.
# NOTE(review): KNNImputer drops columns that are entirely NaN by default,
# which would make columns=data.columns mismatch - confirm none exist.
data_filled_knn = pd.DataFrame(
    knn_imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
# -------------------Interpolation Imputer------------------------------------------
import pandas as pd
# Assuming 'data' is your DataFrame with missing values.
# Fill gaps along each column (axis=0) by linear interpolation between the
# surrounding known values; a new frame is returned and `data` is not modified.
data_filled_interpolation = data.interpolate(method="linear", axis=0)
# -------------- Label Encoding -------------------------------------------------------
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Assuming 'data' is your DataFrame with a categorical column that you want to encode.
# Let's say 'categorical_column' is the column with categorical values.
# LabelEncoder maps every distinct category to an integer code.
label_encoder = LabelEncoder()
# Learn the categories and encode the column in a single pass, then store the
# integer codes next to the original column.
encoded_labels = label_encoder.fit_transform(data['categorical_column'])
data['encoded_column'] = encoded_labels
# ------------------- One Hot Encoding ------------------------------------------------
import pandas as pd
# One-hot encode 'categorical_column': get_dummies replaces it with one
# indicator column per category and leaves the other columns as they are.
one_hot_encoded_data = pd.get_dummies(data=data, columns=['categorical_column'])
# Show the first five rows of the encoded frame.
print(one_hot_encoded_data.head(n=5))
# ------------------ Binning ------------------------------------------------------
import pandas as pd
# Number of bins shared by the equal-width and equal-frequency binning below.
num_bins = 5
# Equal-width binning (pd.cut): the value range is split into num_bins
# intervals of equal size; labels=False yields the integer bin index.
# Stored under its own column so the equal-frequency result below no longer
# overwrites it.
data['equal_width_binned_column'] = pd.cut(data['numerical_column'], bins=num_bins, labels=False)
# Equal-frequency binning (pd.qcut): bin edges are quantiles, so each bin
# holds roughly the same number of rows. 'binned_column' keeps this result,
# exactly as it did before.
data['binned_column'] = pd.qcut(data['numerical_column'], q=num_bins, labels=False)
# Custom bins: explicit edges with one label per interval
# (len(bin_labels) == len(custom_bins) - 1).
custom_bins = [0, 10, 20, 30, 40, 50, 100]
bin_labels = ['Bin 1', 'Bin 2', 'Bin 3', 'Bin 4', 'Bin 5', 'Bin 6']
# Intervals are right-closed, so without include_lowest=True a value equal to
# the first edge (0) would fall outside every bin and become NaN.
data['custom_binned_column'] = pd.cut(
    data['numerical_column'],
    bins=custom_bins,
    labels=bin_labels,
    include_lowest=True,
)
# --------------------- Feature Scaling -----------------------------------------
# MinMaxScaler, StandardScaler
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()
# Both scalers return a bare NumPy array, so each result is wrapped back into
# a DataFrame with the original column names and the original index; without
# index= the scaled frames get a fresh 0..n-1 index and stop aligning with
# `data` when its rows are labeled.
# NOTE(review): fit_transform expects numeric input - confirm `data` holds
# only numeric columns at this point.
# Perform Min-Max Scaling (each column rescaled to the default [0, 1] range)
data_min_max_scaled = pd.DataFrame(
    min_max_scaler.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
# Perform Standardization (each column shifted to zero mean, unit variance)
data_standard_scaled = pd.DataFrame(
    standard_scaler.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
# -------------------- boxcox transformation ---------------------------------------
from scipy.stats import boxcox
# Assuming 'data' is your DataFrame with a skewed numerical column to transform.
# Let's say 'skewed_column' is the column with skewed data.
# Box-Cox transform of the skewed column. No lambda is supplied, so boxcox
# estimates it and returns the pair (transformed values, fitted lambda).
# NOTE(review): Box-Cox needs strictly positive input - confirm for this column.
boxcox_result = boxcox(data['skewed_column'])
transformed_data, best_lambda = boxcox_result
# Keep the transformed values alongside the original column.
data['power_transformed'] = transformed_data
# ------------------- yeojohnson transformation -------------------------------------
import pandas as pd
from scipy.stats import yeojohnson
# Assuming 'data' is your DataFrame with a column to transform.
# Let's say 'column_to_transform' is the column with the data.
# Yeo-Johnson transform of the column. No lambda is supplied, so yeojohnson
# estimates it and returns the pair (transformed values, fitted lambda).
yeojohnson_result = yeojohnson(data['column_to_transform'])
transformed_data, best_lambda = yeojohnson_result
# Store the transformed values in a new column of the DataFrame.
data['yeo_johnson_transformed'] = transformed_data
#---------------------------- DATE TIME FEATURES --------------------------------
import pandas as pd
# Assuming 'data' is your DataFrame with a column representing date/time data.
# Let's say 'datetime_column' is the column containing the date/time information.
# Parse 'datetime_column' into pandas datetimes so the .dt accessor is available.
data['datetime_column'] = pd.to_datetime(data['datetime_column'])
# Copy each calendar/clock component into a column of the same name.
for component in ('year', 'month', 'day', 'weekday', 'hour', 'minute'):
    data[component] = getattr(data['datetime_column'].dt, component)
# weekday runs from 0 (Monday) to 6 (Sunday), so 5 and 6 are Saturday/Sunday;
# the flag is stored as 1 for weekend rows and 0 otherwise.
data['is_weekend'] = data['weekday'].isin([5, 6]).astype(int)
# --------------------- PCA- Principal Component Analysis ---------------------
import numpy as np
from sklearn.decomposition import PCA
# Assuming 'X' is your data matrix with features.
# X should be a 2D array or DataFrame with numerical values.
# Keep two principal components (enough for a 2D scatter plot).
pca = PCA(n_components=2)
# Fit on X and project it in one step; the result has one row per sample and
# one column per retained component.
principal_components = pca.fit_transform(X)
# Fitted attributes, read after the fit above:
# - per-feature mean of X
mean = pca.mean_
# - the principal axes, one row per component
components = pca.components_
# - singular value associated with each retained component
singular_values = pca.singular_values_
# - fraction of the total variance explained by each retained component
explained_variance_ratio = pca.explained_variance_ratio_
# ------------------------ Text data ----------------------------------------------
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# nltk.word_tokenize is used below, which needs the `nltk` package name itself
# in scope; importing only `stopwords` from nltk.corpus leaves it undefined.
import nltk

# Sample text data
text_data = [
    "This is a sample sentence.",
    "Textual data transformation is important for NLP tasks.",
    "Machine learning algorithms analyze text data.",
    "Preprocessing text data involves tokenization and stopword removal.",
]
# Tokenization
# NOTE(review): word_tokenize and stopwords.words rely on NLTK data packages
# being available locally (see nltk.download) - confirm for this environment.
tokenized_data = [nltk.word_tokenize(text) for text in text_data]
# Stopword Removal (tokens are lowercased only for the lookup; the kept tokens
# retain their original casing)
stop_words = set(stopwords.words("english"))
filtered_data = [
    [word for word in tokens if word.lower() not in stop_words]
    for tokens in tokenized_data
]
# Both vectorizers take one string per document, so re-join the tokens once
# and reuse the result.
filtered_documents = [" ".join(tokens) for tokens in filtered_data]
# BoW Representation
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(filtered_documents)
# TF-IDF Representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_documents)
#---another method
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
def create_embeddings(messages, vectorizer):
    """Vectorize *messages* and return the result as a labeled DataFrame.

    Args:
        messages: Text documents handed to ``vectorizer.fit_transform``.
        vectorizer: Object providing ``fit_transform`` and
            ``get_feature_names_out`` (e.g. ``CountVectorizer`` or
            ``TfidfVectorizer``). It is fitted by this call.

    Returns:
        A DataFrame built from the dense form of the vectorizer output, with
        one column per feature name reported by the vectorizer.
    """
    # fit and transform our messages
    embeddings = vectorizer.fit_transform(messages)
    # toarray() converts the vectorizer output to a dense array so it can be
    # laid out as DataFrame columns named after the learned features
    return pd.DataFrame(
        embeddings.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )
messages = [
    'I like to play at the park',
    'I play baseball with friends the park',
]
# Bag-of-words counts: build the embeddings with a CountVectorizer
vectorizer = CountVectorizer()
embeddings = create_embeddings(messages, vectorizer=vectorizer)
# bare expression: shows the frame when run as a notebook cell
embeddings
# TF-IDF weights: same helper, this time with a TfidfVectorizer
vectorizer = TfidfVectorizer()
embeddings = create_embeddings(messages, vectorizer=vectorizer)
# bare expression: shows the frame when run as a notebook cell
embeddings