# ------------------- KNN Imputer -------------------------------------------------
import pandas as pd
from sklearn.impute import KNNImputer

# Assuming 'data' is your DataFrame with missing values.
# Let's say 'k' is the number of neighbors to consider for imputation, e.g.:
k = 5

knn_imputer = KNNImputer(n_neighbors=k)
data_filled_knn = pd.DataFrame(knn_imputer.fit_transform(data), columns=data.columns)

# ------------------- Interpolation Imputer ----------------------------------------
import pandas as pd

# Assuming 'data' is your DataFrame with missing values.
data_filled_interpolation = data.interpolate(method='linear')

# ------------------- Label Encoding -----------------------------------------------
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Assuming 'data' is your DataFrame with a categorical column that you want to encode.
# Let's say 'categorical_column' is the column with categorical values.

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'categorical_column' using LabelEncoder
data['encoded_column'] = label_encoder.fit_transform(data['categorical_column'])

# ------------------- One-Hot Encoding ---------------------------------------------
import pandas as pd

# Perform One-Hot Encoding
one_hot_encoded_data = pd.get_dummies(data, columns=['categorical_column'])

# The original 'categorical_column' is dropped, and new binary columns are added
# for each category.
print(one_hot_encoded_data.head())

# ------------------- Binning ------------------------------------------------------
import pandas as pd

# Define the number of bins you want to create.
num_bins = 5

# Equal-width binning - cut
data['binned_column'] = pd.cut(data['numerical_column'], bins=num_bins, labels=False)

# Equal-frequency binning - qcut
data['binned_column'] = pd.qcut(data['numerical_column'], q=num_bins, labels=False)

# Custom bins
custom_bins = [0, 10, 20, 30, 40, 50, 100]
bin_labels = ['Bin 1', 'Bin 2', 'Bin 3', 'Bin 4', 'Bin 5', 'Bin 6']
data['custom_binned_column'] = pd.cut(data['numerical_column'], bins=custom_bins, labels=bin_labels)

# ------------------- Feature Scaling ----------------------------------------------
# MinMaxScaler, StandardScaler
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# Perform Min-Max Scaling
data_min_max_scaled = pd.DataFrame(min_max_scaler.fit_transform(data), columns=data.columns)

# Perform Standardization
data_standard_scaled = pd.DataFrame(standard_scaler.fit_transform(data), columns=data.columns)

# ------------------- Box-Cox Transformation ---------------------------------------
from scipy.stats import boxcox

# Assuming 'data' is your DataFrame with a skewed numerical column to transform.
# Let's say 'skewed_column' is the column with skewed data.
# Note: Box-Cox requires strictly positive values.

# Apply the Box-Cox transformation
transformed_data, best_lambda = boxcox(data['skewed_column'])
data['power_transformed'] = transformed_data

# ------------------- Yeo-Johnson Transformation -----------------------------------
import pandas as pd
from scipy.stats import yeojohnson

# Assuming 'data' is your DataFrame with a column to transform.
# Let's say 'column_to_transform' is the column with the data.
# Unlike Box-Cox, Yeo-Johnson also handles zero and negative values.

# Apply the Yeo-Johnson transformation
transformed_data, best_lambda = yeojohnson(data['column_to_transform'])

# Add the transformed data as a new column in the DataFrame
data['yeo_johnson_transformed'] = transformed_data
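# ------------------- Worked example: Box-Cox vs. Yeo-Johnson ----------------------
# A minimal, self-contained sketch on synthetic data; the column names
# 'positive_skewed' and 'with_negatives' are made up, not from the snippets
# above. It illustrates the practical difference between the two transforms:
# boxcox() requires strictly positive input, while yeojohnson() also accepts
# zeros and negatives.
import numpy as np
import pandas as pd
from scipy.stats import boxcox, yeojohnson

rng = np.random.default_rng(42)
demo = pd.DataFrame({
    'positive_skewed': rng.exponential(scale=2.0, size=100),    # strictly > 0
    'with_negatives': rng.normal(loc=0.0, scale=1.0, size=100)  # contains negatives
})

bc_values, bc_lambda = boxcox(demo['positive_skewed'])
yj_values, yj_lambda = yeojohnson(demo['with_negatives'])
print(f"Box-Cox lambda: {bc_lambda:.3f}, Yeo-Johnson lambda: {yj_lambda:.3f}")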
# ------------------- Date/Time Features -------------------------------------------
import pandas as pd

# Assuming 'data' is your DataFrame with a column representing date/time data.
# Let's say 'datetime_column' is the column containing the date/time information.

# Convert the 'datetime_column' to a pandas datetime format (if not already done).
data['datetime_column'] = pd.to_datetime(data['datetime_column'])

# Extract date and time features
data['year'] = data['datetime_column'].dt.year
data['month'] = data['datetime_column'].dt.month
data['day'] = data['datetime_column'].dt.day
data['weekday'] = data['datetime_column'].dt.weekday
data['hour'] = data['datetime_column'].dt.hour
data['minute'] = data['datetime_column'].dt.minute

# Example: Create a binary 'weekday/weekend' feature (weekday 5 = Saturday, 6 = Sunday)
data['is_weekend'] = data['weekday'].isin([5, 6]).astype(int)

# ------------------- PCA - Principal Component Analysis ---------------------------
import numpy as np
from sklearn.decomposition import PCA

# Assuming 'X' is your data matrix with features.
# X should be a 2D array or DataFrame with numerical values.

# Initialize PCA with the desired number of components (e.g., 2 for 2D visualization).
pca = PCA(n_components=2)

# Fit and transform the data to get the principal components.
principal_components = pca.fit_transform(X)

# The 'principal_components' matrix contains the data projected onto the
# principal components.

# Access the explained variance ratio of each principal component.
explained_variance_ratio = pca.explained_variance_ratio_

# Access the principal components (eigenvectors).
components = pca.components_

# Access the mean of each feature (used to center the data during transformation).
mean = pca.mean_

# Access the singular values.
singular_values = pca.singular_values_
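# ------------------- Worked example: PCA end-to-end -------------------------------
# A minimal sketch assuming scikit-learn's bundled iris dataset (not part of
# the snippet above). Features are standardized first, since PCA is sensitive
# to feature scale.
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X_iris = load_iris().data  # 150 samples x 4 numeric features
X_scaled = StandardScaler().fit_transform(X_iris)

pca_demo = PCA(n_components=2)
X_2d = pca_demo.fit_transform(X_scaled)

print(X_2d.shape)                                # (150, 2)
print(pca_demo.explained_variance_ratio_.sum())  # share of variance kept by 2 components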
# ------------------- Text Data -----------------------------------------------------
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# The NLTK tokenizer models and stopword list must be downloaded once:
# nltk.download('punkt')
# nltk.download('stopwords')

# Sample text data
text_data = [
    "This is a sample sentence.",
    "Textual data transformation is important for NLP tasks.",
    "Machine learning algorithms analyze text data.",
    "Preprocessing text data involves tokenization and stopword removal."
]

# Tokenization
tokenized_data = [nltk.word_tokenize(text) for text in text_data]

# Stopword Removal
stop_words = set(stopwords.words("english"))
filtered_data = [[word for word in tokens if word.lower() not in stop_words]
                 for tokens in tokenized_data]

# Bag-of-Words (BoW) Representation
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform([" ".join(tokens) for tokens in filtered_data])

# TF-IDF Representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([" ".join(tokens) for tokens in filtered_data])

# --- Another method: a reusable helper around scikit-learn vectorizers ---
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def create_embeddings(messages, vectorizer):
    # Fit and transform our messages
    embeddings = vectorizer.fit_transform(messages)
    # Create our DataFrame, one column per vocabulary term
    df = pd.DataFrame(embeddings.toarray(),
                      columns=vectorizer.get_feature_names_out())
    return df

messages = ['I like to play at the park',
            'I play baseball with friends at the park']

# Create with CountVectorizer
vectorizer = CountVectorizer()
# Send our messages through our function
embeddings = create_embeddings(messages, vectorizer)
# Display our embeddings (notebook-style)
embeddings

# Create with TfidfVectorizer
vectorizer = TfidfVectorizer()
# Send our messages through our function
embeddings = create_embeddings(messages, vectorizer)
# Display our embeddings (notebook-style)
embeddings
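# ------------------- Worked example: transforming unseen text ---------------------
# A minimal sketch: fit a TfidfVectorizer on the 'messages' defined above, then
# encode a new, made-up message with transform() rather than fit_transform(),
# so the vocabulary learned at fit time is reused. Terms not seen during
# fitting are simply ignored.
tfidf = TfidfVectorizer()
tfidf.fit(messages)

new_message = ['I like baseball at the park']  # hypothetical unseen input
new_embedding = tfidf.transform(new_message)

print(tfidf.get_feature_names_out())
print(new_embedding.toarray())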