# Naive Bayes and outliers
# Mon Nov 18 2024 11:29:52 GMT+0000 (Coordinated Universal Time)
# Saved by @wtlab
# Train and evaluate a Gaussian Naive Bayes classifier on the Iris dataset
# after dropping rows that the IQR rule flags as outliers.
#
# Pipeline: load CSV -> explore -> check missing values / duplicates ->
# remove IQR outliers -> train/test split -> fit GaussianNB -> report -> plot.

# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# 'Id' is a row identifier and 'Species' is the target; neither is a feature,
# so both are excluded from outlier detection and from the model inputs.
NON_FEATURE_COLUMNS = ['Id', 'Species']
# Width of the IQR fences: values beyond Q1 - k*IQR or Q3 + k*IQR are outliers.
IQR_MULTIPLIER = 1.5
# Fraction of rows held out for testing, and the seed that makes the split
# reproducible.
TEST_SIZE = 0.3
RANDOM_STATE = 42

# Step 1: Load the dataset
df = pd.read_csv('Iris.csv')  # Replace with the correct path if needed

# Step 1.1: Data Exploration
print("Dataset Preview:")
print(df.head())

# Data Overview
print("\nDataset Information:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()

# Step 1.2: Verify Missing Values
print("\nMissing Values:")
print(df.isnull().sum())

# Step 1.3: Verify Duplicates
print("\nDuplicate Rows:")
print(df.duplicated().sum())

# Step 1.4: Detect and Remove Outliers using IQR method
# Only numeric feature columns take part in the check; the selection is done
# once and reused for the quartiles and for the comparison.
numeric_features = (
    df.drop(columns=NON_FEATURE_COLUMNS).select_dtypes(include=[np.number])
)
Q1 = numeric_features.quantile(0.25)
Q3 = numeric_features.quantile(0.75)
IQR = Q3 - Q1

# Outliers are those outside the range (Q1 - 1.5*IQR) and (Q3 + 1.5*IQR)
lower_fence = Q1 - IQR_MULTIPLIER * IQR
upper_fence = Q3 + IQR_MULTIPLIER * IQR
# A row is an outlier if ANY of its feature values falls outside the fences.
outliers_iqr = (
    (numeric_features < lower_fence) | (numeric_features > upper_fence)
).any(axis=1)

# Removing outliers
df_no_outliers_iqr = df[~outliers_iqr]
print(f"\nShape after removing outliers: {df_no_outliers_iqr.shape}")

# Step 1.5: Split Dataset and Perform Naive Bayes Classification
# Define features (X) and target (y)
X = df_no_outliers_iqr.drop(NON_FEATURE_COLUMNS, axis=1)  # Drop 'Id' and 'Species'
y = df_no_outliers_iqr['Species']

# Split the data into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Initialize Naive Bayes classifier (GaussianNB for continuous data)
nb_classifier = GaussianNB()

# Train the model
nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy of the Naive Bayes model: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Optional: Visualize the distribution of data and outliers
# X already holds the outlier-free feature columns, so it is plotted directly.
plt.figure(figsize=(10, 6))
sns.boxplot(data=X)
plt.title('Boxplot to Visualize Outliers After Removal')
plt.show()
# Comments