# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
# Step 1: Load the dataset
# The path is resolved relative to the current working directory.
df = pd.read_csv('Iris.csv') # Replace with the correct path if needed
# Step 1.1: Data Exploration
# Show the first rows to sanity-check column names and value formats.
print("Dataset Preview:")
print(df.head())
# Data Overview
print("\nDataset Information:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()
# Step 1.2: Verify Missing Values
# Per-column count of null cells.
print("\nMissing Values:")
print(df.isnull().sum())
# Step 1.3: Verify Duplicates
# Counts rows that repeat an earlier row across ALL columns.
# NOTE(review): if 'Id' is a unique row identifier, this count is always 0
# even when the measurements repeat -- confirm whether 'Id' should be excluded.
print("\nDuplicate Rows:")
print(df.duplicated().sum())
# Step 1.4: Detect and Remove Outliers using IQR method
# Only numeric measurement columns take part in the check. 'Id' is excluded
# because it is dropped as a non-feature before modelling, so its values must
# never cause a row to be discarded. errors='ignore' keeps this working when
# 'Id' is not among the numeric columns.
numeric_features = df.select_dtypes(include=[np.number]).drop(columns=['Id'], errors='ignore')
# Per-column quartiles and interquartile range
Q1 = numeric_features.quantile(0.25)
Q3 = numeric_features.quantile(0.75)
IQR = Q3 - Q1
# Outliers are those outside the range (Q1 - 1.5*IQR) and (Q3 + 1.5*IQR)
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
# A row is flagged when ANY of its numeric columns falls outside its fences.
outliers_iqr = ((numeric_features < lower_fence) |
                (numeric_features > upper_fence)).any(axis=1)
# Removing outliers: keep every original column, drop only the flagged rows.
df_no_outliers_iqr = df[~outliers_iqr]
print(f"\nShape after removing outliers: {df_no_outliers_iqr.shape}")
# Step 1.5: Split Dataset and Perform Naive Bayes Classification
# Target (y) is the 'Species' label; features (X) are every remaining column
# except the 'Id' column.
y = df_no_outliers_iqr['Species']
X = df_no_outliers_iqr.drop(columns=['Id', 'Species'])
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Build the Gaussian Naive Bayes classifier (suited to continuous features) and
# train it in one step -- fit() returns the fitted estimator itself.
nb_classifier = GaussianNB().fit(X_train, y_train)
# Predict labels for the held-out rows
y_pred = nb_classifier.predict(X_test)
# Report overall accuracy followed by the per-class metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy of the Naive Bayes model: {accuracy * 100:.2f}%")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
# Optional: box plot of each feature column after outlier removal
# Only the feature columns are plotted; 'Id' and 'Species' are left out.
feature_columns = df_no_outliers_iqr.drop(columns=['Id', 'Species'])
plt.figure(figsize=(10, 6))
sns.boxplot(data=feature_columns)
plt.title('Boxplot to Visualize Outliers After Removal')
plt.show()
# Preview:
# Download PNG
# Download JPEG
# Download SVG
# Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
# Click to optimize width for Twitter