8. Design and implement k-Means clustering to cluster species of flowers. Estimate the accuracy of the model. Also write a program to visualize insights from the Iris dataset.



import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import StandardScaler 
from sklearn.cluster import KMeans 
from sklearn.metrics import accuracy_score, confusion_matrix 
from sklearn.decomposition import PCA 
# Load the Iris dataset 
df = pd.read_csv('Iris.csv') 
# Display the first few rows (optional) 
print(df.head()) 
# Remove the 'Id' column if present
df = df.drop(columns=['Id'], errors='ignore')
# Separate features and the actual labels (for comparison) 
X = df.drop(columns=['Species']) 
y_true = df['Species'] 
# Standardize the features 
scaler = StandardScaler() 
X_scaled = scaler.fit_transform(X) 
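# (Added sketch, not part of the original listing) The elbow method is one way
# to sanity-check the choice of k: plot the within-cluster sum of squares
# (inertia) for several k values and look for the bend, which for Iris sits at k=3.
inertias = []
for k in range(1, 8):
    inertias.append(KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled).inertia_)
plt.plot(range(1, 8), inertias, marker='o')
plt.xlabel("Number of clusters k")
plt.ylabel("Inertia")
plt.title("Elbow Method for Choosing k")
plt.show()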
# Apply k-Means clustering 
kmeans = KMeans(n_clusters=3, random_state=42) 
kmeans.fit(X_scaled) 
y_kmeans = kmeans.labels_ 
# Map cluster labels to the actual classes for accuracy estimation.
# Since k-means labels are arbitrary, map each cluster to the most common
# species inside it, using a single global integer encoding of the species names.
y_true_codes, species_names = pd.factorize(y_true)
y_kmeans_mapped = np.zeros_like(y_kmeans)
for i in range(3):
    mask = (y_kmeans == i)
    y_kmeans_mapped[mask] = np.bincount(y_true_codes[mask]).argmax()
# Estimate accuracy by comparing the mapped clusters to the actual labels
accuracy = accuracy_score(y_true_codes, y_kmeans_mapped)
print("Estimated Accuracy of k-Means Clustering:", accuracy)
# Confusion matrix to visualize clustering performance 
conf_matrix = confusion_matrix(y_true_codes, y_kmeans_mapped)
plt.figure(figsize=(6, 4)) 
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=['Setosa', 'Versicolor', 'Virginica'],
            yticklabels=['Setosa', 'Versicolor', 'Virginica'])
plt.xlabel("Cluster Label") 
plt.ylabel("True Label") 
plt.title("Confusion Matrix of k-Means Clustering") 
plt.show() 
# Visualize clusters using PCA (reduce to 2D for plotting) 
pca = PCA(n_components=2) 
X_pca = pca.fit_transform(X_scaled) 
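# (Added) Report how much of the total variance the 2D projection retains,
# so the scatter plots below can be read with the right caveat.
print("Variance explained by the first two principal components:",
      pca.explained_variance_ratio_.sum())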
# Plot the clusters and actual labels 
plt.figure(figsize=(12, 5)) 
# Plot k-Means clusters 
plt.subplot(1, 2, 1) 
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_kmeans, cmap="viridis", s=50) 
plt.title("k-Means Clustering on Iris Dataset (PCA-reduced)") 
plt.xlabel("Principal Component 1") 
plt.ylabel("Principal Component 2") 
plt.colorbar(label="Cluster") 
# Plot actual species (reusing the same integer encoding from the factorized labels)
y_true_numeric = y_true_codes
plt.subplot(1, 2, 2) 
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_true_numeric, cmap="viridis", s=50) 
plt.title("Actual Species Labels (PCA-reduced)") 
plt.xlabel("Principal Component 1") 
plt.ylabel("Principal Component 2") 
plt.colorbar(label="Species") 
plt.tight_layout() 
plt.show()
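# (Added sketch for the "visualize insights" part of the task, assuming the same
# Iris.csv layout with a 'Species' column) A pairplot shows which measurements
# separate the species, and a correlation heatmap summarises how the four
# numeric features relate to each other.
sns.pairplot(df, hue='Species')
plt.suptitle("Pairwise Feature Relationships in the Iris Dataset", y=1.02)
plt.show()

plt.figure(figsize=(6, 4))
sns.heatmap(df.drop(columns=['Species']).corr(), annot=True, cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()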