9. Design and implement hierarchical clustering to cluster species of flowers. Estimate the accuracy of the model. Also write a program to visualize insights from the Iris dataset.

PHOTO EMBED

Sun Nov 03 2024 13:02:03 GMT+0000 (Coordinated Universal Time)

Saved by @varuntej #python

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import StandardScaler 
from sklearn.cluster import AgglomerativeClustering 
from sklearn.metrics import accuracy_score, confusion_matrix 
from scipy.cluster.hierarchy import dendrogram, linkage 
from sklearn.decomposition import PCA 
# Load the Iris dataset from the local CSV file.
df = pd.read_csv('Iris.csv')

# Display the first few rows to understand the structure (optional).
print(df.head())

# Remove the 'Id' column if present.  errors='ignore' makes this a no-op
# when the column is absent, matching the "if present" intent instead of
# raising a KeyError.
df = df.drop(columns=['Id'], errors='ignore')

# Separate features and the actual labels (labels are used only for
# evaluation/visualization, never for fitting the clusterer).
X = df.drop(columns=['Species'])
y_true = df['Species']

# Factorize the species labels ONCE so every subset shares the same
# label -> integer-code mapping.  (Factorizing a per-cluster subset, as
# the original did, assigns codes by order of appearance *within the
# subset*, which generally disagrees with the global codes and corrupts
# the accuracy estimate.)
y_codes, class_names = pd.factorize(y_true)

# Standardize the features so each contributes equally to the distances.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Generate a dendrogram to visualize the hierarchical clustering structure.
plt.figure(figsize=(10, 7))
linked = linkage(X_scaled, method='ward')
dendrogram(linked, labels=y_true.values, orientation='top',
           distance_sort='descending', show_leaf_counts=True)
plt.title("Dendrogram for Iris Dataset")
plt.xlabel("Samples")
plt.ylabel("Euclidean Distance")
plt.show()

# Apply agglomerative (hierarchical) clustering with 3 clusters.
# NOTE: the 'affinity' keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4.  Ward linkage always uses Euclidean distance, so no
# metric argument is needed at all.
hc = AgglomerativeClustering(n_clusters=3, linkage='ward')
y_hc = hc.fit_predict(X_scaled)

# Map each cluster id to the majority true class inside that cluster so
# cluster labels become comparable with the factorized species codes.
y_hc_mapped = np.zeros_like(y_hc)
for cluster_id in range(3):
    mask = (y_hc == cluster_id)
    if mask.any():  # guard against an empty cluster
        y_hc_mapped[mask] = np.bincount(y_codes[mask]).argmax()

# Estimate accuracy by comparing mapped clusters to the true codes.
accuracy = accuracy_score(y_codes, y_hc_mapped)
print("Estimated Accuracy of Hierarchical Clustering:", accuracy)

# Confusion matrix to visualize clustering performance.  Use the class
# names returned by factorize so the tick labels are guaranteed to match
# the integer codes, rather than a hard-coded name list.
conf_matrix = confusion_matrix(y_codes, y_hc_mapped)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Cluster Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix of Hierarchical Clustering")
plt.show()

# Visualize clusters using PCA (reduce to 2D for plotting).
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(12, 5))

# Left panel: clusters found by hierarchical clustering.
plt.subplot(1, 2, 1)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_hc, cmap="viridis", s=50)
plt.title("Hierarchical Clustering on Iris Dataset (PCA-reduced)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label="Cluster")

# Right panel: actual species, colored by the factorized codes so no
# hard-coded species-name-to-number dict is needed (and no NaNs appear
# if the CSV uses different species strings).
plt.subplot(1, 2, 2)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_codes, cmap="viridis", s=50)
plt.title("Actual Species Labels (PCA-reduced)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label="Species")
plt.tight_layout()
plt.show()
content_copyCOPY