import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.decomposition import PCA
# Load the Iris dataset
df = pd.read_csv('Iris.csv')
# Display the first few rows to understand the structure (optional)
print(df.head())
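# Quick data audit (optional sketch): confirm the expected shape and that
# no missing values need handling before scaling
print(df.shape)
print(df.isnull().sum())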
# Remove the 'Id' column if present (the Kaggle version of Iris.csv includes one);
# errors='ignore' keeps this from raising if the column is absent
df = df.drop(columns=['Id'], errors='ignore')
# Separate features and the actual labels (for comparison)
X = df.drop(columns=['Species'])
y_true = df['Species']
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
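# Optional sanity check: after standardization each feature should have
# mean ~0 and standard deviation ~1 (up to floating-point error)
print("Feature means:", X_scaled.mean(axis=0).round(3))
print("Feature stds: ", X_scaled.std(axis=0).round(3))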
# Generate a dendrogram to visualize hierarchical clustering structure
plt.figure(figsize=(10, 7))
linked = linkage(X_scaled, method='ward')
dendrogram(linked, labels=y_true.values, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title("Dendrogram for Iris Dataset")
plt.xlabel("Samples")
plt.ylabel("Euclidean Distance")
plt.show()
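# Alternative (sketch): the same 3-cluster assignment can be read directly off
# the linkage matrix with scipy's fcluster, without refitting via scikit-learn
from scipy.cluster.hierarchy import fcluster
labels_from_tree = fcluster(linked, t=3, criterion='maxclust')  # labels run 1..3 here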
# Apply hierarchical clustering (note: 'affinity' was renamed to 'metric'
# in scikit-learn 1.2; Ward linkage requires Euclidean distance)
hc = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')
y_hc = hc.fit_predict(X_scaled)
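# Label-free quality check (sketch): the silhouette score rates cluster
# cohesion vs. separation on a [-1, 1] scale without using the true species
from sklearn.metrics import silhouette_score
print("Silhouette score:", silhouette_score(X_scaled, y_hc))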
# Map cluster labels to actual classes for accuracy estimation.
# Factorize once on the full label series so the integer codes are consistent
# across clusters (factorizing each masked subset would renumber the codes).
y_true_codes, species_names = y_true.factorize()
y_hc_mapped = np.zeros_like(y_hc)
for i in range(3):
    mask = (y_hc == i)
    # Assign the whole cluster to its most common true class
    y_hc_mapped[mask] = np.bincount(y_true_codes[mask]).argmax()
# Estimate accuracy by comparing clusters to actual labels
accuracy = accuracy_score(y_true_codes, y_hc_mapped)
print("Estimated Accuracy of Hierarchical Clustering:", accuracy)
# Confusion matrix to visualize clustering performance
conf_matrix = confusion_matrix(y_true_codes, y_hc_mapped)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=species_names, yticklabels=species_names)
plt.xlabel("Cluster Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix of Hierarchical Clustering")
plt.show()
# Visualize clusters using PCA (reduce to 2D for plotting)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
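# How much structure survives the 2-D projection: explained variance per component
print("Explained variance ratio:", pca.explained_variance_ratio_)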
# Plot the clusters and actual labels
plt.figure(figsize=(12, 5))
# Plot hierarchical clusters
plt.subplot(1, 2, 1)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_hc, cmap="viridis", s=50)
plt.title("Hierarchical Clustering on Iris Dataset (PCA-reduced)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label="Cluster")
# Plot actual species, reusing the integer codes from the factorize call above
plt.subplot(1, 2, 2)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_true_codes, cmap="viridis", s=50)
plt.title("Actual Species Labels (PCA-reduced)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label="Species")
plt.tight_layout()
plt.show()