Preview:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import StandardScaler 
from sklearn.cluster import AgglomerativeClustering 
from sklearn.metrics import accuracy_score, confusion_matrix 
from scipy.cluster.hierarchy import dendrogram, linkage 
from sklearn.decomposition import PCA 
# Load the Iris dataset 
df = pd.read_csv('Iris.csv') 
# Display the first few rows to understand the structure (optional) 
print(df.head()) 
# Remove the 'Id' column if present 
df = df.drop(columns=['Id']) 
# Separate features and the actual labels (for comparison) 
X = df.drop(columns=['Species']) 
y_true = df['Species'] 
# Standardize the features 
scaler = StandardScaler() 
X_scaled = scaler.fit_transform(X) 
# Generate a dendrogram to visualize hierarchical clustering structure 
plt.figure(figsize=(10, 7)) 
linked = linkage(X_scaled, method='ward') 
dendrogram(linked, labels=y_true.values, orientation='top', distance_sort='descending', show_leaf_counts=True) 
plt.title("Dendrogram for Iris Dataset") 
plt.xlabel("Samples") 
plt.ylabel("Euclidean Distance") 
plt.show() 
# Apply Hierarchical Clustering 
hc = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward') 
y_hc = hc.fit_predict(X_scaled) 
# Map cluster labels to actual classes for accuracy estimation 
y_hc_mapped = np.zeros_like(y_hc) 
for i in range(3): 
mask = (y_hc == i) 
y_hc_mapped[mask] = np.bincount(y_true[mask].factorize()[0]).argmax() 
# Estimate accuracy by comparing clusters to actual labels 
accuracy = accuracy_score(y_true.factorize()[0], y_hc_mapped) 
print("Estimated Accuracy of Hierarchical Clustering:", accuracy) 
# Confusion matrix to visualize clustering performance 
conf_matrix = confusion_matrix(y_true.factorize()[0], y_hc_mapped) 
plt.figure(figsize=(6, 4)) 
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Setosa', 'Versicolor', 'Virginica'], 
yticklabels=['Setosa', 'Versicolor', 'Virginica']) 
plt.xlabel("Cluster Label") 
plt.ylabel("True Label") 
plt.title("Confusion Matrix of Hierarchical Clustering") 
plt.show() 
# Visualize clusters using PCA (reduce to 2D for plotting) 
pca = PCA(n_components=2) 
X_pca = pca.fit_transform(X_scaled) 
# Plot the clusters and actual labels 
plt.figure(figsize=(12, 5)) 
# Plot hierarchical clusters 
plt.subplot(1, 2, 1) 
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_hc, cmap="viridis", s=50) 
plt.title("Hierarchical Clustering on Iris Dataset (PCA-reduced)") 
plt.xlabel("Principal Component 1") 
plt.ylabel("Principal Component 2") 
plt.colorbar(label="Cluster") 
# Plot actual species 
species_to_num = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2} 
y_true_numeric = y_true.map(species_to_num) 
plt.subplot(1, 2, 2) 
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_true_numeric, cmap="viridis", s=50) 
plt.title("Actual Species Labels (PCA-reduced)") 
plt.xlabel("Principal Component 1") 
plt.ylabel("Principal Component 2") 
plt.colorbar(label="Species") 
plt.tight_layout() 
plt.show()
downloadDownload PNG downloadDownload JPEG downloadDownload SVG

Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!

Click to optimize width for Twitter