import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.cluster.hierarchy import dendrogram, linkage

# Load the Iris dataset
df = pd.read_csv('Iris.csv')

# Display the first few rows to understand the structure (optional)
print(df.head())

# Remove the 'Id' column if present
df = df.drop(columns=['Id'], errors='ignore')

# Separate features and the actual labels (for comparison only;
# the clustering itself never sees the labels)
X = df.drop(columns=['Species'])
y_true = df['Species']

# Encode the string labels once, globally, so the same code -> class
# correspondence is used everywhere below
y_true_codes, class_names = pd.factorize(y_true)

# Standardize the features so each contributes equally to the distances
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Generate a dendrogram to visualize the hierarchical clustering structure
plt.figure(figsize=(10, 7))
linked = linkage(X_scaled, method='ward')
dendrogram(linked, labels=y_true.values, orientation='top',
           distance_sort='descending', show_leaf_counts=True)
plt.title("Dendrogram for Iris Dataset")
plt.xlabel("Samples")
plt.ylabel("Euclidean Distance")
plt.show()

# Apply hierarchical (agglomerative) clustering; Ward linkage implies
# Euclidean distances, so no metric argument is needed (the old
# `affinity` keyword is deprecated in recent scikit-learn releases)
hc = AgglomerativeClustering(n_clusters=3, linkage='ward')
y_hc = hc.fit_predict(X_scaled)

# Map each cluster to the majority true class among its members.
# Note: the labels are factorized once above, not per cluster; otherwise
# the integer codes would not be comparable across clusters.
y_hc_mapped = np.zeros_like(y_hc)
for i in range(3):
    mask = (y_hc == i)
    y_hc_mapped[mask] = np.bincount(y_true_codes[mask]).argmax()

# Estimate accuracy by comparing the mapped clusters to the actual labels
accuracy = accuracy_score(y_true_codes, y_hc_mapped)
print("Estimated Accuracy of Hierarchical Clustering:", accuracy)

# Confusion matrix to visualize clustering performance
conf_matrix = confusion_matrix(y_true_codes, y_hc_mapped)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Cluster Label (mapped)")
plt.ylabel("True Label")
plt.title("Confusion Matrix of Hierarchical Clustering")
plt.show()

# Reduce to 2D with PCA for plotting
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(12, 5))

# Left: the clusters found by hierarchical clustering
plt.subplot(1, 2, 1)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_hc, cmap="viridis", s=50)
plt.title("Hierarchical Clustering on Iris Dataset (PCA-reduced)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label="Cluster")

# Right: the actual species labels, using the same global encoding
plt.subplot(1, 2, 2)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_true_codes, cmap="viridis", s=50)
plt.title("Actual Species Labels (PCA-reduced)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label="Species")

plt.tight_layout()
plt.show()
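The accuracy figure above depends on the majority-vote mapping, which can assign the same class to two clusters when one true class is split. A minimal sketch of two common alternatives follows, assuming `y_true_codes` and `y_hc` from the script above are in scope: label-invariant scores (ARI, NMI), which need no mapping at all, and an optimal one-to-one cluster-to-class assignment via the Hungarian algorithm (`scipy.optimize.linear_sum_assignment`).

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import (adjusted_rand_score,
                             normalized_mutual_info_score,
                             confusion_matrix)

# Label-invariant scores: these compare the two partitions directly, so
# no cluster-to-class mapping is required and relabelling cannot change them.
print("ARI:", adjusted_rand_score(y_true_codes, y_hc))
print("NMI:", normalized_mutual_info_score(y_true_codes, y_hc))

# One-to-one mapping: choose the cluster -> class assignment that maximizes
# the total count on the matched diagonal of the contingency table.
cm = confusion_matrix(y_true_codes, y_hc)
row_ind, col_ind = linear_sum_assignment(-cm)  # negate to maximize matches
cluster_to_class = {cluster: cls for cls, cluster in zip(row_ind, col_ind)}
y_hc_matched = np.array([cluster_to_class[c] for c in y_hc])
print("Accuracy (Hungarian mapping):", (y_hc_matched == y_true_codes).mean())

Unlike the per-cluster majority vote, the Hungarian assignment is guaranteed to be a bijection between clusters and classes, so no class is counted twice.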