"""Cluster the Iris dataset with k-means and compare the clusters to the true species.

The script reads ``Iris.csv``, standardizes the feature columns, fits a
3-cluster k-means model, estimates an "accuracy" by assigning every cluster
to its majority species, and plots a confusion matrix plus two PCA-reduced
scatter plots (cluster assignments vs. actual species).
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA


def _map_clusters_to_classes(cluster_labels, class_codes):
    """Relabel each cluster with the most common class code among its members.

    Args:
        cluster_labels: 1-D integer array of cluster ids, one per sample.
        class_codes: 1-D array of non-negative integer class codes, one per
            sample, using a single coding shared by the whole dataset.

    Returns:
        An integer array shaped like ``cluster_labels`` in which every sample
        carries the majority class code of the cluster it belongs to.
    """
    cluster_labels = np.asarray(cluster_labels)
    class_codes = np.asarray(class_codes)
    mapped = np.zeros_like(cluster_labels)
    # Iterate only over cluster ids that actually occur, so an empty cluster
    # can never reach argmax() with an empty count vector.
    for cluster_id in np.unique(cluster_labels):
        members = cluster_labels == cluster_id
        mapped[members] = np.bincount(class_codes[members]).argmax()
    return mapped


# Load the Iris dataset
df = pd.read_csv('Iris.csv')

# Display the first few rows (optional)
print(df.head())

# Remove the 'Id' column if present; errors='ignore' makes a missing column a no-op
df = df.drop(columns=['Id'], errors='ignore')

# Separate features and the actual labels (for comparison)
X = df.drop(columns=['Species'])
y_true = df['Species']

# Encode the species ONCE so every later step shares the same integer coding.
# Codes follow order of first appearance; class_names[code] is the species label.
class_codes, class_names = pd.factorize(y_true)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply k-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_scaled)
y_kmeans = kmeans.labels_

# Map cluster labels to the actual classes for accuracy estimation.
# k-means labels are arbitrary, so each cluster is assigned the most common
# species among its members. The global codes are used here: factorizing each
# cluster's subset separately would number species by their order of
# appearance inside that subset and produce codes that do not match y_true.
y_kmeans_mapped = _map_clusters_to_classes(y_kmeans, class_codes)

# Estimate accuracy by comparing clusters to actual labels
accuracy = accuracy_score(class_codes, y_kmeans_mapped)
print("Estimated Accuracy of k-Means Clustering:", accuracy)

# Confusion matrix to visualize clustering performance.
# Passing labels explicitly keeps one row/column per species even if no
# cluster was mapped to some species.
conf_matrix = confusion_matrix(
    class_codes, y_kmeans_mapped, labels=np.arange(len(class_names))
)

# Derive tick labels from the data so they always follow the code order
# ('Iris-setosa' -> 'Setosa', etc.).
tick_labels = [str(name).split('-')[-1].capitalize() for name in class_names]

plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
plt.xlabel("Cluster Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix of k-Means Clustering")
plt.show()

# Visualize clusters using PCA (reduce to 2D for plotting)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Plot the clusters and actual labels side by side
plt.figure(figsize=(12, 5))

# Plot k-Means clusters (coloured by raw cluster id)
plt.subplot(1, 2, 1)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_kmeans, cmap="viridis", s=50)
plt.title("k-Means Clustering on Iris Dataset (PCA-reduced)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label="Cluster")

# Plot actual species, coloured by the same integer codes used above so the
# plot does not depend on a hard-coded spelling of the species names.
plt.subplot(1, 2, 2)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=class_codes, cmap="viridis", s=50)
plt.title("Actual Species Labels (PCA-reduced)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label="Species")

plt.tight_layout()
plt.show()