import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
# Load the Iris dataset from 'Iris.csv' (path is relative to the working directory).
df = pd.read_csv('Iris.csv')

# Display the first few rows (optional)
print(df.head())

# Remove the 'Id' column if present. errors='ignore' turns the drop into a
# no-op when the column is missing; without it pandas raises KeyError, which
# contradicts the "if present" intent.
df = df.drop(columns=['Id'], errors='ignore')

# Separate the features from the actual labels. The labels are not used for
# clustering; they are kept only to compare against the clusters afterwards.
X = df.drop(columns=['Species'])
y_true = df['Species']

# Standardize the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Apply k-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_scaled)
y_kmeans = kmeans.labels_

# Encode the species as integer codes ONCE, over the whole column. The same
# codes must be used both for the per-cluster vote and for scoring:
# factorizing each cluster's subset separately would renumber the species by
# order of appearance inside that subset, so the winning code could refer to
# a different species than the same code in the global encoding.
y_true_codes = y_true.factorize()[0]

# Map cluster labels to the actual classes for accuracy estimation.
# Since k-means labels are arbitrary, each cluster is assigned the most
# common species code among its members.
y_kmeans_mapped = np.zeros_like(y_kmeans)
for cluster_id in range(kmeans.n_clusters):
    mask = (y_kmeans == cluster_id)
    if not mask.any():
        # No members: nothing to assign, and argmax of an empty count raises.
        continue
    y_kmeans_mapped[mask] = np.bincount(y_true_codes[mask]).argmax()

# Estimate accuracy by comparing the mapped clusters to the actual labels
accuracy = accuracy_score(y_true_codes, y_kmeans_mapped)
print("Estimated Accuracy of k-Means Clustering:", accuracy)

# Confusion matrix to visualize clustering performance
conf_matrix = confusion_matrix(y_true_codes, y_kmeans_mapped)
# Draw the confusion matrix as an annotated heatmap.
# The matrix was built from y_true.factorize() codes, so its rows/columns are
# in order of first appearance of each species in the data. Deriving the tick
# labels from the same factorize() call keeps the axis labels aligned with the
# matrix even if the file is not ordered setosa/versicolor/virginica.
# 'Iris-setosa' is displayed as 'Setosa', etc.
class_labels = [
    str(species).replace('Iris-', '').capitalize()
    for species in y_true.factorize()[1]
]
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel("Cluster Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix of k-Means Clustering")
plt.show()
# Project the standardized features onto two principal components so the
# clusters can be drawn on a flat scatter plot.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Numeric encoding of the actual species, used only to colour the second panel.
species_to_num = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
y_true_numeric = y_true.map(species_to_num)

# Both panels share the same points and axis labels; they differ only in the
# colouring, the title and the colorbar caption.
panels = (
    (1, y_kmeans, "k-Means Clustering on Iris Dataset (PCA-reduced)", "Cluster"),
    (2, y_true_numeric, "Actual Species Labels (PCA-reduced)", "Species"),
)

plt.figure(figsize=(12, 5))
for position, point_colors, panel_title, bar_caption in panels:
    plt.subplot(1, 2, position)
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=point_colors, cmap="viridis", s=50)
    plt.title(panel_title)
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.colorbar(label=bar_caption)

plt.tight_layout()
plt.show()
# Comments