T1. Write a Python program to import and export data using pandas library functions.
import pandas as pd

# Import data from a CSV file
csv_file = "sample_data.csv"
csv_data = pd.read_csv(csv_file, sep=",")
print("CSV data imported successfully:")
print(csv_data)

# Import data from an Excel file (requires an Excel engine such as openpyxl)
excel_file = "sample.xlsx"
excel_data = pd.read_excel(excel_file)
print("Excel data imported successfully:")
print(excel_data)
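
The listing above covers only the import half of the task; a minimal sketch of the export half, reusing the two DataFrames created above (the output file names are illustrative):

# Export the imported data back out using pandas writer functions
csv_data.to_csv("exported_data.csv", index=False)       # DataFrame -> CSV
excel_data.to_excel("exported_data.xlsx", index=False)  # DataFrame -> Excel (needs openpyxl)
print("Data exported successfully.")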

T2. Demonstrate the following data pre-processing techniques on the given dataset:
a. Standardization
b. Normalization
c. Summarization
d. De-duplication
e. Imputation

Program:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
# Sample Data with Missing Values and Duplicates
data = {
 'Name': ['Alice', 'Bob', 'Charlie', 'Alice'],
 'Age': [25, 30, 35, 25],
 'Salary': [50000, 60000, None, 50000],
 'City': ['New York', 'Los Angeles', 'Chicago', 'New York']
}
# Create DataFrame
df = pd.DataFrame(data)
# a. Standardization
scaler = StandardScaler()
df[['Age', 'Salary']] = scaler.fit_transform(df[['Age', 'Salary']])
print("\nStandardized Data:\n", df)
# b. Normalization
normalizer = MinMaxScaler()
df[['Age', 'Salary']] = normalizer.fit_transform(df[['Age', 'Salary']])
print("\nNormalized Data:\n", df)
# c. Summarization
summary = df.describe()
print("\nData Summary:\n", summary)
# d. De-duplication
df_deduplicated = df.drop_duplicates()
print("\nDe-duplicated Data:\n", df_deduplicated)
# e. Imputation
imputer = SimpleImputer(strategy='mean')
df[['Salary']] = imputer.fit_transform(df[['Salary']])
print("\nData with Imputed Values:\n", df)

T3. Implement the Find-S algorithm and the Candidate Elimination algorithm.
def find_s_algorithm(examples):
    # Start with the most specific hypothesis
    hypothesis = ['0'] * len(examples[0][0])

    for attributes, label in examples:
        if label == 'Yes':  # Only consider positive examples
            for i in range(len(hypothesis)):
                if hypothesis[i] == '0':
                    hypothesis[i] = attributes[i]
                elif hypothesis[i] != attributes[i]:
                    hypothesis[i] = '?'  # Generalize
    return hypothesis

def candidate_elimination_algorithm(examples):
    num_attributes = len(examples[0][0])
    # Start with most specific S and most general G
    S = ['0'] * num_attributes
    G = [['?' for _ in range(num_attributes)]]

    for instance, label in examples:
        if label == 'Yes':
            # Remove from G any hypothesis inconsistent with the instance
            G = [g for g in G if consistent(g, instance)]

            for i in range(num_attributes):
                if S[i] == '0':
                    S[i] = instance[i]
                elif S[i] != instance[i]:
                    S[i] = '?'
        else:  # label == 'No'
            G_new = []
            for g in G:
                for i in range(num_attributes):
                    if g[i] == '?':
                        # Specialize only where S has a specific value that differs
                        # from the negative instance, so the specialized hypothesis
                        # actually excludes that instance
                        if S[i] != '?' and S[i] != instance[i]:
                            g_new = g.copy()
                            g_new[i] = S[i]
                            if g_new not in G_new:
                                G_new.append(g_new)
            G = G_new
    return S, G

def consistent(hypothesis, instance):
    for h, x in zip(hypothesis, instance):
        if h != '?' and h != x:
            return False
    return True

# Each row is a tuple (attributes, label)
dataset = [
    (['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same'], 'Yes'),
    (['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same'], 'Yes'),
    (['Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change'], 'No'),
    (['Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change'], 'Yes'),
]

# Find-S Output
hypothesis = find_s_algorithm(dataset)
print("Final hypothesis from Find-S:", hypothesis)

# Candidate Elimination Output
S, G = candidate_elimination_algorithm(dataset)
print("Final specific hypothesis (S):", S)
print("Final general hypotheses (G):", G)


T4. Demonstrate regression techniques to predict the responses at unknown locations by fitting linear and polynomial regression surfaces. Extract error measures and plot the residuals. Further, add a regularizer and demonstrate the reduction in variance (Ridge and LASSO).

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
# In cmd: pip install numpy matplotlib seaborn scikit-learn (if not already installed)
# 1. Generate synthetic data
np.random.seed(42)
X = 2 - 3 * np.random.normal(0, 1, 100)
y = X**3 + X**2 + np.random.normal(0, 5, 100)
X = X.reshape(-1, 1)

# 2. Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)
rmse_lin = np.sqrt(mean_squared_error(y_test, y_pred_lin))
r2_lin = r2_score(y_test, y_pred_lin)

# 4. Polynomial Regression
poly = PolynomialFeatures(degree=3)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

poly_reg = LinearRegression()
poly_reg.fit(X_train_poly, y_train)
y_pred_poly = poly_reg.predict(X_test_poly)
rmse_poly = np.sqrt(mean_squared_error(y_test, y_pred_poly))
r2_poly = r2_score(y_test, y_pred_poly)

# 5. Residual Plot for Polynomial Regression
residuals = y_test - y_pred_poly
plt.figure(figsize=(8, 5))
sns.residplot(x=y_pred_poly, y=residuals, lowess=False, color='g')
plt.title("Residual Plot - Polynomial Regression")
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.axhline(0, color='red', linestyle='--')
plt.show()

# 6. Ridge Regression
ridge = Ridge(alpha=1)
ridge.fit(X_train_poly, y_train)
y_pred_ridge = ridge.predict(X_test_poly)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
r2_ridge = r2_score(y_test, y_pred_ridge)

# 7. Lasso Regression (a larger max_iter helps coordinate descent converge on the unscaled polynomial features)
lasso = Lasso(alpha=0.1, max_iter=10000)
lasso.fit(X_train_poly, y_train)
y_pred_lasso = lasso.predict(X_test_poly)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
r2_lasso = r2_score(y_test, y_pred_lasso)

# 8. Plotting all models
X_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
X_range_poly = poly.transform(X_range)

plt.figure(figsize=(10, 6))
plt.scatter(X, y, label="Original Data", alpha=0.6)
plt.plot(X_range, lin_reg.predict(X_range), label="Linear", color="blue")
plt.plot(X_range, poly_reg.predict(X_range_poly), label="Polynomial (deg 3)", color="green")
plt.plot(X_range, ridge.predict(X_range_poly), label="Ridge", color="purple")
plt.plot(X_range, lasso.predict(X_range_poly), label="Lasso", color="orange")
plt.title("Regression Models Comparison")
plt.xlabel("X")
plt.ylabel("y")
plt.legend()
plt.grid(True)
plt.show()

# 9. Print performance
print("Model Performance Summary:\n")
print(f"Linear Regression     -> RMSE: {rmse_lin:.2f}, R²: {r2_lin:.2f}")
print(f"Polynomial Regression -> RMSE: {rmse_poly:.2f}, R²: {r2_poly:.2f}")
print(f"Ridge Regression      -> RMSE: {rmse_ridge:.2f}, R²: {r2_ridge:.2f}")
print(f"Lasso Regression      -> RMSE: {rmse_lasso:.2f}, R²: {r2_lasso:.2f}")

T5. Demonstrate the capability of PCA and LDA in dimensionality reduction.
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target
target_names = iris.target_names

# Apply PCA to reduce dimensions to 2
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Apply LDA to reduce dimensions to 2
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X, y)

# Plot PCA results
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
colors = ['navy', 'turquoise', 'darkorange']
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(X_pca[y == i, 0], X_pca[y == i, 1], alpha=0.8, color=color, label=target_name)
plt.title('PCA on Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()

# Plot LDA results
plt.subplot(1, 2, 2)
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(X_lda[y == i, 0], X_lda[y == i, 1], alpha=0.8, color=color, label=target_name)
plt.title('LDA on Iris Dataset')
plt.xlabel('Linear Discriminant 1')
plt.ylabel('Linear Discriminant 2')
plt.legend()

plt.tight_layout()
plt.show()

# Explained variance for PCA
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained variance by PCA components:", explained_variance_ratio)


T6. Implement the K-Nearest Neighbors (KNN) classification algorithm on the Iris dataset.
# Import necessary libraries
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the K-NN classifier with K=5
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the model
knn.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = knn.predict(X_test_scaled)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, cmap="Blues", xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - KNN")
plt.show()
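
The choice K=5 above is arbitrary; a minimal sketch, reusing the scaled splits above, of sweeping K to see how the neighborhood size affects test accuracy:

# Sweep K and compare test accuracy for each neighborhood size
k_values = range(1, 16)
accuracies = []
for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train_scaled, y_train)
    accuracies.append(accuracy_score(y_test, model.predict(X_test_scaled)))

plt.plot(k_values, accuracies, marker='o')
plt.xlabel("K (number of neighbors)")
plt.ylabel("Test accuracy")
plt.title("KNN Accuracy vs. K")
plt.show()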


T7. Apply a suitable classifier model to classify the credit status as good or bad on German credit dataset.csv, and create a confusion matrix to measure the accuracy of the model (using Logistic Regression/SVM/Naïve Bayes).
Dataset -> https://online.stat.psu.edu/stat857/node/215/
# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Step 2: Load the German Credit dataset
df = pd.read_csv("German credit dataset.csv")

# Step 3: Preprocess the data
# Encode categorical columns
df_encoded = df.copy()
label_encoders = {}

for column in df_encoded.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_encoded[column] = le.fit_transform(df_encoded[column])
    label_encoders[column] = le

# Step 4: Split data into features (X) and target (y)
# Assuming 'Creditability' or similar is the target column; adjust if needed
target_column = 'Creditability'  # Update this if your dataset has a different column
X = df_encoded.drop(target_column, axis=1)
y = df_encoded[target_column]

# Step 5: Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train a classifier (e.g., Logistic Regression)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 7: Make predictions
y_pred = model.predict(X_test)

# Step 8: Evaluate the model
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("✅ Confusion Matrix:\n", cm)
print("\n🎯 Accuracy Score:", accuracy)

# Optional: Display confusion matrix visually
ConfusionMatrixDisplay(confusion_matrix=cm).plot()
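
The task allows Logistic Regression, SVM, or Naïve Bayes; a minimal sketch of swapping in the other two classifiers on the same train/test split (default hyperparameters, for illustration only):

# Step 9: Try the alternative classifiers mentioned in the task
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

for name, clf in [("SVM", SVC()), ("Naive Bayes", GaussianNB())]:
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    print(f"{name} accuracy: {accuracy_score(y_test, preds):.3f}")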


T8. Apply a train-test split and develop a regression model to predict the sold price of players using imb381ipl2013.csv. Build a correlation matrix between all the numeric features in the dataset, visualize it as a heatmap, and report the RMSE on the train and test data.

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the dataset
file_path = "imb381ipl2013.csv"  # Replace with your file path if needed
data = pd.read_csv(file_path)

# Display basic information and head of the dataset
print("Dataset Info:")
print(data.info())
print("\nFirst 5 Rows:")
print(data.head())

# Check for missing values and drop rows with NaN
data.dropna(inplace=True)

# Define target variable (Sold Price) and features
# Keep only numeric feature columns; non-numeric columns (e.g. player names)
# would need encoding before a linear model could use them
y = data['Sold Price']
X = data.select_dtypes(include=[np.number]).drop(columns=['Sold Price'])

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate RMSE for train and test data
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"\nRMSE on Training Data: {rmse_train:.2f}")
print(f"RMSE on Test Data: {rmse_test:.2f}")

# Build correlation matrix for numeric features
numeric_features = data.select_dtypes(include=[np.number])
correlation_matrix = numeric_features.corr()

# Plot heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of Numeric Features')
plt.show()
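
As a follow-up to the heatmap, a short sketch (assuming the 'Sold Price' column name used above) that ranks which numeric features correlate most strongly with the target:

# Rank numeric features by absolute correlation with the sold price
target_corr = correlation_matrix['Sold Price'].drop('Sold Price')
print("\nCorrelation of numeric features with Sold Price:")
print(target_corr.abs().sort_values(ascending=False))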

T11. For the glass identification dataset, fit a random forest classifier to classify the glass type.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
# Download from: https://archive.ics.uci.edu/ml/datasets/glass+identification
# The raw UCI file (glass.data) has no header row and a leading Id column;
# if your 'glass.csv' already contains a header, drop the names= argument instead
column_names = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']
data = pd.read_csv('glass.csv', names=column_names)
data = data.drop('Id', axis=1)

# Features and target
X = data.drop('Type', axis=1)
y = data['Type']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Optional: Plot confusion matrix heatmap
plt.figure(figsize=(8,6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
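
Random forests also expose per-feature importances; a short addition, reusing the fitted clf and the feature columns above, that shows which measurements drive the classification:

# Plot feature importances from the fitted random forest
importances = pd.Series(clf.feature_importances_, index=X.columns).sort_values()
importances.plot(kind='barh')
plt.title('Random Forest Feature Importances')
plt.xlabel('Importance')
plt.show()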


T12. Implement the K-Means clustering algorithm using Python. You may use a library such as scikit-learn for this purpose.

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Generate sample data using make_blobs
# Create 300 samples with 3 cluster centers
X, y_true = make_blobs(n_samples=300, centers=3, cluster_std=0.60, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Visualize the raw data
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], s=50, c='gray', marker='o')
plt.title('Generated Raw Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# Apply K-Means Clustering
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans.fit(X_scaled)

# Get the cluster labels and cluster centers
y_kmeans = kmeans.labels_
centers = kmeans.cluster_centers_

# Visualize the clusters
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y_kmeans, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='red', marker='X', s=200, label='Centroids')
plt.title('K-Means Clustering Results')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()

# Print cluster centers and inertia
print("Cluster Centers (after scaling):\n", centers)
print(f"Inertia (Sum of Squared Distances): {kmeans.inertia_:.2f}")

# Calculate the optimal number of clusters using the Elbow Method
inertia_values = []
k_range = range(1, 11)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia_values.append(kmeans.inertia_)

# Plot the Elbow Method
plt.plot(k_range, inertia_values, marker='o')
plt.title('Elbow Method to Determine Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia (Sum of Squared Distances)')
plt.show()
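
Alongside the elbow method, a minimal sketch (an addition using scikit-learn's silhouette score, where values closer to 1 indicate better-separated clusters) of a second way to judge the chosen k:

# Evaluate cluster quality for k=3 with the silhouette score
from sklearn.metrics import silhouette_score

kmeans_final = KMeans(n_clusters=3, random_state=42, n_init=10).fit(X_scaled)
score = silhouette_score(X_scaled, kmeans_final.labels_)
print(f"Silhouette score for k=3: {score:.3f}")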


T13. Implement the Agglomerative Hierarchical clustering algorithm using Python. Utilize linkage methods such as 'ward', 'complete', or 'average'.

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

# Generate sample data using make_blobs
# Create 300 samples with 3 cluster centers
X, y_true = make_blobs(n_samples=300, centers=3, cluster_std=0.70, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Plot the raw data
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], s=50, c='gray', marker='o')
plt.title('Generated Raw Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# Define linkage methods to be used
linkage_methods = ['ward', 'complete', 'average']

# Plot dendrograms for different linkage methods
plt.figure(figsize=(15, 5))
for i, method in enumerate(linkage_methods):
    plt.subplot(1, 3, i + 1)
    Z = linkage(X_scaled, method=method)
    dendrogram(Z, truncate_mode='level', p=5)
    plt.title(f'Dendrogram using {method.capitalize()} Linkage')
    plt.xlabel('Data Points')
    plt.ylabel('Distance')

plt.tight_layout()
plt.show()

# Apply Agglomerative Clustering using 'ward' linkage
n_clusters = 3  # Number of clusters
model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
y_pred = model.fit_predict(X_scaled)

# Plot the clusters
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y_pred, cmap='viridis', s=50)
plt.title('Agglomerative Hierarchical Clustering (Ward Linkage)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
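
To compare the linkage methods beyond the dendrograms, a short addition (assuming the scaled data and the known blob labels above) that scores each linkage against the true grouping using the adjusted Rand index:

# Score each linkage method against the known blob labels
from sklearn.metrics import adjusted_rand_score

for method in linkage_methods:
    labels = AgglomerativeClustering(n_clusters=3, linkage=method).fit_predict(X_scaled)
    print(f"{method:>8} linkage: adjusted Rand index = {adjusted_rand_score(y_true, labels):.3f}")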
