ml
T1. Write a Python program to import and export data using pandas library functions.

import pandas as pd

# Import data from a CSV file
csv_file = r"\sample_data.csv"
csv_data = pd.read_csv(csv_file, sep=",")
print("Data imported successfully:")
print(csv_data)

# Import data from an Excel file
excel_file = r"\sample.xlsx"
excel_data = pd.read_excel(excel_file)
print(excel_data)

# Export the data back out (output file names here are placeholders; adjust as needed)
csv_data.to_csv("exported_data.csv", index=False)
excel_data.to_excel("exported_data.xlsx", index=False)
print("Data exported successfully.")

T2. Demonstrate the following data pre-processing techniques on the given dataset:
    a. Standardization
    b. Normalization
    c. Summarization
    d. De-duplication
    e. Imputation

Program:

import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

# Sample data with missing values and duplicates
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice'],
    'Age': [25, 30, 35, 25],
    'Salary': [50000, 60000, None, 50000],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York']
}

# Create DataFrame
df = pd.DataFrame(data)

# a. Standardization
# Note: recent scikit-learn scalers ignore NaN during fit, so the missing Salary
# value is carried through unchanged until the imputation step below.
scaler = StandardScaler()
df[['Age', 'Salary']] = scaler.fit_transform(df[['Age', 'Salary']])
print("\nStandardized Data:\n", df)

# b. Normalization
normalizer = MinMaxScaler()
df[['Age', 'Salary']] = normalizer.fit_transform(df[['Age', 'Salary']])
print("\nNormalized Data:\n", df)

# c. Summarization
summary = df.describe()
print("\nData Summary:\n", summary)

# d. De-duplication
df_deduplicated = df.drop_duplicates()
print("\nDe-duplicated Data:\n", df_deduplicated)

# e. Imputation
imputer = SimpleImputer(strategy='mean')
df[['Salary']] = imputer.fit_transform(df[['Salary']])
print("\nData with Imputed Values:\n", df)

T3. Implement the Find-S algorithm and the Candidate Elimination algorithm.

def find_s_algorithm(examples):
    # Start with the most specific hypothesis
    hypothesis = ['0'] * len(examples[0][0])
    for attributes, label in examples:
        if label == 'Yes':  # Only consider positive examples
            for i in range(len(hypothesis)):
                if hypothesis[i] == '0':
                    hypothesis[i] = attributes[i]
                elif hypothesis[i] != attributes[i]:
                    hypothesis[i] = '?'  # Generalize
    return hypothesis

def candidate_elimination_algorithm(examples):
    num_attributes = len(examples[0][0])
    # Start with the most specific S and the most general G
    S = ['0'] * num_attributes
    G = [['?' for _ in range(num_attributes)]]
    for instance, label in examples:
        if label == 'Yes':
            # Remove from G any hypothesis inconsistent with the instance
            G = [g for g in G if consistent(g, instance)]
            for i in range(num_attributes):
                if S[i] == '0':
                    S[i] = instance[i]
                elif S[i] != instance[i]:
                    S[i] = '?'
        else:  # label == 'No'
            G_new = []
            for g in G:
                for i in range(num_attributes):
                    if g[i] == '?':
                        # Specialize only on attributes where S has a concrete value
                        # that differs from the negative instance; without this guard
                        # the all-'?' hypothesis would wrongly survive in G.
                        if S[i] not in ('0', '?') and S[i] != instance[i]:
                            g_new = g.copy()
                            g_new[i] = S[i]
                            if g_new not in G_new:
                                G_new.append(g_new)
            G = G_new
    return S, G

def consistent(hypothesis, instance):
    for h, x in zip(hypothesis, instance):
        if h != '?' and h != x:
            return False
    return True

# Each row is a tuple (attributes, label)
dataset = [
    (['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same'], 'Yes'),
    (['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same'], 'Yes'),
    (['Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change'], 'No'),
    (['Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change'], 'Yes'),
]

# Find-S output
hypothesis = find_s_algorithm(dataset)
print("Final hypothesis from Find-S:", hypothesis)

# Candidate Elimination output
S, G = candidate_elimination_algorithm(dataset)
print("Final specific hypothesis (S):", S)
print("Final general hypotheses (G):", G)
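As an optional sanity check (an addition beyond the original program), both algorithms should reproduce the well-known boundaries for this classic EnjoySport-style training data. A small sketch, assuming the functions and dataset defined above:

# Optional sanity check (not part of the original exercise); reuses the
# functions and `dataset` defined above.
expected_s = ['Sunny', 'Warm', '?', 'Strong', '?', '?']
assert find_s_algorithm(dataset) == expected_s
S_check, G_check = candidate_elimination_algorithm(dataset)
assert S_check == expected_s
assert sorted(G_check) == sorted([['Sunny', '?', '?', '?', '?', '?'],
                                  ['?', 'Warm', '?', '?', '?', '?']])
print("Find-S and Candidate Elimination match the expected boundaries.")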
T4. Demonstrate regression techniques to predict the responses at unknown locations by fitting linear and polynomial regression surfaces. Extract error measures and plot the residuals. Further, add a regularizer (Ridge and LASSO) and demonstrate the reduction in variance.

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
# Install dependencies if needed: pip install numpy matplotlib seaborn scikit-learn

# 1. Generate synthetic data
np.random.seed(42)
X = 2 - 3 * np.random.normal(0, 1, 100)
y = X**3 + X**2 + np.random.normal(0, 5, 100)
X = X.reshape(-1, 1)

# 2. Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)
rmse_lin = np.sqrt(mean_squared_error(y_test, y_pred_lin))
r2_lin = r2_score(y_test, y_pred_lin)

# 4. Polynomial Regression
poly = PolynomialFeatures(degree=3)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
poly_reg = LinearRegression()
poly_reg.fit(X_train_poly, y_train)
y_pred_poly = poly_reg.predict(X_test_poly)
rmse_poly = np.sqrt(mean_squared_error(y_test, y_pred_poly))
r2_poly = r2_score(y_test, y_pred_poly)

# 5. Residual plot for polynomial regression
residuals = y_test - y_pred_poly
plt.figure(figsize=(8, 5))
sns.residplot(x=y_pred_poly, y=residuals, lowess=False, color='g')
plt.title("Residual Plot - Polynomial Regression")
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.axhline(0, color='red', linestyle='--')
plt.show()

# 6. Ridge Regression
ridge = Ridge(alpha=1)
ridge.fit(X_train_poly, y_train)
y_pred_ridge = ridge.predict(X_test_poly)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
r2_ridge = r2_score(y_test, y_pred_ridge)

# 7. Lasso Regression
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_poly, y_train)
y_pred_lasso = lasso.predict(X_test_poly)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
r2_lasso = r2_score(y_test, y_pred_lasso)

# 8. Plot all models
X_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
X_range_poly = poly.transform(X_range)
plt.figure(figsize=(10, 6))
plt.scatter(X, y, label="Original Data", alpha=0.6)
plt.plot(X_range, lin_reg.predict(X_range), label="Linear", color="blue")
plt.plot(X_range, poly_reg.predict(X_range_poly), label="Polynomial (deg 3)", color="green")
plt.plot(X_range, ridge.predict(X_range_poly), label="Ridge", color="purple")
plt.plot(X_range, lasso.predict(X_range_poly), label="Lasso", color="orange")
plt.title("Regression Models Comparison")
plt.xlabel("X")
plt.ylabel("y")
plt.legend()
plt.grid(True)
plt.show()

# 9. Print performance
print("Model Performance Summary:\n")
print(f"Linear Regression     -> RMSE: {rmse_lin:.2f}, R²: {r2_lin:.2f}")
print(f"Polynomial Regression -> RMSE: {rmse_poly:.2f}, R²: {r2_poly:.2f}")
print(f"Ridge Regression      -> RMSE: {rmse_ridge:.2f}, R²: {r2_ridge:.2f}")
print(f"Lasso Regression      -> RMSE: {rmse_lasso:.2f}, R²: {r2_lasso:.2f}")
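To make the variance-reduction effect of the regularizers visible beyond RMSE and R², one simple addition (a sketch, assuming the fitted models above are still in scope) is to compare the magnitudes of the learned coefficients; Ridge and Lasso should shrink them relative to the plain polynomial fit:

# Optional addition: compare coefficient magnitudes to see the shrinkage
# introduced by the regularizers (smaller norms correspond to lower-variance fits).
for name, model in [("Polynomial", poly_reg), ("Ridge", ridge), ("Lasso", lasso)]:
    print(f"{name:>10} coefficients: {np.round(model.coef_, 2)} | L2 norm: {np.linalg.norm(model.coef_):.2f}")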
T5. Demonstrate the capability of PCA and LDA in dimensionality reduction.

# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target
target_names = iris.target_names

# Apply PCA to reduce dimensions to 2
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Apply LDA to reduce dimensions to 2
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X, y)

# Plot PCA results
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
colors = ['navy', 'turquoise', 'darkorange']
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(X_pca[y == i, 0], X_pca[y == i, 1], alpha=0.8, color=color, label=target_name)
plt.title('PCA on Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()

# Plot LDA results
plt.subplot(1, 2, 2)
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(X_lda[y == i, 0], X_lda[y == i, 1], alpha=0.8, color=color, label=target_name)
plt.title('LDA on Iris Dataset')
plt.xlabel('Linear Discriminant 1')
plt.ylabel('Linear Discriminant 2')
plt.legend()

plt.tight_layout()
plt.show()

# Explained variance for PCA
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained variance by PCA components:", explained_variance_ratio)

T6. KNN: build a K-Nearest Neighbours classifier on the Iris dataset and evaluate it.

# Import necessary libraries
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the K-NN classifier with K=5
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the model
knn.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = knn.predict(X_test_scaled)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, cmap="Blues", xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - KNN")
plt.show()
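A small optional extension (a sketch, reusing X_train_scaled and y_train from above): cross-validation over a range of K values is a common way to justify the choice of K=5.

# Optional addition: pick K by cross-validation on the training set.
from sklearn.model_selection import cross_val_score

for k in range(1, 11):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train_scaled, y_train, cv=5)
    print(f"K={k:2d}: mean CV accuracy = {scores.mean():.3f}")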
T7. Apply a suitable classifier model to classify the credit status as good or bad on 'German credit dataset.csv', and create a confusion matrix to measure the accuracy of the model (using Logistic Regression/SVM/Naïve Bayes).

Dataset -> https://online.stat.psu.edu/stat857/node/215/

# Step 1: Import libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay

# Step 2: Load the German Credit dataset
df = pd.read_csv("German credit dataset.csv")

# Step 3: Preprocess the data
# Encode categorical columns
df_encoded = df.copy()
label_encoders = {}
for column in df_encoded.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_encoded[column] = le.fit_transform(df_encoded[column])
    label_encoders[column] = le

# Step 4: Split data into features (X) and target (y)
# Assuming 'Creditability' or similar is the target column; adjust if needed
target_column = 'Creditability'  # Update this if your dataset has a different column name
X = df_encoded.drop(target_column, axis=1)
y = df_encoded[target_column]

# Step 5: Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train a classifier (e.g., Logistic Regression)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 7: Make predictions
y_pred = model.predict(X_test)

# Step 8: Evaluate the model
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("✅ Confusion Matrix:\n", cm)
print("\n🎯 Accuracy Score:", accuracy)

# Optional: display the confusion matrix visually
ConfusionMatrixDisplay(confusion_matrix=cm).plot()
plt.show()
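The task statement also allows SVM or Naïve Bayes. A minimal sketch of both substitutions (not part of the original program), assuming the same X_train/X_test/y_train/y_test split prepared above; feature scaling is added for the SVM, which usually needs it:

# Optional alternatives to Logistic Regression for the same split.
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

# SVMs are sensitive to feature scale, so standardize first
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm_model = SVC(kernel='rbf')
svm_model.fit(X_train_scaled, y_train)
print("SVM accuracy:", accuracy_score(y_test, svm_model.predict(X_test_scaled)))

# Gaussian Naive Bayes on the label-encoded features
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
print("Naive Bayes accuracy:", accuracy_score(y_test, nb_model.predict(X_test)))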
T8. Apply a train-test split and develop a regression model to predict the sold price of players using imb381ipl2013.csv. Build a correlation matrix between all the numeric features in the dataset and visualize it as a heatmap. Report the RMSE on the train and test data.

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the dataset
file_path = "imb381ipl2013.csv"  # Replace with your file path if needed
data = pd.read_csv(file_path)

# Display basic information and the head of the dataset
print("Dataset Info:")
print(data.info())
print("\nFirst 5 Rows:")
print(data.head())

# Check for missing values and drop rows with NaN
data.dropna(inplace=True)

# Define target variable (Sold Price) and features
# Note: only numeric columns are used as features, since LinearRegression cannot
# consume raw string columns; adjust the target column name to match your CSV.
numeric_data = data.select_dtypes(include=[np.number])
X = numeric_data.drop(columns=['Sold Price'])
y = data['Sold Price']

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate RMSE for train and test data
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"\nRMSE on Training Data: {rmse_train:.2f}")
print(f"RMSE on Test Data: {rmse_test:.2f}")

# Build correlation matrix for numeric features
numeric_features = data.select_dtypes(include=[np.number])
correlation_matrix = numeric_features.corr()

# Plot heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of Numeric Features')
plt.show()

T11. For the glass identification dataset, fit a random forest classifier to classify the glass type.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
# Download from: https://archive.ics.uci.edu/ml/datasets/glass+identification
# Assuming the file is named 'glass.csv' with proper column names
column_names = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']
data = pd.read_csv('glass.csv', names=column_names)

# Features and target
X = data.drop('Type', axis=1)
y = data['Type']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Optional: plot confusion matrix heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

T12. Implement the K-Means clustering algorithm using Python. You may use a library such as scikit-learn for this purpose.

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Generate sample data using make_blobs
# Create 300 samples with 3 cluster centers
X, y_true = make_blobs(n_samples=300, centers=3, cluster_std=0.60, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Visualize the raw data
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], s=50, c='gray', marker='o')
plt.title('Generated Raw Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# Apply K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans.fit(X_scaled)

# Get the cluster labels and cluster centers
y_kmeans = kmeans.labels_
centers = kmeans.cluster_centers_

# Visualize the clusters
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y_kmeans, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='red', marker='X', s=200, label='Centroids')
plt.title('K-Means Clustering Results')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()

# Print cluster centers and inertia
print("Cluster Centers (after scaling):\n", centers)
print(f"Inertia (Sum of Squared Distances): {kmeans.inertia_:.2f}")

# Determine the optimal number of clusters using the Elbow Method
inertia_values = []
k_range = range(1, 11)
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia_values.append(kmeans.inertia_)

# Plot the Elbow Method
plt.plot(k_range, inertia_values, marker='o')
plt.title('Elbow Method to Determine Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia (Sum of Squared Distances)')
plt.show()
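The elbow plot can be ambiguous, so a silhouette score is a simple complementary check. A minimal sketch (an addition, assuming X_scaled and the 3-cluster labels y_kmeans computed above):

# Optional addition: silhouette score as a second check on cluster quality
# (values closer to 1 indicate better-separated clusters).
from sklearn.metrics import silhouette_score

print(f"Silhouette score for k=3: {silhouette_score(X_scaled, y_kmeans):.3f}")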
T13. Implement the Agglomerative Hierarchical Clustering algorithm using Python. Utilize linkage methods such as 'ward', 'complete', or 'average'.

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

# Generate sample data using make_blobs
# Create 300 samples with 3 cluster centers
X, y_true = make_blobs(n_samples=300, centers=3, cluster_std=0.70, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Plot the raw data
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], s=50, c='gray', marker='o')
plt.title('Generated Raw Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# Define linkage methods to be used
linkage_methods = ['ward', 'complete', 'average']

# Plot dendrograms for different linkage methods
plt.figure(figsize=(15, 5))
for i, method in enumerate(linkage_methods):
    plt.subplot(1, 3, i + 1)
    Z = linkage(X_scaled, method=method)
    dendrogram(Z, truncate_mode='level', p=5)
    plt.title(f'Dendrogram using {method.capitalize()} Linkage')
    plt.xlabel('Data Points')
    plt.ylabel('Distance')
plt.tight_layout()
plt.show()

# Apply Agglomerative Clustering using 'ward' linkage
n_clusters = 3  # Number of clusters
model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
y_pred = model.fit_predict(X_scaled)

# Plot the clusters
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y_pred, cmap='viridis', s=50)
plt.title('Agglomerative Hierarchical Clustering (Ward Linkage)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
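The dendrograms above use all three linkage methods, but the final clustering uses only 'ward'. A small optional extension (a sketch, reusing X_scaled from above) repeats the clustering with the other two linkages so the resulting assignments can be compared:

# Optional addition: cluster with 'complete' and 'average' linkage as well,
# so the assignments can be compared against the Ward result above.
for method in ['complete', 'average']:
    labels = AgglomerativeClustering(n_clusters=3, linkage=method).fit_predict(X_scaled)
    plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels, cmap='viridis', s=50)
    plt.title(f'Agglomerative Hierarchical Clustering ({method.capitalize()} Linkage)')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()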