# 1. Data Preprocessing **************************************************
import pandas as pd
df=pd.read_csv("/content/sample.csv")
#print(df)
print("DATA SET:\n", df)
print("DATA SET SIZE:",df.size)
print("DATA SET SHAPE:",df.shape)
print("DATA SET DIMENSIONS:",df.ndim)
print("Head\n",df.head())
print("Tail\n", df.tail())
print("Head(2)\n",df.head(2))
print("Tail(2)\n",df.tail(2))
print("Head(-2)\n",df.head(-2))
print("Tail(-2) \n",df.tail(-2))
print("DATA TYPES")
df.info()
print("STATISTICS:\n",df.describe().T)
print("FRE. COUNT OF RECORDS:\n",df.value_counts())
print("\nFRE. COUNT OF GENDER",df['GENDER'].value_counts())
#print("TWO FEATURES FRQ", df[['GENDER','M1']].value_counts())
print("\nEXISTANCE of NaNs in data set", df.isna())
print("\nCOL-WISE NaNs in data set", df.isna().sum())
print("\nOVERALL NaNs in data set", df.isna().sum().sum())
print("\nTOT NaNs in M1", df['M1'].isna().sum())
print("\nBefore Filling\n", df)
df['M1'] = df['M1'].fillna(df['M1'].mean())   # saving update/permanent
df['PHY'] = df['PHY'].fillna(df['PHY'].mean()) # saving update/permanent
print("\nAFTER Filling\n", df)
print("BEFORE DROP - DF")
print(df)
df.drop_duplicates('SID',keep='first',inplace=True,ignore_index=True)
print("AFTER DROP DF")
print(df)
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    print("lower_bound :", lower_bound, "upper_bound:", upper_bound)
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
# Remove outliers from the 'M1' column (returns a filtered copy; df itself is unchanged)
df_no_outliers_math = remove_outliers_iqr(df, 'M1')
print("\nDataFrame after Removing Outliers in 'M1':")
print(df_no_outliers_math)
import matplotlib.pyplot as plt
import seaborn as sns
# Line Plot
plt.plot(df['M1'], df['PHY'],color='green')
plt.xlabel('M1')
plt.ylabel('PHY')
plt.title('Line Plot')
plt.show()
# Scatter Plot
plt.scatter(df['M1'], df['PHY'])
plt.xlabel('M1')
plt.ylabel('PHY')
plt.title('Scatter Plot')
plt.show()
plt.hist(df['M1'], bins=30, edgecolor='black')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram')
plt.show()
sns.boxplot(data=df)
plt.title('Box Plot')
plt.show()
sns.pairplot(df)
plt.suptitle('Pair Plot', y=1.02)  # plt.title would only label the last subplot
plt.show()
sns.barplot(x='GENDER', y='M1', data=df)
plt.title('Bar Plot')
plt.show()
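# OPTIONAL (editor's sketch): a correlation heatmap is a common companion to
# the pair plot; this assumes the numeric columns of sample.csv (e.g. M1, PHY).
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()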
# 2. Regression ************************************************************
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import metrics
# Sample data
df=pd.read_csv("/content/Stud_Data.csv")
# Reshape the data :: Single Feature
hours_studied = np.array(df["hours_studied"])
print(hours_studied.shape)
hours_studied =hours_studied.reshape(-1, 1)
print(hours_studied.shape)
scores=df["scores"]
# Create a linear regression model
model = LinearRegression()
# Fit the model
model.fit(hours_studied, scores)
#Print the Parameters
print("Beta 0 :", model.intercept_)
print("Beta 1 :", model.coef_[0])
#Regression Model
print("Y=",model.intercept_,"+",model.coef_[0],"X")
# Make predictions
predicted_scores = model.predict(hours_studied)
df["predicted_scores"]=predicted_scores
print("ORIGINAL SCORES:\n",df["scores"])
print("PREDICTED SCORES:\n",df["predicted_scores"])
print("MAE",metrics.mean_absolute_error(scores,predicted_scores))
print("MSE",metrics.mean_squared_error(scores,predicted_scores))
print("RMSE",np.sqrt(metrics.mean_squared_error(scores,predicted_scores)))
r2 = metrics.r2_score(scores,predicted_scores)
print('R2 score (Coefficient of Determination; 1.0 for a perfect model):', r2)
print("\nCorrelation Coefficient: r =",df['hours_studied'].corr(df['scores']))
### USING MACHINE LEARNING APPROACH
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
# Sample data
df=pd.read_csv("/content/AgeData.csv")
#print(df.describe())
x = df[['Income (in $1000s)', 'Education Level (Years)', 'Years of Experience']]
y= df['Age']
#print(x)
#print(y)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)  # fixed seed, as in the other sections
#Fitting the Multiple Linear Regression model
mlr = LinearRegression()
mlr.fit(x_train, y_train)
#Intercept and Coefficient
print("Intercept: (Beta 0) ", mlr.intercept_)
#print("Coefficients:")
#print(list(zip(x, mlr.coef_)))
print("\nCoefficients:\n Beta 1:",mlr.coef_[0])
print("\n Beta 2:",mlr.coef_[1])
print("\n Beta 3:",mlr.coef_[2])
print("\nRegression Equation:",mlr.intercept_,"+",mlr.coef_[0],"*Income (in $1000s)+"
,mlr.coef_[1],"*Education Level (Years)+",mlr.coef_[2],"*Years of Experience")
#Prediction of test set
y_pred_mlr= mlr.predict(x_test)
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
#print('\nR squared: {:.2f}'.format(mlr.score(x,y)*100))
print('\nR squared (test set): {:.2f}'.format(mlr.score(x_test, y_test)))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)
#### PREDICTING AGE BASED ON TEST/NEW OBSERVATION
newobs_df=pd.DataFrame([[38,15,12]], columns=x.columns)
y_pred_new= mlr.predict(newobs_df)
print("PREDICTED AGE OF NEW RESPONDENT",y_pred_new[0])
# 5a. Decision Tree **********************************************
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
# Load and display top 2 records of the Soybean dataset
df=pd.read_csv("/content/Soybean.csv")
print(df.head(2))
# DATA EXPLORATION
#Data set details/info
print(df.info())
print(df.describe())
#### DATA PRE-PROCESSING
# Missing Values , Duplicated and Outliers Handling
# Handling Missing Values
#Verify Missing Values
#print(df.isna().sum())
#Fill Missing Values with mean value of the respective feature/column
cols=list(df.columns)
print("Before Pre-Processing - Total Missing Values",df.isna().sum().sum())
for i in range(0, len(cols)-1):   # skip the last (class) column
    #print(cols[i])
    if df[cols[i]].isna().sum() > 0:
        df[cols[i]] = df[cols[i]].fillna(df[cols[i]].mean())
print("After Pre-Processing - Total Missing Values",df.isna().sum().sum())
# Handling Duplicate Values
#Verify Duplicate Values/records
print("BEFORE DROP :: DATA SIZE", df.shape)
df.drop_duplicates(keep="first",inplace=True)
#Verify Duplicate Values/records
print("AFTER DROP :: DATA SIZE", df.shape)
# Handling Outliers
#verify Outliers
#Plotting the box plot
plt.figure(figsize=(20, 8))
sns.boxplot(data=df, orient="v", palette="Set2")
plt.title("Box Plot of Soybean Dataset Features")
plt.show()
''' NOTE:: DATA HAS OUTLIERS BUT THEY ARE VALID RESPONSES, HENCE WE ARE NOT DROPPING THEM.
IF THEY REALLY ARE OUTLIERS, THEN WE SHOULD DROP THEM USING THE CODE BELOW:
# #DROP Outliers
# def remove_outliers_iqr(df, column):
# Q1 = df[column].quantile(0.25)
# Q3 = df[column].quantile(0.75)
# IQR = Q3 - Q1
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR
# print(column,":","lower_bound :",lower_bound,"upper_bound:",upper_bound)
# return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
# # Remove outliers from every feature column
# print("BOX PLOT - B4", df.shape)
# for i in range(0,len(cols)-1):
# df = remove_outliers_iqr(df, cols[i])
# print("BOX PLOT - AFTER", df.shape)
'''
### MACHINE LEARNING MODEL DESIGN AND EVALUATION
#Feature Set
X = df.iloc[:, :-1]  # Input Features
#Target Variable/Class Variable
Y = df.iloc[:, [-1]]
# print("Input Features (X) : \n" , X)
# print("Target Variable/Class Variable (Y) : \n" , Y)
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Initialising the Decision Tree Classifier
clf = DecisionTreeClassifier()
# Train the model
# Y_train should be flattened into a 1D array
clf.fit(X_train, np.array(Y_train).ravel())
# Predict on the test data
Y_pred = clf.predict(X_test)
# Print accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")
# # Print detailed classification report
# report = classification_report(Y_test, Y_pred)
# print("Classification Report:")
# print(report)
#Predict the class of the new observation
new_observation= pd.DataFrame([[6,0,2,1,0,3,0,1,1,1,1,1,0,2,2,0,0,0,1,0,3,1,1,1,0,0,0,0,4,0,0,0,0,0,0]], columns=X.columns)
predicted_class = clf.predict(new_observation)
print("Predicted Class of NEW OBSERVATION :: ", predicted_class[0])
# 5b. KNN *************************************************************
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Iris dataset
df=pd.read_csv("/content/sample_data/Iris.csv")
# DATA EXPLORATION
print(df.head())
print(df.info())
print(df.describe())
#Verify Missing Values
print(df.isna().sum())
#Verify Duplicate Values/records
print("BEFORE", df[df.duplicated()])
df.drop_duplicates(keep="first",inplace=True)
#Verify Duplicate Values/records
print("AFTER",df[df.duplicated()])
#verify Outliers
#Plotting the box plot
plt.figure(figsize=(12, 8))
sns.boxplot(data=df, orient="v", palette="Set2")
plt.title("Box Plot of Iris Dataset Features")
plt.show()
#DROP Outliers
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    print("lower_bound :", lower_bound, "upper_bound:", upper_bound)
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
# Remove outliers from the 'sepal_width' column
df_no_outliers_sepal_width = remove_outliers_iqr(df, 'sepal_width')
print("\nDataFrame after Removing Outliers in 'sepal_width':")
print(df_no_outliers_sepal_width)
df=df_no_outliers_sepal_width
#verify Outliers
#Plotting the box plot
plt.figure(figsize=(12, 8))
sns.boxplot(data=df, orient="v", palette="Set2")
plt.title("Box Plot of Iris Dataset Features AFTER OULIERS DROPPED")
plt.show()
### MACHINE LEARNING MODEL DESIGN AND EVALUATION
#Feature Set
X = df.iloc[:, :-1]
#Target Variable/Class Variable
Y = df.iloc[:, [-1]]
print("Input Features (X) : \n" , X)
print("Target Variable/Class Variable (Y) : \n" , Y)
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Initialize KNN classifier
k = 3 # Define the number of neighbors
knn = KNeighborsClassifier(n_neighbors=k)
# Train the model
# Y_train should be flattened into a 1D array
knn.fit(X_train, np.array(Y_train).ravel())
# Predict on the test data
Y_pred = knn.predict(X_test)
# Print accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Print detailed classification report
report = classification_report(Y_test, Y_pred)
print("Classification Report:")
print(report)
#Predict the class of the new observation
#new_observation with sepal_length sepal_width petal_length petal_width
new_observation= pd.DataFrame([[5.1, 3.5, 1.4, 0.2]], columns=X.columns)
predicted_class = knn.predict(new_observation)
print("Predicted Class of NEW OBSERVATION :: ", predicted_class[0])
# 6. Random Forest *************************************************
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
# Load the loan dataset
df=pd.read_csv("/content/Loandata.csv")
# DATA EXPLORATION
print(df.info())
#print(df.head(2))
#DROP LOAN ID - NOT USEFUL IN ANALYSIS
df.drop(labels='Loan_ID', axis=1, inplace=True)
#REPLACE DEPENDENTS COUNT 3+ to 3
df['Dependents'] = df['Dependents'].replace('3+', '3')
#Missing Values Management
print("Before :: MISSING VALUE COUNT\n",df.isna().sum())
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
df['Dependents'] = df['Dependents'].astype(int)  # '3+' was mapped to '3' above; make it numeric
df['Education'] = df['Education'].fillna(df['Education'].mode()[0])
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])
df['Property_Area'] = df['Property_Area'].fillna(df['Property_Area'].mode()[0])
df['Loan_Status'] = df['Loan_Status'].fillna(df['Loan_Status'].mode()[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])
### FILL WITH MEAN
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mean())
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mean())
print("After :: MISSING VALUE COUNT\n",df.isna().sum())
# Handling Duplicate Values
#Verify Duplicate Values/records
print("BEFORE DROP :: DATA SIZE", df.shape)
df.drop_duplicates(keep="first",inplace=True)
#Verify Duplicate Values/records
print("AFTER DROP :: DATA SIZE", df.shape)
# Handling Outliers
#NOTE:: DATA HAS OUTLIERS BUT THEY ARE VALID RESPONSES , HENCE WE ARE NOT DROPPING THE OUTLIERS.
#verify Outliers
#Plotting the box plot
plt.figure(figsize=(20, 8))
sns.boxplot(data=df, orient="v", palette="Set2")
plt.title("Box Plot of Soybean Dataset Features")
plt.show()
#DATA TRANSFORMATION
# Initialize the LabelEncoder
label_encoder = LabelEncoder()
print(df['Loan_Status'].value_counts())
# Fit and transform the columns
df['Gender'] = label_encoder.fit_transform(df['Gender'])
df['Married'] = label_encoder.fit_transform(df['Married'])
df['Education'] = label_encoder.fit_transform(df['Education'])
df['Self_Employed'] = label_encoder.fit_transform(df['Self_Employed'])
df['Property_Area'] = label_encoder.fit_transform(df['Property_Area'])
df['Loan_Status'] = label_encoder.fit_transform(df['Loan_Status'])
print(df['Loan_Status'].value_counts())
df.to_csv('CleanFile.csv', index=False)
### MACHINE LEARNING MODEL DESIGN AND EVALUATION
#Feature Set
X = df.iloc[:, :-1]  # Input Features
#Target Variable/Class Variable
Y = df.iloc[:, [-1]]
# print("Input Features (X) : \n" , X)
# print("Target Variable/Class Variable (Y) : \n" , Y)
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Ensure Y_train and Y_test are 1D arrays
Y_train = np.array(Y_train).ravel()
Y_test = np.array(Y_test).ravel()
# Train a Random Forest Classifier (Y_train was already flattened above)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, Y_train)
# Predict on the test data
Y_pred = clf.predict(X_test)
# Print accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Print detailed classification report
report = classification_report(Y_test, Y_pred)
print("Classification Report:")
print(report)
#Predict the class of the new observation
new_observation= pd.DataFrame([[1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2]], columns=X.columns)
predicted_class = clf.predict(new_observation)
#### 0 - NO  1 - YES (Loan_Status was label-encoded to integers above)
if predicted_class[0] == 0:
    classLabel = "No"
else:
    classLabel = "Yes"
print("Predicted Class of NEW OBSERVATION :: ", classLabel)