# 1. Data Preprocessing **************************************************
import pandas as pd

df=pd.read_csv("/content/sample.csv")

#print(df)
print("DATA SET:\n", df)
print("DATA SET SIZE:",df.size)
print("DATA SET SHAPE:",df.shape)
print("DATA SET DIMENSIONS:",df.ndim)

print("Head\n",df.head())
print("Tail\n", df.tail())

print("Head(2)\n",df.head(2))
print("Tail(2)\n",df.tail(2))
print("Head(-2)\n",df.head(-2))
print("Tail(-2) \n",df.tail(-2))

print("DATA TYPES")
df.info()

print("STATISTICS:\n",df.describe().T)

print("FRE. COUNT OF RECORDS:\n",df.value_counts())
print("\nFRE. COUNT OF GENDER",df['GENDER'].value_counts())
#print("TWO FEATURES FRQ", df[['GENDER','M1']].value_counts())

print("\nEXISTANCE of NaNs in data set", df.isna())
print("\nCOL-WISE NaNs in data set", df.isna().sum())
print("\nOVERALL NaNs in data set", df.isna().sum().sum())
print("\nTOT NaNs in M1", df['M1'].isna().sum())

print("\nBefore Filling\n", df)
# Assign back: column-wise fillna(inplace=True) is deprecated in pandas
df['M1'] = df['M1'].fillna(df['M1'].mean())   # saving update/permanent
df['PHY'] = df['PHY'].fillna(df['PHY'].mean()) # saving update/permanent
print("\nAFTER Filling\n", df)

print("BEFORE DROP - DF")
print(df)

df.drop_duplicates('SID',keep='first',inplace=True,ignore_index=True)
print("AFTER DROP DF")
print(df)

def remove_outliers_iqr(df, column):
    """Keep only rows whose `column` value lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]."""
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    print("lower_bound :",lower_bound,"upper_bound:",upper_bound)
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Remove outliers from the 'M1' column
df_no_outliers_math = remove_outliers_iqr(df, 'M1')
print("\nDataFrame after Removing Outliers in 'M1':")
print(df_no_outliers_math)

import matplotlib.pyplot as plt
import seaborn as sns

# Line Plot
plt.plot(df['M1'], df['PHY'],color='green')
plt.xlabel('M1')
plt.ylabel('PHY')
plt.title('Line Plot')
plt.show()

# Scatter Plot
plt.scatter(df['M1'], df['PHY'])
plt.xlabel('M1')
plt.ylabel('PHY')
plt.title('Scatter Plot')
plt.show()

plt.hist(df['M1'], bins=30, edgecolor='black')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram')
plt.show()

sns.boxplot(data=df)
plt.title('Box Plot')
plt.show()

sns.pairplot(df)
plt.suptitle('Pair Plot', y=1.02)  # suptitle titles the whole grid; plt.title would label only the last subplot
plt.show()

sns.barplot(x='GENDER', y='M1', data=df)
plt.title('Bar Plot')
plt.show()
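# One more view (a sketch added here, not in the original listing): a correlation
# heatmap of the numeric columns, using only standard pandas/seaborn calls.
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()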
 

# 2. Regression ************************************************************
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Sample data
df=pd.read_csv("/content/Stud_Data.csv")

# Reshape the data :: Single Feature
hours_studied = np.array(df["hours_studied"])
print(hours_studied.shape)

hours_studied =hours_studied.reshape(-1, 1)
print(hours_studied.shape)

scores=df["scores"]

# Create a linear regression model
model = LinearRegression()

# Fit the model
model.fit(hours_studied, scores)

#Print the Parameters
print("Beta 0 :", model.intercept_)
print("Beta 1 :", model.coef_[0])

#Regression Model

print("Y=",model.intercept_,"+",model.coef_[0],"X")
# Make predictions
predicted_scores = model.predict(hours_studied)

df["predicted_scores"]=predicted_scores

print("ORIGINAL SCORES:\n",df["scores"])
print("PREDICTED SCORES:\n",df["predicted_scores"])


print("MAE",metrics.mean_absolute_error(scores,predicted_scores))
print("MSE",metrics.mean_squared_error(scores,predicted_scores))
print("RMSE",np.sqrt(metrics.mean_squared_error(scores,predicted_scores)))

r2 = metrics.r2_score(scores,predicted_scores)
print('r2 score / Coefficient of Determination (1 for a perfect model):', r2)
print("\nCorrelation Coefficient: r =",df['hours_studied'].corr(df['scores']))



### USING MACHINE LEARNING APPROACH

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Sample data
df=pd.read_csv("/content/AgeData.csv")
#print(df.describe())

x = df[['Income (in $1000s)', 'Education Level (Years)', 'Years of Experience']]
y= df['Age']

#print(x)
#print(y)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)  # random_state fixes the split for reproducible results

#Fitting the Multiple Linear Regression model
mlr = LinearRegression()
mlr.fit(x_train, y_train)

#Intercept and Coefficient
print("Intercept: (Beta 0) ", mlr.intercept_)

#print("Coefficients:")
#print(list(zip(x, mlr.coef_)))

print("\nCoefficients:\n Beta 1:",mlr.coef_[0])
print("\n Beta 2:",mlr.coef_[1])
print("\n Beta 3:",mlr.coef_[2])

print("\nRegression Equation:",mlr.intercept_,"+",mlr.coef_[0],"*Income (in $1000s)+"
,mlr.coef_[1],"*Education Level (Years)+",mlr.coef_[2],"*Years of Experience")

#Prediction of test set
y_pred_mlr= mlr.predict(x_test)

meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
# Score on the held-out test set; scoring on the full (x, y) would include training rows
print('\nR squared: {:.2f}'.format(mlr.score(x_test, y_test)))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)
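# A possible extension (an assumption, not part of the original lab): adjusted
# R squared, which corrects R^2 for the number of predictors p.
n, p = x_test.shape
adj_r2 = 1 - (1 - mlr.score(x_test, y_test)) * (n - 1) / (n - p - 1)
print('Adjusted R squared:', adj_r2)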


#### PREDICTING AGE BASED ON TEST/NEW OBSERVATION
newobs_df=pd.DataFrame([[38,15,12]], columns=x.columns)

y_pred_new= mlr.predict(newobs_df)
print("PREDICTED AGE OF NEW RESPONDENT",y_pred_new[0])


# 5a. Decision Tree **********************************************

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt

# Load and display top 2 records of the Soybean dataset
df=pd.read_csv("/content/Soybean.csv")
print(df.head(2))

# DATA EXPLORATION
#Data set details/info
print(df.info())
print(df.describe())

#### DATA PRE-PROCESSING
# Missing Values , Duplicated and Outliers Handling

# Handling Missing Values

#Verify Missing Values
#print(df.isna().sum())

#Fill Missing Values with mean value of the respective feature/column
cols=list(df.columns)

print("Before Pre-Processing - Total Missing Values",df.isna().sum().sum())

for i in range(0, len(cols)-1):   # skip the last column (the class label)
  #print(cols[i])
  if df[cols[i]].isna().sum() > 0:
    df[cols[i]] = df[cols[i]].fillna(df[cols[i]].mean())  # assign back; inplace fillna on a column is deprecated

print("After Pre-Processing - Total Missing Values",df.isna().sum().sum())

# Handling Duplicate Values
#Verify Duplicate Values/records


print("BEFORE DROP :: DATA SIZE", df.shape)

df.drop_duplicates(keep="first",inplace=True)

#Verify Duplicate Values/records
print("AFTER DROP :: DATA SIZE", df.shape)

# Handling Outliers

#verify Outliers

#Plotting the box plot
plt.figure(figsize=(20, 8))
sns.boxplot(data=df, orient="v", palette="Set2")
plt.title("Box Plot of Soybean Dataset Features")
plt.show()

''' NOTE:: DATA HAS OUTLIERS, BUT THEY ARE VALID RESPONSES, HENCE WE ARE NOT DROPPING THEM.
IF THEY WERE GENUINE OUTLIERS, WE COULD DROP THEM USING THE CODE BELOW:

# #DROP Outliers
# def remove_outliers_iqr(df, column):
#     Q1 = df[column].quantile(0.25)
#     Q3 = df[column].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     print(column,":","lower_bound :",lower_bound,"upper_bound:",upper_bound)
#     return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# # Remove outliers from the 'sepal_width' column

# print("BOX PLOT - B4", df.shape)

# for i in range(0,len(cols)-1):
#   df = remove_outliers_iqr(df, cols[i])

# print("BOX PLOT - AFTER", df.shape)

'''

### MACHINE LEARNING MODEL DESIGN AND EVALUATION

# Feature Set
X = df.iloc[:, :-1]  # Input Features

# Target Variable / Class variable
Y = df.iloc[:, [-1]]

# print("Input Features (X) : \n" , X)
# print("Target Variable/Class Variable (Y) : \n" , Y)

# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialising Decision Tree Classifier
clf = DecisionTreeClassifier()

# Train the model
# Y_train must be flattened into a 1D array
clf.fit(X_train, np.array(Y_train).ravel())

# Predict on the test data
Y_pred = clf.predict(X_test)

# Print accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")

# # Print detailed classification report
# report = classification_report(Y_test, Y_pred)
# print("Classification Report:")
# print(report)

#Predict the class of the new observation

new_observation= pd.DataFrame([[6,0,2,1,0,3,0,1,1,1,1,1,0,2,2,0,0,0,1,0,3,1,1,1,0,0,0,0,4,0,0,0,0,0,0]], columns=X.columns)

predicted_class = clf.predict(new_observation)

print("Predicted Class of NEW OBSERVATION :: ", predicted_class[0])


# 5b. KNN *************************************************************

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt

# Load the Iris dataset
df=pd.read_csv("/content/sample_data/Iris.csv")

# DATA EXPLORATION
print(df.head())
print(df.info())
print(df.describe())

#Verify Missing Values
print(df.isna().sum())

#Verify Duplicate Values/records
print("BEFORE", df[df.duplicated()])

df.drop_duplicates(keep="first",inplace=True)

#Verify Duplicate Values/records
print("AFTER",df[df.duplicated()])


#verify Outliers

#Plotting the box plot
plt.figure(figsize=(12, 8))
sns.boxplot(data=df, orient="v", palette="Set2")
plt.title("Box Plot of Iris Dataset Features")
plt.show()

#DROP Outliers
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    print("lower_bound :",lower_bound,"upper_bound:",upper_bound)
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Remove outliers from the 'sepal_width' column
df_no_outliers_sepal_width = remove_outliers_iqr(df, 'sepal_width')
print("\nDataFrame after Removing Outliers in 'sepal_width':")
print(df_no_outliers_sepal_width)

df=df_no_outliers_sepal_width

#verify Outliers

#Plotting the box plot
plt.figure(figsize=(12, 8))
sns.boxplot(data=df, orient="v", palette="Set2")
plt.title("Box Plot of Iris Dataset Features AFTER OULIERS DROPPED")
plt.show()


### MACHINE LEARNING MODEL DESIGN AND EVALUATION

# Feature Set
X = df.iloc[:, :-1]

# Target Variable / Class variable
Y = df.iloc[:, [-1]]

print("Input Features (X) : \n" , X)
print("Target Variable/Class Variable (Y) : \n" , Y)

# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize KNN classifier
k = 3  # Define the number of neighbors
knn = KNeighborsClassifier(n_neighbors=k)

# Train the model
# Y_train must be flattened into a 1D array
knn.fit(X_train, np.array(Y_train).ravel())

# Predict on the test data
Y_pred = knn.predict(X_test)

# Print accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print detailed classification report
report = classification_report(Y_test, Y_pred)
print("Classification Report:")
print(report)
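# Choosing k (a sketch, not part of the original exercise): sweep a few odd values
# of k and compare test accuracy; the best k is dataset-dependent.
for k_try in [1, 3, 5, 7, 9]:
    knn_try = KNeighborsClassifier(n_neighbors=k_try)
    knn_try.fit(X_train, np.array(Y_train).ravel())
    print("k =", k_try, "accuracy =", accuracy_score(Y_test, knn_try.predict(X_test)))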

#Predict the class of the new observation

#new_observation with sepal_length  sepal_width  petal_length  petal_width
new_observation= pd.DataFrame([[5.1, 3.5, 1.4, 0.2]], columns=X.columns)

predicted_class = knn.predict(new_observation)

print("Predicted Class of NEW OBSERVATION :: ", predicted_class[0])

# 6. Random Forest *************************************************

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt

# Load the Loan dataset
df=pd.read_csv("/content/Loandata.csv")

# DATA EXPLORATION

print(df.info())
#print(df.head(2))

#DROP LOAN ID - NOT USEFUL IN ANALYSIS
df.drop(labels='Loan_ID', axis=1, inplace=True)

#REPLACE DEPENDENTS COUNT 3+ to 3
df['Dependents'] = df['Dependents'].replace('3+', '3')

#Missing Values Management

print("Before :: MISSING VALUE COUNT\n",df.isna().sum())

### FILL WITH MODE (assign back; column-wise fillna(inplace=True) is deprecated)
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
df['Education'] = df['Education'].fillna(df['Education'].mode()[0])
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])
df['Property_Area'] = df['Property_Area'].fillna(df['Property_Area'].mode()[0])
df['Loan_Status'] = df['Loan_Status'].fillna(df['Loan_Status'].mode()[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

### FILL WITH MEAN
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mean())
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mean())

print("After :: MISSING VALUE COUNT\n",df.isna().sum())

# Handling Duplicate Values
#Verify Duplicate Values/records


print("BEFORE DROP :: DATA SIZE", df.shape)

df.drop_duplicates(keep="first",inplace=True)

#Verify Duplicate Values/records
print("AFTER DROP :: DATA SIZE", df.shape)

# Handling Outliers

#NOTE:: DATA HAS OUTLIERS, BUT THEY ARE VALID RESPONSES, HENCE WE ARE NOT DROPPING THEM.

#verify Outliers

#Plotting the box plot
plt.figure(figsize=(20, 8))
sns.boxplot(data=df, orient="v", palette="Set2")
plt.title("Box Plot of Soybean Dataset Features")
plt.show()

# DATA TRANSFORMATION
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

print(df['Loan_Status'].value_counts())

# Fit and transform the columns
df['Gender'] = label_encoder.fit_transform(df['Gender'])
df['Married'] = label_encoder.fit_transform(df['Married'])
df['Education'] = label_encoder.fit_transform(df['Education'])
df['Self_Employed'] = label_encoder.fit_transform(df['Self_Employed'])
df['Property_Area'] = label_encoder.fit_transform(df['Property_Area'])
df['Loan_Status'] = label_encoder.fit_transform(df['Loan_Status'])

print(df['Loan_Status'].value_counts())
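# Caveat (added note): reusing a single LabelEncoder means label_encoder.classes_
# now reflects only the last column it was fit on (Loan_Status). To keep every
# column's mapping, fit one encoder per column instead.
print("Loan_Status encoding:", dict(enumerate(label_encoder.classes_)))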

df.to_csv('CleanFile.csv', index=False)

### MACHINE LEARNING MODEL DESIGN AND EVALUATION

# Feature Set
X = df.iloc[:, :-1]  # Input Features

# Target Variable / Class variable
Y = df.iloc[:, [-1]]

# print("Input Features (X) : \n" , X)
# print("Target Variable/Class Variable (Y) : \n" , Y)

# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Ensure Y_train and Y_test are 1D arrays
Y_train = np.array(Y_train).ravel()
Y_test = np.array(Y_test).ravel()

# Train a Random Forest Classifier
# (Y_train was already flattened to a 1D array above)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, Y_train)

# Predict on the test data
Y_pred = clf.predict(X_test)

# Print accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print detailed classification report
report = classification_report(Y_test, Y_pred)
print("Classification Report:")
print(report)
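# Feature importance (a sketch, not part of the original listing): a fitted
# random forest exposes per-feature importances.
importances = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Feature Importances:\n", importances)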

#Predict the class of the new observation
new_observation= pd.DataFrame([[1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2]], columns=X.columns)

predicted_class = clf.predict(new_observation)

#### 0 - NO, 1 - YES (LabelEncoder outputs integers, so compare with 0, not '0')
if predicted_class[0] == 0:
  class_label = "No"
else:
  class_label = "Yes"

print("Predicted Class of NEW OBSERVATION :: ", class_label)