//1 ....datapreprocessing************************************************** import pandas as pd df=pd.read_csv("/content/sample.csv") #print(df) print("DATA SET:\n", df) print("DATA SET SIZE:",df.size) print("DATA SET SHAPE:",df.shape) print("DATA SET DIMENSIONS:",df.ndim) print("Head\n",df.head()) print("Tail\n", df.tail()) print("Head(2)\n",df.head(2)) print("Tail(2)\n",df.tail(2)) print("Head(-2)\n",df.head(-2)) print("Tail(-2) \n",df.tail(-2)) print("DATA TYPES") df.info() print("STATISTICS:\n",df.describe().T) print("FRE. COUNT OF RECORDS:\n",df.value_counts()) print("\nFRE. COUNT OF GENDER",df['GENDER'].value_counts()) #print("TWO FEATURES FRQ", df[['GENDER','M1']].value_counts()) print("\nEXISTANCE of NaNs in data set", df.isna()) print("\nCOL-WISE NaNs in data set", df.isna().sum()) print("\nOVERALL NaNs in data set", df.isna().sum().sum()) print("\nTOT NaNs in M1", df['M1'].isna().sum()) print("\nBefore Filling\n", df) df['M1'].fillna(df['M1'].mean(),inplace=True) #saving update/permament df['PHY'].fillna(df['PHY'].mean(),inplace=True) #saving update/permament print("\nAFTER Filling\n", df) print("BEFORE DROP - DF") print(df) df.drop_duplicates('SID',keep='first',inplace=True,ignore_index=True) print("AFTER DROP DF") print(df) def remove_outliers_iqr(df, column): Q1 = df[column].quantile(0.25) Q3 = df[column].quantile(0.75) IQR = Q3 - Q1 lower_bound = Q1 - 1.5 * IQR upper_bound = Q3 + 1.5 * IQR print("lower_bound :",lower_bound,"upper_bound:",upper_bound) return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)] # Remove outliers from the 'Math' column df_no_outliers_math = remove_outliers_iqr(df, 'M1') print("\nDataFrame after Removing Outliers in 'M1':") print(df_no_outliers_math) import matplotlib.pyplot as plt import seaborn as sns # Line Plot plt.plot(df['M1'], df['PHY'],color='green') plt.xlabel('M1') plt.ylabel('PHY') plt.title('Line Plot') plt.show() # Scatter Plot plt.scatter(df['M1'], df['PHY']) plt.xlabel('M1') plt.ylabel('PHY') plt.title('Scatter Plot') plt.show() plt.hist(df['M1'], bins=30, edgecolor='black') plt.xlabel('Value') plt.ylabel('Frequency') plt.title('Histogram') plt.show() sns.boxplot(data=df) plt.title('Box Plot') plt.show() sns.pairplot(df) plt.title('Pair Plot') plt.show() sns.barplot(x='GENDER', y='M1', data=df) plt.title('Bar Plot') plt.show() 2.regression************************************************************ import numpy as np import pandas as pd from sklearn.linear_model import LinearRegression from sklearn import metrics # Sample data df=pd.read_csv("/content/Stud_Data.csv") # Reshape the data :: Single Feature hours_studied = np.array(df["hours_studied"]) print(hours_studied.shape) hours_studied =hours_studied.reshape(-1, 1) print(hours_studied.shape) scores=df["scores"] # Create a linear regression model model = LinearRegression() # Fit the model model.fit(hours_studied, scores) #Print the Parameters print("Beta 0 :", model.intercept_) print("Beta 1 :", model.coef_[0]) #Regression Model print("Y=",model.intercept_,"+",model.coef_[0],"X") # Make predictions predicted_scores = model.predict(hours_studied) df["predicted_scores"]=predicted_scores print("ORIGINAL SCORES:\n",df["scores"]) print("PREDICTED SCORES:\n",df["predicted_scores"]) print("MAE",metrics.mean_absolute_error(scores,predicted_scores)) print("MSE",metrics.mean_squared_error(scores,predicted_scores)) print("RMSE",np.sqrt(metrics.mean_squared_error(scores,predicted_scores))) r2 = metrics.r2_score(scores,predicted_scores) print('r2 score/Coefficient of Determination for perfect model is', r2) print("\nCorrelation Coefficient: r =",df['hours_studied'].corr(df['scores'])) ### USING MACHINE LEARNING APPROACH import numpy as np import pandas as pd from sklearn.linear_model import LinearRegression from sklearn import metrics from sklearn.model_selection import train_test_split # Sample data df=pd.read_csv("/content/AgeData.csv") #print(df.describe()) x = df[['Income (in $1000s)', 'Education Level (Years)', 'Years of Experience']] y= df['Age'] #print(x) #print(y) x_train, x_test, y_train, y_test=train_test_split(x, y, test_size = 0.3) #Fitting the Multiple Linear Regression model mlr = LinearRegression() mlr.fit(x_train, y_train) #Intercept and Coefficient print("Intercept: (Beta 0) ", mlr.intercept_) #print("Coefficients:") #print(list(zip(x, mlr.coef_))) print("\nCoefficients:\n Beta 1:",mlr.coef_[0]) print("\n Beta 2:",mlr.coef_[1]) print("\n Beta 3:",mlr.coef_[2]) print("\nRegression Equation:",mlr.intercept_,"+",mlr.coef_[0],"*Income (in $1000s)+" ,mlr.coef_[1],"*Education Level (Years)+",mlr.coef_[2],"*Years of Experience") #Prediction of test set y_pred_mlr= mlr.predict(x_test) meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr) meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr) rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr)) #print('\nR squared: {:.2f}'.format(mlr.score(x,y)*100)) print('\nR squared: {:.2f}'.format(mlr.score(x,y))) print('Mean Absolute Error:', meanAbErr) print('Mean Square Error:', meanSqErr) print('Root Mean Square Error:', rootMeanSqErr) #### PREDICTING AGE BASED ON TEST/NEW OBSERVATION newobs_df=pd.DataFrame([[38,15,12]], columns=x.columns) y_pred_new= mlr.predict(newobs_df) print("PREDICTED AGE OF NEW RESPONDENT",y_pred_new[0]) 5 a..decision tree********************************************** import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import classification_report,accuracy_score import seaborn as sns import matplotlib.pyplot as plt # Load and display top 2 records of the Soybean dataset df=pd.read_csv("/content/Soybean.csv") print(df.head(2)) # DATA EXPLORATION #Data set details/info print(df.info()) print(df.describe()) #### DATA PRE-PROCESSING # Missing Values , Duplicated and Outliers Handling # Handling Missing Values #Verify Missing Values #print(df.isna().sum()) #Fill Missing Values with mean value of the respective feature/column cols=list(df.columns) print("Before Pre-Processing - Total Missing Values",df.isna().sum().sum()) for i in range(0,len(cols)-1): #print(cols[i]) if(df[cols[i]].isna().sum()>0): df[cols[i]].fillna(df[cols[i]].mean(),inplace=True) print("After Pre-Processing - Total Missing Values",df.isna().sum().sum()) # Handling Duplicate Values #Verify Duplicate Values/records print("BEFORE DROP :: DATA SIZE", df.shape) df.drop_duplicates(keep="first",inplace=True) #Verify Duplicate Values/records print("AFTER DROP :: DATA SIZE", df.shape) # Handling Outliers #verify Outliers #Plotting the box plot plt.figure(figsize=(20, 8)) sns.boxplot(data=df, orient="v", palette="Set2") plt.title("Box Plot of Soybean Dataset Features") plt.show() ''' NOTE:: DATA HAS OUTLIERS BUT THEY ARE VALID RESPONSES , HENCE WE ARE NOT DROPPING THE OUTLIERS. IF THEY ARE REALLY OUTLERS THEN WE SHOULD DROP THE OUTLIERS USING THE BELOW CODE # #DROP Outliers # def remove_outliers_iqr(df, column): # Q1 = df[column].quantile(0.25) # Q3 = df[column].quantile(0.75) # IQR = Q3 - Q1 # lower_bound = Q1 - 1.5 * IQR # upper_bound = Q3 + 1.5 * IQR # print(column,":","lower_bound :",lower_bound,"upper_bound:",upper_bound) # return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)] # # Remove outliers from the 'sepal_width' column # print("BOX PLOT - B4", df.shape) # for i in range(0,len(cols)-1): # df = remove_outliers_iqr(df, cols[i]) # print("BOX PLOT - AFTER", df.shape) ''' ### MACHINE LEANING MODEL DESIGN AND EVALUATION #Feature Set X= df.iloc[:, :-1] #"Input Features #Target Variable/ Classs variable Y=df.iloc[:, [-1]] # print("Input Features (X) : \n" , X) # print("Target Variable/Class Variable (Y) : \n" , Y) # Split data into train and test sets X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) # Initialising Decision Tree Classifier clf = DecisionTreeClassifier() clf.fit(X_train,Y_train) # Train the model # Y_train array Should be flatten into a 1D array clf.fit(X_train, np.array(Y_train).ravel()) # Predict on the test data Y_pred = clf.predict(X_test) # Print accuracy accuracy = accuracy_score(Y_test, Y_pred) print(f"Accuracy: {accuracy:.2f}") # # Print detailed classification report # report = classification_report(Y_test, Y_pred) # print("Classification Report:") # print(report) #Predict the class of the new observation new_observation= pd.DataFrame([[6,0,2,1,0,3,0,1,1,1,1,1,0,2,2,0,0,0,1,0,3,1,1,1,0,0,0,0,4,0,0,0,0,0,0]], columns=X.columns) predicted_class = clf.predict(new_observation) print("Predicted Class of NEW OBSERVATION :: ", predicted_class[0]) 5 b knn************************************************************* import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import classification_report,accuracy_score import seaborn as sns import matplotlib.pyplot as plt # Load the Iris dataset df=pd.read_csv("/content/sample_data/Iris.csv") # DATA EXPLORATION print(df.head()) print(df.info()) print(df.describe()) #Verify Missing Values print(df.isna().sum()) #Verify Duplicate Values/records print("BEFORE", df[df.duplicated()]) df.drop_duplicates(keep="first",inplace=True) #Verify Duplicate Values/records print("AFTER",df[df.duplicated()]) #verify Outliers #Plotting the box plot plt.figure(figsize=(12, 8)) sns.boxplot(data=df, orient="v", palette="Set2") plt.title("Box Plot of Iris Dataset Features") plt.show() #DROP Outliers def remove_outliers_iqr(df, column): Q1 = df[column].quantile(0.25) Q3 = df[column].quantile(0.75) IQR = Q3 - Q1 lower_bound = Q1 - 1.5 * IQR upper_bound = Q3 + 1.5 * IQR print("lower_bound :",lower_bound,"upper_bound:",upper_bound) return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)] # Remove outliers from the 'sepal_width' column df_no_outliers_sepal_width = remove_outliers_iqr(df, 'sepal_width') print("\nDataFrame after Removing Outliers in 'sepal_width':") print(df_no_outliers_sepal_width) df=df_no_outliers_sepal_width #verify Outliers #Plotting the box plot plt.figure(figsize=(12, 8)) sns.boxplot(data=df, orient="v", palette="Set2") plt.title("Box Plot of Iris Dataset Features AFTER OULIERS DROPPED") plt.show() ### MACHINE LEANING MODEL DESIGN AND EVALUATION #Feature Set X= df.iloc[:, :-1] #Target Variable/ Classs variable Y=df.iloc[:, [-1]] print("Input Features (X) : \n" , X) print("Target Variable/Class Variable (Y) : \n" , Y) # Split data into train and test sets X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) # Initialize KNN classifier k = 3 # Define the number of neighbors knn = KNeighborsClassifier(n_neighbors=k) # Train the model # Y_train array Should be flatten into a 1D array knn.fit(X_train, np.array(Y_train).ravel()) # Predict on the test data Y_pred = knn.predict(X_test) # Print accuracy accuracy = accuracy_score(Y_test, Y_pred) print(f"Accuracy: {accuracy:.2f}") # Print detailed classification report report = classification_report(Y_test, Y_pred) print("Classification Report:") print(report) #Predict the class of the new observation #new_observation with sepal_length sepal_width petal_length petal_width new_observation= pd.DataFrame([[5.1, 3.5, 1.4, 0.2]], columns=X.columns) predicted_class = knn.predict(new_observation) print("Predicted Class of NEW OBSERVATION :: ", predicted_class[0]) 6.random forest ************************************************* import pandas as pd import numpy as np from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import classification_report,accuracy_score import seaborn as sns import matplotlib.pyplot as plt # Load and display top 2 records of the Soybean dataset df=pd.read_csv("/content/Loandata.csv") # DATA EXPLORATION print(df.info()) #print(df.head(2)) #DROP LOAN ID - NOT USEFUL IN ANALYSIS df.drop(labels='Loan_ID', axis=1, inplace=True) #REPLACE DEPENDENTS COUNT 3+ to 3 df['Dependents'] = df['Dependents'].replace('3+', '3') #Missing Values Management print("Before :: MISSING VALUE COUNT\n",df.isna().sum()) df['Gender'].fillna(df['Gender'].mode()[0],inplace=True) df['Married'].fillna(df['Married'].mode()[0],inplace=True) df['Dependents'].fillna(df['Dependents'].mode()[0],inplace=True) df['Education'].fillna(df['Education'].mode()[0],inplace=True) df['Self_Employed'].fillna(df['Self_Employed'].mode()[0],inplace=True) df['Property_Area'].fillna(df['Property_Area'].mode()[0],inplace=True) df['Loan_Status'].fillna(df['Loan_Status'].mode()[0],inplace=True) df['Credit_History'].fillna(df['Credit_History'].mode()[0],inplace=True) ### FILL WITH MEAN df['LoanAmount'].fillna(df['LoanAmount'].mean(),inplace=True) df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mean(),inplace=True) print("After :: MISSING VALUE COUNT\n",df.isna().sum()) # Handling Duplicate Values #Verify Duplicate Values/records print("BEFORE DROP :: DATA SIZE", df.shape) df.drop_duplicates(keep="first",inplace=True) #Verify Duplicate Values/records print("AFTER DROP :: DATA SIZE", df.shape) # Handling Outliers #NOTE:: DATA HAS OUTLIERS BUT THEY ARE VALID RESPONSES , HENCE WE ARE NOT DROPPING THE OUTLIERS. #verify Outliers #Plotting the box plot plt.figure(figsize=(20, 8)) sns.boxplot(data=df, orient="v", palette="Set2") plt.title("Box Plot of Soybean Dataset Features") plt.show() #DATA TRANSORMATION # Initialize the LabelEncoder label_encoder = LabelEncoder() print(df['Loan_Status'].value_counts()) # Fit and transform the columns df['Gender'] = label_encoder.fit_transform(df['Gender']) df['Married'] = label_encoder.fit_transform(df['Married']) df['Education'] = label_encoder.fit_transform(df['Education']) df['Self_Employed'] = label_encoder.fit_transform(df['Self_Employed']) df['Property_Area'] = label_encoder.fit_transform(df['Property_Area']) df['Loan_Status'] = label_encoder.fit_transform(df['Loan_Status']) print(df['Loan_Status'].value_counts()) df.to_csv('CleanFile.csv', index=False) ### MACHINE LEANING MODEL DESIGN AND EVALUATION #Feature Set X= df.iloc[:, :-1] #"Input Features #Target Variable/ Classs variable Y=df.iloc[:, [-1]] # print("Input Features (X) : \n" , X) # print("Target Variable/Class Variable (Y) : \n" , Y) # Split data into train and test sets X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) # Ensure Y_train and Y_test are 1D arrays Y_train = np.array(Y_train).ravel() Y_test = np.array(Y_test).ravel() # Train a Random Forest Classifier clf = RandomForestClassifier(n_estimators=100, random_state=42) clf.fit(X_train, Y_train) # Train the model # Y_train array Should be flatten into a 1D array clf.fit(X_train, Y_train) # Predict on the test data Y_pred = clf.predict(X_test) # Print accuracy accuracy = accuracy_score(Y_test, Y_pred) print(f"Accuracy: {accuracy:.2f}") # Print detailed classification report report = classification_report(Y_test, Y_pred) print("Classification Report:") print(report) #Predict the class of the new observation new_observation= pd.DataFrame([[1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2]], columns=X.columns) predicted_class = clf.predict(new_observation) #### 0 - NO 1 - YES if (predicted_class[0]=='0') : classLable="No" else: classLable="Yes" print("Predicted Class of NEW OBSERVATION :: ",classLable)
Preview:
downloadDownload PNG
downloadDownload JPEG
downloadDownload SVG
Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
Click to optimize width for Twitter