# Naive Bayes
#
# Sun Jun 08 2025 17:59:31 GMT+0000 (Coordinated Universal Time)
#
# Saved by @wayneinvein

# Load libraries
library(tm)
library(SnowballC)
library(caret)
library(e1071)

# Load and prepare data
# Read the CSV with text kept as character, name the columns Label and
# Message, and encode Label as a factor with "ham" as the first level and
# "spam" as the second (any other value becomes NA).
sms_url <- "https://raw.githubusercontent.com/jbrownlee/Datasets/master/sms_spam.csv"
sms_data <- read.csv(sms_url, stringsAsFactors = FALSE)
names(sms_data) <- c("Label", "Message")
sms_data[["Label"]] <- factor(sms_data[["Label"]], levels = c("ham", "spam"))

# Clean and preprocess text
# Build a corpus with one document per message, then run the cleaning steps
# in order: lower-case, strip punctuation, strip digits, drop English stop
# words, stem, and collapse repeated whitespace.
english_stopwords <- stopwords("english")
cleaning_steps <- list(
  content_transformer(tolower),
  removePunctuation,
  removeNumbers,
  function(doc) removeWords(doc, english_stopwords),
  stemDocument,
  stripWhitespace
)
corpus <- VCorpus(VectorSource(sms_data$Message))
for (cleaning_step in cleaning_steps) {
  corpus <- tm_map(corpus, cleaning_step)
}

# Create Document-Term Matrix
# One row per message, one column per term; the dense copy is turned into a
# data frame and the class label is appended as an extra column named Label.
dtm <- DocumentTermMatrix(corpus)
term_counts <- as.matrix(dtm)
dtm_df <- as.data.frame(term_counts)
dtm_df[["Label"]] <- sms_data[["Label"]]

# Split into training and testing sets
# 80% of the rows go to training; the seed is fixed so the partition is
# reproducible from run to run.
set.seed(123)
split_index <- createDataPartition(dtm_df[["Label"]], p = 0.8, list = FALSE)

# Position of the label column (the last column of the data frame), used
# below to separate features from labels.
label_col <- ncol(dtm_df)

# Training portion: features and labels
train_data <- dtm_df[split_index, ]
x_train <- train_data[, -label_col]
y_train <- train_data[["Label"]]

# Testing portion: features and labels
test_data <- dtm_df[-split_index, ]
x_test <- test_data[, -label_col]
y_test <- test_data[["Label"]]

# Train Naive Bayes model and predict
#
# e1071::naiveBayes() models every *numeric* predictor as a per-class
# Gaussian (mean and sd). Raw term counts are sparse, mostly-zero integers, so
# those Gaussians are badly mis-specified and many have sd = 0. The counts are
# therefore recoded as presence/absence factors, which naiveBayes() handles
# with per-class frequency tables instead.

# Minimum number of training messages a term must occur in to be kept.
# Terms rarer than this carry almost no evidence and mostly add noise and
# run time; tune as needed.
min_doc_freq <- 5

# Recode a vector of term counts as a two-level factor. The levels are fixed
# so training and test columns share the same coding even when a term never
# occurs in one of the two sets.
count_to_presence <- function(counts) {
  factor(counts > 0, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}

# Document frequency is computed on the training set only, so the test set
# has no influence on which terms are selected.
train_doc_freq <- vapply(x_train, function(counts) sum(counts > 0), numeric(1))
kept_terms <- names(train_doc_freq)[train_doc_freq >= min_doc_freq]
stopifnot(length(kept_terms) > 0)

x_train_nb <- x_train[, kept_terms, drop = FALSE]
x_train_nb[] <- lapply(x_train_nb, count_to_presence)
x_test_nb <- x_test[, kept_terms, drop = FALSE]
x_test_nb[] <- lapply(x_test_nb, count_to_presence)

# laplace = 1 keeps a term that was seen in only one class from forcing a
# zero conditional probability for the other class.
nb_model <- naiveBayes(x_train_nb, y_train, laplace = 1)
predictions <- predict(nb_model, x_test_nb)

# Evaluate performance
#
# Spam is the class being detected, so it is reported as the "positive"
# class. Without `positive`, caret uses the first factor level ("ham"), and
# Sensitivity / Specificity / Pos Pred Value would describe ham detection.
conf_mat <- confusionMatrix(
  data = predictions,
  reference = y_test,
  positive = "spam"
)
print(conf_mat)

# Overall accuracy as a percentage rounded to two decimals.
accuracy_pct <- round(conf_mat$overall[["Accuracy"]] * 100, 2)
cat("Accuracy:", accuracy_pct, "%\n")
# End of snippet