# Categorizing data into lengthier categories (part of jobs they like); 10% keyword match
# Mon Sep 04 2023 14:33:24 GMT+0000 (Coordinated Universal Time)
# Saved by @dr_dziedzorm
# Load necessary libraries
library(dplyr)
library(stringr)
library(readxl) # For reading .xlsx files
library(writexl) # For writing .xlsx files
# Load the survey workbook into `data`
data <- read_excel(
  path = "/Users/felixdavis/Desktop/civicpulse_important part of job.xlsx"
)
# Show the column headers so the question column name can be checked
print(names(data))
# Category lookup table: each name is a category label and each value is the
# vector of keyword phrases that signal that category.
categories <- list(
  "Communication and Information Dissemination" = c(
    "Clear and accurate information",
    "Debunking",
    "misinformation",
    "Open and honest communication"
  ),
  "Public Safety and Health" = c(
    "Safety measures",
    "Health guidelines",
    "Emergency services"
  ),
  "Leadership and Decision-Making" = c(
    "Policy-making",
    "Oversight and guidance",
    "Long-term planning"
  ),
  "Emotional and Psychological Support" = c(
    "Calm and reassurance",
    "Hope and positivity",
    "Emotional well-being"
  ),
  "Resource Management" = c(
    "Budget management",
    "Resource allocation",
    "Efficient service delivery"
  ),
  "Public Engagement" = c(
    "Listening to constituents",
    "Public engagement",
    "Transparency"
  ),
  "Crisis Management" = c(
    "Crisis preparedness",
    "Emergency response",
    "PPE provision"
  ),
  "Legal and Ethical Responsibility" = c(
    "Following state directives",
    "Ethical considerations",
    "Legal obligations"
  ),
  "Future Planning" = c(
    "Financial planning",
    "Future vision",
    "Sustainability"
  ),
  "Community Building" = c(
    "Quality of life",
    "Community needs",
    "Social connectivity"
  )
)
# Assign one free-text response to the first category whose keywords it mentions.
#
# Categories are tried in list order and, within a category, keyword phrases in
# vector order; the first phrase whose share of matched words exceeds
# `threshold` decides the result.
#
# Arguments:
#   row               A single response (length-1 vector); NA means no answer.
#   category_keywords Named list mapping category label -> character vector of
#                     keyword phrases. Defaults to the script-level `categories`.
#   threshold         Share of a phrase's words that must be present (strictly
#                     greater than) for the phrase to count as a match.
#   ignore_words      Connective words that are dropped from keyword phrases
#                     before matching because they carry no category signal.
#
# Returns: a length-1 character vector -- the category label,
#   "Uncategorized - NA" for a missing response, or "Uncategorized" when no
#   phrase matches.
categorize_row <- function(row,
                           category_keywords = categories,
                           threshold = 0.1,
                           ignore_words = c("and", "of", "to")) {
  if (length(row) != 1L) {
    stop("`row` must be a single response (length-1 vector)", call. = FALSE)
  }
  # A missing response cannot be matched against anything
  if (is.na(row)) {
    return("Uncategorized - NA")
  }
  response <- as.character(row)

  # TRUE when `word` occurs in the response as a whole word, ignoring case.
  # \Q...\E makes the word literal text, and \b stops short words from matching
  # inside longer ones (e.g. "PPE" inside "stepped").
  has_word <- function(word) {
    pattern <- paste0("\\b\\Q", word, "\\E\\b")
    grepl(pattern, response, ignore.case = TRUE, perl = TRUE)
  }

  skip <- tolower(ignore_words)
  for (cat in names(category_keywords)) {
    for (keyword in category_keywords[[cat]]) {
      # Split the phrase on whitespace; trimming first avoids empty tokens
      keyword_words <- strsplit(trimws(keyword), "[[:space:]]+")[[1]]
      # Without this, a bare "and" in a response would satisfy the threshold
      # for any phrase that contains "and"
      keyword_words <- keyword_words[!tolower(keyword_words) %in% skip]
      if (length(keyword_words) == 0L) {
        next
      }
      matching_words <- sum(vapply(keyword_words, has_word, logical(1)))
      if (matching_words / length(keyword_words) > threshold) {
        return(cat)
      }
    }
  }
  "Uncategorized"
}
# Column holding the free-text answers to be categorized
response_column <- "What do you think is the single most important part of your job as a public official?"
# Fail early with a clear message if the workbook lacks the question column,
# instead of silently categorizing NULL
if (!response_column %in% colnames(data)) {
  stop("Column not found in input data: ", response_column, call. = FALSE)
}
# Categorize every response. vapply() always yields a plain character vector
# (sapply() would return list() for zero rows), and USE.NAMES = FALSE keeps the
# full response text from being attached as element names.
data$Category <- vapply(
  data[[response_column]],
  categorize_row,
  character(1),
  USE.NAMES = FALSE
)
# Write the categorized data back to a new Excel file
write_xlsx(data, "/Users/felixdavis/Desktop/categorized_civicpulse_important part of job.xlsx")



# Comments