Categorizing data into lengthier categories (part of jobs they like); 10% keyword match

PHOTO EMBED

Mon Sep 04 2023 14:33:24 GMT+0000 (Coordinated Universal Time)

Saved by @dr_dziedzorm

# Load necessary libraries
library(dplyr)
library(stringr)
library(readxl)  # For reading .xlsx files
library(writexl)  # For writing .xlsx files

# Load the survey workbook into `data`.
data <- read_excel(
  path = "/Users/felixdavis/Desktop/civicpulse_important part of job.xlsx"
)

# Show the column names so the response column used later can be verified.
print(names(data))

# Category lookup table: each element name is a category label and each value
# is the character vector of keyword phrases associated with that category.
# Order matters to consumers that stop at the first matching category.
categories <- list(
  "Communication and Information Dissemination" = c(
    "Clear and accurate information",
    "Debunking",
    "misinformation",
    "Open and honest communication"
  ),
  "Public Safety and Health" = c(
    "Safety measures",
    "Health guidelines",
    "Emergency services"
  ),
  "Leadership and Decision-Making" = c(
    "Policy-making",
    "Oversight and guidance",
    "Long-term planning"
  ),
  "Emotional and Psychological Support" = c(
    "Calm and reassurance",
    "Hope and positivity",
    "Emotional well-being"
  ),
  "Resource Management" = c(
    "Budget management",
    "Resource allocation",
    "Efficient service delivery"
  ),
  "Public Engagement" = c(
    "Listening to constituents",
    "Public engagement",
    "Transparency"
  ),
  "Crisis Management" = c(
    "Crisis preparedness",
    "Emergency response",
    "PPE provision"
  ),
  "Legal and Ethical Responsibility" = c(
    "Following state directives",
    "Ethical considerations",
    "Legal obligations"
  ),
  "Future Planning" = c(
    "Financial planning",
    "Future vision",
    "Sustainability"
  ),
  "Community Building" = c(
    "Quality of life",
    "Community needs",
    "Social connectivity"
  )
)

# Assign one free-text response to the first category whose keywords it matches.
#
# Arguments:
#   row                A single response (length-1 value, normally character).
#                      NA, NULL, or a zero-length value is reported as
#                      "Uncategorized - NA".
#   category_keywords  Named list mapping a category label to a character
#                      vector of keyword phrases. Defaults to the script-level
#                      `categories` list, so existing one-argument calls keep
#                      working unchanged.
#   min_match_ratio    A keyword phrase counts as matched when the share of its
#                      words found in `row` is strictly greater than this value.
#
# Returns:
#   A length-1 character vector: the label of the first matching category (in
#   list order), "Uncategorized" when nothing matches, or "Uncategorized - NA"
#   for missing input.
#
# Notes:
#   * Words are compared as literal, case-insensitive substrings, so keyword
#     text is never interpreted as a regular expression.
#   * Because matching is by substring, a short word such as "and" is also
#     found inside longer words (e.g. "understand").
#   * With the default ratio of 0.1, a single matching word is enough for any
#     phrase of fewer than ten words.
categorize_row <- function(row, category_keywords = categories,
                           min_match_ratio = 0.1) {
  # Guard against missing input before any string work is attempted.
  if (length(row) == 0L || is.na(row[[1L]])) {
    return("Uncategorized - NA")
  }

  # Lower-case once so every comparison below is case-insensitive.
  text <- tolower(as.character(row[[1L]]))

  for (category in names(category_keywords)) {
    for (keyword in category_keywords[[category]]) {
      keyword_words <- strsplit(tolower(keyword), " ", fixed = TRUE)[[1L]]
      # Drop NA and empty tokens (e.g. from doubled spaces) so they can neither
      # break the match nor distort the ratio.
      keyword_words <- keyword_words[
        !is.na(keyword_words) & nzchar(keyword_words)
      ]
      if (length(keyword_words) == 0L) {
        next
      }

      word_found <- vapply(
        keyword_words,
        function(word) grepl(word, text, fixed = TRUE),
        logical(1),
        USE.NAMES = FALSE
      )

      if (sum(word_found) / length(keyword_words) > min_match_ratio) {
        return(category)
      }
    }
  }

  "Uncategorized"
}

# Categorize every response in the survey question column.
# vapply() guarantees a plain character vector with one label per row:
# sapply() would attach each full response text as an element name and would
# return an empty list (not character(0)) if the column had no rows.
data$Category <- vapply(
  data$`What do you think is the single most important part of your job as a public official?`,
  categorize_row,
  FUN.VALUE = character(1),
  USE.NAMES = FALSE
)

# Write the categorized data back to a new Excel file
write_xlsx(data, "/Users/felixdavis/Desktop/categorized_civicpulse_important part of job.xlsx")
content_copyCOPY