Choose a publicly available dataset, load it into R, and perform data preparation tasks, such as data cleaning, handling missing values, and feature engineering.

PHOTO EMBED

Wed Mar 19 2025 17:30:27 GMT+0000 (Coordinated Universal Time)

Saved by @wayneinvein

# Install and Load Necessary Libraries
# Install only the packages that are not already available, so re-running the
# script does not re-download and reinstall them (and still works offline when
# they are present). local() keeps the helper variables out of the workspace.
local({
  required <- c("titanic", "dplyr")
  available <- vapply(required, requireNamespace, logical(1), quietly = TRUE)
  to_install <- required[!available]
  if (length(to_install) > 0) {
    install.packages(to_install)
  }
})
library(titanic)
library(dplyr)

# Load Titanic Dataset
data <- titanic::titanic_train

# Handle Missing Values
# Age: replace missing ages with the median of the observed ages.
data$Age[is.na(data$Age)] <- median(data$Age, na.rm = TRUE)
# Embarked: titanic_train records an unknown port as an empty string rather
# than NA, so an is.na() test alone removes nothing and "" would later become
# a spurious factor level. Drop both NA and empty-string rows.
data <- filter(data, !is.na(Embarked), Embarked != "")

# Convert Categorical Variables to Factors and Engineer Features
data <- data %>%
  mutate(
    # Categorical predictors are stored as factors.
    Sex = as.factor(Sex),
    Embarked = as.factor(Embarked),
    Pclass = as.factor(Pclass),
    # Family size = SibSp + Parch + the passenger themselves.
    FamilySize = SibSp + Parch + 1,
    # 1 when the passenger has no family aboard, 0 otherwise.
    IsAlone = as.integer(FamilySize == 1),
    # scale() returns a one-column matrix carrying "scaled:center" and
    # "scaled:scale" attributes; as.numeric() flattens it so Fare stays an
    # ordinary numeric column instead of a matrix column in the data frame.
    Fare = as.numeric(scale(Fare))
  )

# Final Dataset Check
# str() writes its output itself, but summary() only returns a value that is
# shown through top-level auto-printing. Print it explicitly so the summary is
# still displayed when the script is run with source().
str(data)
print(summary(data))
content_copyCOPY