# Task: Choose a publicly available dataset, load it into R, and perform data
# preparation tasks such as data cleaning, handling missing values, and
# feature engineering.
# Saved: Wed Mar 19 2025 17:30:27 GMT+0000 (Coordinated Universal Time)
# Saved by @wayneinvein
# Install and Load Necessary Libraries ----
# Install only the packages that are not already available, so re-running the
# script does not reinstall them (and does not need network access) each time.
required_pkgs <- c("titanic", "dplyr")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}
library(titanic)
library(dplyr)

# Load Titanic Dataset ----
data <- titanic::titanic_train

# Handle Missing Values ----
# Age: replace missing ages with the median of the observed ages.
data$Age[is.na(data$Age)] <- median(data$Age, na.rm = TRUE)
# Embarked: titanic_train records an unknown port as an empty string rather
# than NA, so both encodings must be excluded; testing is.na() alone would
# keep those rows and later produce a spurious "" factor level.
data <- filter(data, !is.na(Embarked), Embarked != "")

# Convert Categorical Variables to Factors and Engineer Features ----
data <- data %>%
  mutate(
    Sex = as.factor(Sex),
    Embarked = as.factor(Embarked),
    Pclass = as.factor(Pclass),
    # Passenger plus siblings/spouses (SibSp) and parents/children (Parch).
    FamilySize = SibSp + Parch + 1,
    # 1 when the passenger travels without family, 0 otherwise.
    IsAlone = as.integer(FamilySize == 1),
    # scale() returns a one-column matrix with centering/scaling attributes;
    # as.numeric() keeps Fare as a plain standardized numeric vector.
    Fare = as.numeric(scale(Fare))
  )

# Final Dataset Check ----
str(data)
summary(data)
# Copy
# Comments