# dplyr: data manipulation
# tidyr: data cleaning
# %>%: pipe operator in R
install.packages('dplyr')
#install.packages('nycflights13')
library(dplyr)
library(nycflights13) # provides the `flights` data frame used in the examples
# dplyr functions ----
# 1. filter(df, <conditional expressions>): select a set of rows from a data
#    frame. Rows can be chosen by condition (filter()) or by position (slice()).

# Rows of `flights` with month 11, day 3 and carrier "AA"; comma-separated
# conditions are combined with AND.
head(filter(flights, month == 11, day == 3, carrier == "AA"))

# Base R subsetting on the month and day conditions. The element-wise `&`
# combines the two logical vectors into one row mask.
head(flights[flights$month == 11 & flights$day == 3, ])

# slice() picks rows by position (here rows 1 to 10); head() then shows the
# first six of them.
head(slice(flights, 1:10))
# Order the rows of a data frame: ascending by year, month and day, with ties
# broken by descending arrival time.
arrange(flights, year, month, day, desc(arr_time))

# Keep only the named columns of the data frame.
select(flights, month, day)
# Rename columns.
# Syntax: rename(df, new_col_name = old_col_name)
# The pair is written with a single `=`; `==` would be a comparison rather
# than a new-name/old-name pair.
rename(flights, airline_carrier = carrier)
# Select the unique values of a column. The carrier codes live in the
# `carrier` column of `flights`.
distinct(select(flights, carrier))
# mutate() adds new columns to the data frame and keeps the existing ones.
# Here the new column is the arrival delay minus the departure delay.
mutate(flights, new_column = arr_delay - dep_delay)

# transmute() computes the same column but returns only the new column.
transmute(flights, new_column = arr_delay - dep_delay)
# summarise() collapses the data in a column into a single value, e.g. a mean,
# sum or sd. na.rm = TRUE removes missing values before the mean is taken.
summarise(flights, new_average = mean(air_time, na.rm = TRUE))
# Draw rows at random from `flights`.
flights %>% sample_n(10) # 10 randomly chosen rows
flights %>% sample_frac(0.1) # a random 10% of the rows
# Comments