# dplyr cheat sheet ----
#
# Worked examples of the core single-table dplyr verbs, run against the
# `flights` data set from the nycflights13 package.
#
#   dplyr : data manipulation
#   tidyr : data cleaning
#   %>%   : pipe operator in R

# Setup ----

# Install dplyr only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
# Uncomment on first use if the example data package is not installed yet.
# install.packages("nycflights13")

library(dplyr)
library(nycflights13) # provides the `flights` data frame used below

# The examples refer to the data as `df`; it is the same object as `flights`.
df <- flights

# Filter rows: filter(), slice() ----

# Syntax: filter(df, <conditional expressions>)
# Selects the rows of a data frame for which every condition holds.
head(filter(df, month == 11, day == 3, carrier == "AA"))

# Base R version of the month/day conditions. Conditions are combined with
# the elementwise `&` operator (`$` is the column-extraction operator and is
# not valid here).
head(flights[flights$month == 11 & flights$day == 3, ])

# slice() selects rows by position.
head(slice(df, 1:10))

# Order rows: arrange() ----

# Sort by year, month, day, then by arrival time in descending order.
arrange(df, year, month, day, desc(arr_time))

# Select columns: select() ----

select(df, month, day)

# Rename columns: rename() ----

# Syntax: rename(df, new_col_name = old_col_name)
# Note the single `=`; `==` is a comparison and does not rename anything.
rename(df, airline_carrier = carrier)

# Unique values of a column: distinct() ----

distinct(select(df, carrier))

# Add new columns: mutate() ----

# Syntax: mutate(df, new_column = arrival_column - depart_column)
mutate(df, new_column = arr_delay - dep_delay)

# transmute() works like mutate() but returns only the newly created columns.
# Syntax: transmute(df, new_column = arrival_column - depart_column)
transmute(df, new_column = arr_delay - dep_delay)

# Collapse a column to a single value: summarise() ----

# Works with any aggregate such as mean(), sum() or sd().
# Syntax: summarise(df, new_average = mean(airtime_column, na.rm = TRUE))
# na.rm = TRUE drops missing values before averaging.
summarise(flights, new_average = mean(air_time, na.rm = TRUE))

# Random sampling of rows ----

# NOTE(review): recent dplyr documents sample_n()/sample_frac() as superseded
# by slice_sample(); they are kept here because the notes were written for
# them.
sample_n(flights, 10)      # randomly samples 10 rows
sample_frac(flights, 0.1)  # randomly samples 10% of the rows