Snippets Collections
    DT[ i,  j,  by ] # + extra arguments
        |   |   |
        |   |    -------> grouped by what?
        |    -------> what to do?
         ---> on which rows?
function (w, d, s, l, i) {
                    w[l] = w[l] || [];
                    w[l].push({'gtm.start':
                                new Date().getTime(), event: 'gtm.js'});
                    var f = d.getElementsByTagName(s)[0],
                            j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : '';
                    j.async = true;
                    j.src =
                            'https://www.googletagmanager.com/gtm.js?id=' + i + dl;
                    f.parentNode.insertBefore(j, f);
                })(window, document, 'script', 'dataLayer', 'GTM-KK5FBSF')
ggplot(mpg) + 
  geom_point(var1, var2, colour = var3) + #specify variable to colour mapping
             scale_colour_brewer(type = "qual") 
pivot_longer(
  data,
  cols, #to select based on name use starts_with("pattern_"),
  names_to = "name",
  names_prefix = NULL,
  names_sep = NULL,
  names_pattern = NULL,
  names_ptypes = list(),
  names_transform = list(),
  names_repair = "check_unique",
  values_to = "value",
  values_drop_na = FALSE,
  values_ptypes = list(),
  values_transform = list(col1 = as.integer
                         col2 = as.numeric),
)

### Deriving variables from complex column headers ###

df %>%
  pivot_longer (
  	cols,
	names_to = c("var1", ".value"),
	names_sep = "_")
df %>% 
  # Impute/fill the column
  fill(col_name, .direction = "up") # .direction can be "up" or "down"
# put multiple obs from 1 cell on to multiple rows

df %>%
  separate_rows(col_w_many_obs, sep = ", ") 


#LINK TO:




#########Separate############

df %>% 
  separate(column_w_2_obs, into = c("obs1", "obs2"), sep = ",") 
### use "convert = TRUE" after sep arg to convert separeted columns into numeric if applicable.

#########Unite############
df %>%
  unite(united_col, col1, col2, sep = " ") #Sep = " " makes white space



# Generate pairs with same 1 variable in common (block)
pair_blocking(df1, df2, blocking = "variable_in_common") %>%
# See how pairs match on varialbes
  compare_pairs(by = c("var1", "var2", "var3"), 
      default_comparator = jaro_winkler()/lcs()/lm()) %>%
  # Score pairs
  score_problink() %>%
  # Select pairs that match best
  select_n_to_m() %>%
  # Link data according to score
  link()
library(fuzzyjoin) 

df1 %>% # Left join based on stringdist using city and city_actual cols
    stringdist_left_join(df2, by = c("df1_col" = "df2_col")) 
library(visdat)

vis_mis(df) # shows missing values in black and present values in grey for all columns
df %>%
  # theoretical_age: age of person based on birth
  mutate(theoretical_age = floor(as.numeric(birth %--% today(), "years"))) # get age in whole years
  
  # Define the date formats
formats <- c("%Y-%m-%d", "%B %d, %Y")

# Convert dates to the same format
df %>%
  mutate(date_column_cleaned = parse_date_time(date_column, formats)) 
# Define the date formats
formats <- c("%Y-%m-%d", "%B %d, %Y")

# Convert dates to the same format
df %>%
  mutate(date_column_cleaned = parse_date_time(date_column, formats)) 
#library(stringr)

df %>%     ##### if detecting regular characters #####
  filter(str_detect(colum1, "patern1" | str_detect(column1, "patern2")))


df %>%    ##### if detecting special characters #####
  filter(str_detect(colum1, fixed("(")) | str_detect(column1, fixed(")")))
vector_with_old_varnames_to_be_collapsed <- c("Var", "var", "Variable", "variables")

df %>% 
  mutate(collapsed_variables = fct_collapse(variables_column, 
                                     Variable = vector_with_old_varnames_to_be_collapsed)))
df %>%
  mutate(trimmed_column = str_trim(column),
        trimmed_column_lower = str_to_lower(trimmed_column))

df1 %>%
  full_join(df2, by = c("column_name_1", "column_name_2"), 
            suffix = c("_df1", "_df2")) 
%>%
  replace_na(list(n_batman = 0, n_star_wars = 0)) 
udl <- 32 #upper detection limit of machine is 32

matrinem %>% #dataset
  mutate(values_in_range = 
        replace(values, values > udl, udl)) #replace(col, condition, result)
  
library(DBI)
connection <- dbConnect(RMySQL::MySQL(),
                 dbname = "name",
                 host = "adress.amazonaws.com",
                 port = number,
                 user = "id",
                 password = "pw")
info <- dbGetQuery(connection, "SELECT column1 FROM database WHERE argument = something")

info
usethis::edit_rstudio_snippets()

#Use this syntax
snippet plonger #Snippet name
	pivot_longer(${1:mydf},
	             cols = ${2:columns to pivot long},
	             names_to = "${3:desired name for category column}",
	             values_to = "${4:desired name for value column}"
	)
library(ggpubr)

dat <- my_data

# Edit from here #
x <- which(names(dat) == "Species") # name of grouping variable
y <- which(names(dat) == "Sepal.Length" # names of variables to test
| names(dat) == "Sepal.Width"
| names(dat) == "Petal.Length"
| names(dat) == "Petal.Width")
method <- "t.test" # one of "wilcox.test" or "t.test"
paired <- FALSE # if paired make sure that in the dataframe you have first all individuals at T1, then all individuals again at T2

# Edit until here
# Edit at your own risk
for (i in y) {
  for (j in x) {
    ifelse(paired == TRUE,
      p <- ggpaired(dat,
        x = colnames(dat[j]), y = colnames(dat[i]),
        color = colnames(dat[j]), line.color = "gray", line.size = 0.4,
        palette = "npg",
        legend = "none",
        xlab = colnames(dat[j]),
        ylab = colnames(dat[i]),
        add = "jitter"
      ),
      p <- ggboxplot(dat,
        x = colnames(dat[j]), y = colnames(dat[i]),
        color = colnames(dat[j]),
        palette = "npg",
        legend = "none",
        add = "jitter"
      )
    )
    #  Add p-value
    print(p + stat_compare_means(aes(label = paste0(..method.., ", p-value = ", ..p.format.., " (", ifelse(..p.adj.. >= 0.05, "not significant", ..p.signif..), ")")),
      method = method,
      paired = paired,
      # group.by = NULL,
      ref.group = NULL
    ))
  }
}
library(ggpubr) #required package

dat <- my_data

# Edit from here
x <- which(names(dat) == "Group") # name of grouping variable
y <- which(
  names(dat) == "INFg" # names of variables to test
| names(dat) == "IL-10"
| names(dat) == "IL-12p70"
| names(dat) == "IL-1b"
| names(dat) == "IL-2"
| names(dat) == "IL-4"
| names(dat) == "IL-5"
| names(dat) == "IL-6"
| names(dat) == "KCGRO"
| names(dat) == "TNFa"
)
method1 <- "anova" # one of "anova" or "kruskal.test"
method2 <- "t.test" # one of "wilcox.test" or "t.test"
my_comparisons <- list(
  c("CON-BF", "FVT-FORM"
    ), 
  c("CON-BF", "SM-FORM"
    ), 
  c("FVT-FORM", "SM-FORM")
  ) # comparisons for post-hoc tests
# Edit until here


# Edit at your own risk
for (i in y) {
  for (j in x) {
    p <- ggboxplot(dat,
      x = colnames(dat[j]), y = colnames(dat[i]),
      color = colnames(dat[j]),
      legend = "none",
      palette = "npg",
      add = "jitter"
    )
    print(
      p + stat_compare_means(aes(label = paste0(..method.., ", p-value = ", ..p.format.., " (", ifelse(..p.adj.. > 0.05, "not significant", ..p.signif..), ")")),
        method = method1, label.y = max(dat[, i], na.rm = TRUE)
      )
      + stat_compare_means(comparisons = my_comparisons, method = method2, label = "p.format") # remove if p-value of ANOVA or Kruskal-Wallis test >= 0.05
    )
  }
}
library(motifmatchr)
anno <- getPeakAnnotation(ArchRProj = proj, name = "Motif")
motif_ix <- matchMotifs(anno$motifs, GR, genome = "hg38",out = "score")
score <- assays(motif_ix)$motifScores
apply(score,1,function(x) which(x==max(x)))
colnames(score)[tail(sort(as.matrix(score)), 10)]
#!/usr/bin/env bash

# Install R on WSL
sudo apt-get update -qq -y
sudo apt-get install -y wget git
OS_DISTRIBUTION=$(lsb_release -cs)
wget -O- http://neuro.debian.net/lists/${OS_DISTRIBUTION}.us-nh.full | sudo tee /etc/apt/sources.list.d/neurodebian.sources.list
sudo apt-key adv --recv-keys --keyserver hkp://pool.sks-keyservers.net:80 0xA5D32F012649A5A9
sudo apt-get update

sudo apt-get install libopenblas-base r-base
sudo apt-get update -qq -y
sudo apt-get install -y libgit2-dev
sudo apt-get install -y libcurl4-openssl-dev libssl-dev
sudo apt-get install -y zlib1g-dev libssh2-1-dev libpq-dev libxml2-dev
# create grouped column
test1$grouped_time = lubridate::floor_date(test1$DateTime, unit = "hour")
# (use ceiling_date instead if you want to round the half hours up instead of down)

# sum by group
library(dplyr)
test2 = test1 %>%
  group_by(grouped_time, LCLid, stdorToU, Acorn, Acorn_grouped) %>%
  summarize(KWH.hh.per.hour = sum(KWH.hh..per.half.hour.))
library(ggplot2)
library(ggrepel)

nba <- read.csv("http://datasets.flowingdata.com/ppg2008.csv", sep = ",")

nbaplot <- ggplot(nba, aes(x= MIN, y = PTS)) + 
  geom_point(color = "blue", size = 3)

### geom_label_repel
nbaplot + 
  geom_label_repel(aes(label = Name),
                  box.padding   = 0.35, 
                  point.padding = 0.5,
                  segment.color = 'grey50') +
  theme_classic()
wt_var <- function(x, w, na.rm = FALSE) {
  if (na.rm) {
    na <- is.na(x) | is.na(w)
    x <- x[!na]
    w <- w[!na]
  }
  wm <- weighted.mean(x, w)
  sqrdev <- (x - wm)^2
  (sum(sqrdev*w))
}


#==========================================================================
# R script for the Submodule 2.6 Population Density Maps - FB MOOC 
# Data-Pop Alliance
# Guillermo Romero, Researcher and data scientist
# Comments and questions: gromero@datapopalliance.org
# August, 2021
#==========================================================================

# To install libraries
install.packages("data.table")
install.packages("rgdal")
install.packages("ggplot2")
install.packages("lubridate")
install.packages("plyr")
install.packages("viridis")
install.packages("ggthemes")
install.packages("mapproj")
install.packages("spdplyr")
install.packages("geojsonio")

# Load libraries
library(data.table)
library(rgdal)
library(ggplot2)
library(lubridate)
library(plyr)
library(viridis)
library(ggthemes)
library(mapproj)
library(geojsonio)
library(spdplyr)


################################
# First Part
# Assign Total population into
# the area of interest
################################


# Set the working directory
setwd("~/Documents/DPA_tutorial")

# Open the total population file 
totalPop = fread('unzip -p population_mex_2018-10-01.csv.zip')

# Since this population density map file is big
# We are going to use only a subset of this file (for the area of interest) 
# zm stands for an area capturing Mexico city and the state of Mexico

zm <- subset(totalPop, longitude >= -101 & longitude <= -98)
zm <- subset(zm, latitude >= 18 & latitude <= 21)


# Open the spatial file (shapefile)
# The area of interest is Distrito Federal (Mexico city)
# And the neighborhood state called Mexico
# Note: in the DPA_tutorial folder, create a subfolder named shapefiles
# where you need to save your spatial files

geo <- readOGR(dsn="shapefiles", layer="gadm36_MEX_2")
geo <- geo[,c(4,6,7)]
g1<-subset(geo, NAME_1=="Distrito Federal")
g2<-subset(geo,NAME_1=="México")
geo<-rbind(g1,g2)

# Fortify allows you to work a dataframe from the spatial object

geodf<-fortify(geo)
geo$id <- row.names(geo) # allocate an id variable to the spatial object
coords<-zm[,c(2,1)] # define the coordinates columns
sp <- SpatialPoints(coords)
rm(coords)

# Define the Coordinate Reference System (CRS)
proj4string(sp) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
proj4string(geo) <-"+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"

# Assigning points into polygons
assign <- over(sp, geo)

# Use rownames to easily merge in the following step
assign$rous <- rownames(assign)
zm$rous<-rownames(zm)

assign$rous<-as.factor(as.character(assign$rous))
zm$rous<-as.factor(as.character(zm$rous))

names(assign)[1]<-"ID"

# Merge the assign object with the zm (Mexico city and Mexico state) area
zm.Map <- merge(x=assign, y=zm, by.x="rous", by.y="rous")
dim(zm.Map)
zm.Map <- zm.Map[!is.na(zm.Map$GID_2),]
head(zm.Map)

# Sum the population per municipality
# This is the total population density map per municipality
total.zm.Mun<-aggregate(population_2020~NAME_2+GID_2, FUN=sum, data=zm.Map, na.rm=TRUE)
head(total.zm.Mun)





################################
# Second Part
# Assign Women population into
# the area of interest
################################



# Women - High resolution population density map
# Open the women population file 
totalPop = fread('unzip -p mex_women_of_reproductive_age_15_49_2019-06-01_csv.zip')

# Since this population density map file is big
# We are going to use only a subset of this file (for the area of interest) 
# zm stands for an area capturing Mexico city and the state of Mexico

zm <- subset(totalPop, longitude >= -101 & longitude <= -98)
zm <- subset(zm, latitude >= 18 & latitude <= 21)

# Open the spatial file (shapefile)
# The area of interest is Distrito Federal (Mexico city)
# And the neighborhood state called Mexico

geo <- readOGR(dsn="shapes", layer="gadm36_MEX_2")
geo <- geo[,c(4,6,7)]
g1<-subset(geo, NAME_1=="Distrito Federal")
g2<-subset(geo,NAME_1=="México")
geo<-rbind(g1,g2)

geodf<-fortify(geo)
geo$id <- row.names(geo) 
coords<-zm[,c(2,1)]
sp <- SpatialPoints(coords)
rm(coords)

# Use the following Coordinate Reference System (CRS)
proj4string(sp) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
proj4string(geo) <-"+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"

#assigning points into polygons
assign <- over(sp, geo)
dim(assign)

assign$rous <- rownames(assign)
zm$rous<-rownames(zm)

assign$rous<-as.factor(as.character(assign$rous))
zm$rous<-as.factor(as.character(zm$rous))
names(assign)[1]<-"ID"

# merge
zm.Map <- merge(x=assign, y=zm, by.x="rous", by.y="rous")
dim(zm.Map)
zm.Map <- zm.Map[!is.na(zm.Map$GID_2),]
head(zm.Map)

# Sum the women population per municipality
# This is the women population density map per municipality
women.zm.Mun<-aggregate(population~NAME_2+GID_2, FUN=sum, data=zm.Map, na.rm=TRUE)
names(women.zm.Mun)[3]<-"women_population"
head(women.zm.Mun)


# Merge total- and women population per mun
pop.Map <- merge(x=total.zm.Mun, y=women.zm.Mun, by.x="GID_2", by.y="GID_2")

# pop.Map is the object containing both, the total and the women population info
pop.Map$women.perc<-pop.Map$women_population/pop.Map$population_2020
pop.Map<-pop.Map[,c(1,2,3,5,6)]
names(pop.Map)[2]<-"municipality"
head(pop.Map)

rm(totalPop)
rm(zm)


################################
# Third Part
# Incorporate mobility data and
# Calculate a risk score
################################


# Load Fb Range Maps (Mobility data) and select the country of interest
mobility = fread('unzip -p movement-range-data-2021-07-20.zip')
mex<-subset(mobility, country=="MEX")
rm(mobility)

# Merge mobility with the population (pop.Map) object previously created
mobility.pop.Map <- merge(x=pop.Map, y=mex, by.x="GID_2", by.y="polygon_id")
names(mobility.pop.Map)[6]<-"date"
head(mobility.pop.Map)

# Select dates of interest
# In this case, we are going to use a week during the quarantine
# Rename column accordingly

mobility.pop.Map$date<-as.Date(mobility.pop.Map$date, format="%Y-%m-%d")
mobility.pop.quarantine<-subset(mobility.pop.Map, date>="2020-05-01" & date<="2020-05-07")
names(mobility.pop.quarantine)[11]<-"population.staying.home" 
head(mobility.pop.quarantine)

# Get the media of population staying home per municipality
mobility.pop.quarantine<-aggregate(population.staying.home~GID_2+women_population+women.perc+population_2020, FUN=mean, data=mobility.pop.quarantine, na.rm=TRUE)

# Calculate a weighted risk score
attach(mobility.pop.quarantine)
mobility.pop.quarantine$Risk.Score<-(women.perc)*(1-population.staying.home)*(sum(pop.Map$women_population))

# Normalize risk score
attach(mobility.pop.quarantine)
mobility.pop.quarantine$nRisk.Score<-(Risk.Score-min(Risk.Score))/(max(Risk.Score)-min(Risk.Score))
mobility.pop.quarantine<-mobility.pop.quarantine[,c(1,7)]
head(mobility.pop.quarantine)



################################
# Map using ggplot2
################################




# Choropleth Map with risk score

shapefile <- readOGR(dsn="shapefiles", layer="gadm36_MEX_2")
g1<-subset(shapefile, NAME_1=="Distrito Federal")
g2<-subset(shapefile,NAME_1=="México")
shapefile<-rbind(g1,g2)

shapefile@data$id = rownames(shapefile@data)
head(shapefile@data)
class(shapefile@data)
dim(shapefile@data)

#shapefile.points = fortify(shapefile, region = "id")
shapefile.points = fortify(shapefile)
shapefile.df = join(shapefile.points, shapefile@data, by = "id")
head(shapefile.df)
shapefile.df = subset(shapefile.df, select = c(long, lat, group, GID_2))
sort(unique(shapefile.df$GID_2))

#names(shapefile.df) = c("long", "lat", "group", "NAME1")
risk.map = join(shapefile.df, mobility.pop.quarantine, by = "GID_2", type = "full")
head(risk.map)
dim(risk.map)


#chrolopleth
#png("zm.risk.score.map.png", width = 5, height = 5, units = 'in', res = 300)
p0 <- ggplot(data = risk.map,
             mapping = aes(x = long, y = lat,
                 group = group,
                 fill = nRisk.Score))
p1 <- p0 + geom_polygon(color = "gray90", size = 0.05) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45)

p2 <- p1 + scale_fill_viridis_c(option = "magma", direction = -1)
p2 + theme_map() + #facet_wrap(~ year, ncol = 2) +
    theme(legend.position = "right",
          strip.background = element_blank()) +
    #labs(fill = "Changement en pourcentage / semaine",
    #     "title = "Opiate Related Deaths by State, 2000-2014")
     labs(fill = "nRisk.Score")
#dev.off()



################################
# Creation of files for kepler:
# risk score
################################

riskScore<-subset(risk.map, !duplicated(GID_2)) 
head(riskScore)

shapefile <- readOGR(dsn="shapefiles", layer="gadm36_MEX_2")
g1<-subset(shapefile, NAME_1=="Distrito Federal")
g2<-subset(shapefile,NAME_1=="México")
shapefile<-rbind(g1,g2)
shapefile<-shapefile[,c(6,7)]

shapefile@data = join(shapefile@data, riskScore, by = "GID_2", type = "full")
head(shapefile@data)
class(shapefile)

#to convert to geojson format
json <- geojson_json(shapefile)# Simplify the geometry information of GeoJSON.
geojson_write(json, geometry = "polygon", file = "nrisk.Score.geojson")






################################
# Creation of files for kepler:
# # Pre-COVID temporal window. Mobility data
################################



mobility = fread('unzip -p /home/guillermo/Documents/dpa/fbMooc/population.maps/movement-range-data-2021-07-20.zip')
mex<-subset(mobility, country=="MEX")
mex<-mex[,c(4,1,7)]

names(mex)[1]<-"GID_2"
names(mex)[2]<-"date"
names(mex)[3]<-"population.staying.home"

#select a temporal window of one week
#data starts at 2020-03-01
mex$date<-as.Date(mex$date, format="%Y-%m-%d")
mex<-subset(mex, date<="2020-03-07")
mex<-aggregate(population.staying.home~GID_2, FUN=mean, data=mex, na.rm=TRUE)


#need coordinates
#get them the previous object named: risk.map

coord<-subset(risk.map, !duplicated(GID_2)) 
coord<-coord[,c(1:4)]

#add coordinates to mex object
mex <- merge(x=mex, y=coord, by="GID_2", all.y=TRUE)


shapefile <- readOGR(dsn="shapefiles", layer="gadm36_MEX_2")
g1<-subset(shapefile, NAME_1=="Distrito Federal")
g2<-subset(shapefile,NAME_1=="México")
shapefile<-rbind(g1,g2)
shapefile<-shapefile[,c(6,7)]

shapefile@data = join(shapefile@data, mex, by = "GID_2", type = "full")
head(shapefile@data)
class(shapefile)

json <- geojson_json(shapefile)# Simplify the geometry information of GeoJSON.
geojson_write(json, geometry = "polygon", file = "mobility.precovid.geojson")




################################
# Creation of files for kepler:
# Quarantine week. Mobility data
################################


mobility = fread('unzip -p /home/guillermo/Documents/dpa/fbMooc/population.maps/movement-range-data-2021-07-20.zip')
mex<-subset(mobility, country=="MEX")
mex<-mex[,c(4,1,7)]

names(mex)[1]<-"GID_2"
names(mex)[2]<-"date"
names(mex)[3]<-"atHome"

mex$date<-as.Date(mex$date, format="%Y-%m-%d")
mex<-subset(mex, date>="2020-05-01")
mex<-subset(mex, date<="2020-05-07")

mex<-aggregate(atHome~GID_2, FUN=mean, data=mex, na.rm=TRUE)


#need coordinates
#get them from previous object named: risk.map
coord<-subset(risk.map, !duplicated(GID_2)) 
coord<-coord[,c(1:4)]

#add coordinates to mex object
mex <- merge(x=mex, y=coord, by="GID_2", all.y=TRUE)

shapefile <- readOGR(dsn="shapes", layer="gadm36_MEX_2")
g1<-subset(shapefile, NAME_1=="Distrito Federal")
g2<-subset(shapefile,NAME_1=="México")
shapefile<-rbind(g1,g2)
shapefile<-shapefile[,c(6,7)]


shapefile@data = join(shapefile@data, mex, by = "GID_2", type = "full")
head(shapefile@data)
class(shapefile)

#convert into geojson format
json <- geojson_json(shapefile)
geojson_write(json, geometry = "polygon", file = "mobility.quarantine.geojson")






################################
# Creation of files for kepler:
# population density map sample
################################

totalPop = fread('unzip -p population_mex_2018-10-01.csv.zip')

#get a sample of your total pop
sampling <- totalPop[sample(nrow(totalPop), 174579), ]

#write
write.csv(sampling, "population.density.map.sample.csv", row.names=FALSE)
clrobustse <- function(fit.model, clusterid) {
  rank=fit.model$rank
  N.obs <- length(clusterid)            
  N.clust <- length(unique(clusterid))  
  dfc <- N.clust/(N.clust-1)                    
  vcv <- vcov(fit.model)
  estfn <- estfun(fit.model)
  uj <- apply(estfn, 2, function(x) tapply(x, clusterid, sum))
  N.VCV <- N.obs * vcv
  ujuj.nobs  <- crossprod(uj)/N.obs
  vcovCL <- dfc*(1/N.obs * (N.VCV %*% ujuj.nobs %*% N.VCV))
  coeftest(fit.model, vcov=vcovCL)
}
clrobustse(UC.models[[1]], (contest.user.level.data %>% select(entered.contest,contest.format,total.prizes,contest.duration.hours,num.winners,max.prize,min.prize,binary.reads.cap,prize.sd,topic_id) %>% drop_na())$topic_id)
# function to compute total within-cluster sum of squares. 
fviz_nbclust(contest.scaled, kmeans, method = "wss", k.max = 50,verbose=TRUE,print.summary=TRUE) + 
  theme_minimal() +
  ylab("Total Within Sum of Squares")+
  xlab("Number of clusters (k)")+
  ggtitle("Determining optimal number of clusters using the Elbow Method")+theme(plot.title=element_text(hjust=0.5))
library(plotly)

df <- data.frame(x = runif(200), y = runif(200), z = runif(200), j = runif(200), k = rep(0.7, 200), i = rnorm(200,0.6,0.05))

create_buttons <- function(df, y_axis_var_names) {
  lapply(
    y_axis_var_names,
    FUN = function(var_name, df) {
      button <- list(
        method = 'restyle',
        args = list('y', list(df[, var_name])),
        label = sprintf('Show %s', var_name)
      )
    },
    df
  )
  
}

y_axis_var_names <- c('y', 'z', 'j', 'k', 'i')

p <- plot_ly(df, x = ~x, y = ~y, mode = "markers", name = "A", visible = T) %>%
     layout(
         title = "Drop down menus - Styling",
         xaxis = list(domain = c(0.1, 1)),
         yaxis = list(title = "y"),
         updatemenus = list(
             list(
                 y = 0.7,
                 buttons = create_buttons(df, y_axis_var_names)
             )
         ))
p


left_join(x, y, by='Flag') %>%
              left_join(., z, by='Flag') 
read_delim(
  file,
  delim,
  quote = "\"",
  escape_backslash = FALSE,
  escape_double = TRUE,
  col_names = TRUE,
  col_types = NULL,
  locale = default_locale(),
  na = c("", "NA"),
  quoted_na = TRUE,
  comment = "",
  trim_ws = FALSE,
  skip = 0,
  n_max = Inf,
  guess_max = min(1000, n_max),
  progress = show_progress(),
  skip_empty_rows = TRUE
)
# load packages
require(FactoMineR)
require(ggplot2)
# load data tea
data(tea)
# select these columns
newtea = tea[, c("Tea", "How", "how", "sugar", "where", "always")]
# take a look
head(newtea)


# number of categories per variable
cats = apply(newtea, 2, function(x) nlevels(as.factor(x)))
cats

# apply MCA
mca1 = MCA(newtea, graph = FALSE)

# table of eigenvalues
mca1$eig


# data frames for ggplot
mca1_vars_df = data.frame(mca1$var$coord, Variable = rep(names(cats), 
                                                         cats))
mca1_obs_df = data.frame(mca1$ind$coord)

# plot of variable categories
ggplot(data = mca1_vars_df,
       aes(x = Dim.1, y = Dim.2, label = rownames(mca1_vars_df))) + 
  geom_hline(yintercept = 0, colour = "gray70") + geom_vline(xintercept = 0, 
                                              colour = "gray70") +
  geom_text(aes(colour = Variable)) + 
  ggtitle("MCA plot of variables using R package FactoMineR")


# XXX ---------------------------------------------------------------------

Base_acm <- Base %>% select(P1_1, P3_1, P3_2, P3_3)
Base_acm$P1_1 <- as.factor(Base_acm$P1_1)
Base_acm$P3_1 <- as.factor(Base_acm$P3_1)
Base_acm$P3_2 <- as.factor(Base_acm$P3_2)
Base_acm$P3_3 <- as.factor(Base_acm$P3_3)

cats=apply(Base_acm, 2, function(x) nlevels(as.factor(x)))

mca2 = MCA(Base_acm, graph = FALSE)

# data frames for ggplot
mca2_vars_df = data.frame(mca2$var$coord, Variable = rep(names(cats), 
                                                         cats))
mca2_obs_df = data.frame(mca2$ind$coord)

# plot of variable categories
ggplot(data = mca2_vars_df,
       aes(x = Dim.1, y = Dim.2, label = rownames(mca2_vars_df))) + 
  geom_hline(yintercept = 0, colour = "gray70") + geom_vline(xintercept = 0, 
                                                        colour = "gray70") +
  geom_text(aes(colour = Variable)) + 
  ggtitle("MCA plot of variables using R package FactoMineR")

factoextra::fviz_screeplot(mca2, addlabels = TRUE, ylim = c(0, 45))
# cells = try(CustomSubset(group.info, min_n=50, max_total=2000))
      # if (inherits(cells, 'try-error')) {
      #   cells <- c(cells.1, cells.2)
      # }
      
      
img <- tryCatch(
      {
        .GetImageSize(slide.file)
      },
      error = function(e) {
        img.path <- list.files(slide.file, full.names=TRUE)[[1]] #Assuming all scan images of the same batch is of similar size, we only need to take the first one
        img <- .GetImageSize(img.path)
        return(img)
        })
        
if (!require('tidyverse')) install.packages('tidyverse'); library('tidyverse')
if (!require('gapminder')) install.packages('gapminder'); library('gapminder')

gap <- gapminder %>% filter(year == 1987 | year == 1992)

gap %>%
  group_by(year, continent) %>% 
  summarise(average = mean(gdpPercap)) %>% 
  spread(continent, average)


gap %>%
  group_by(continent, year) %>% 
  summarise(average = mean(gdpPercap)) %>% 
  spread(year, average)
ggplot(data = <DATA>) + 
  <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))
mutate(var_name = str_replace(var_name, "Q", "q"))
mutate(r_varname = ifelse(p_varname > 0.05, "-", r_varname))
install.packages("tidyverse")
library(ggplot2)
ggplot(mtcars, aes(hp, mpg)) + 
       geom_point() +
       labs(x = bquote('x axis'~(Å^2)), y = "y axis") +
       #or
       #labs(x = bquote('x axis'~(ring(A)^2)), y = "y axis") 
       theme_bw()
theme(strip.background = element_rect(colour = "black", fill = "white"))
library(tidyverse)
library(purrr)
library(zoo)

data %>% 
  group_by(group_name) %>% 
  nest() %>% 
  mutate(data = pmap(list(data),
                     ~ mutate(.x,
                              r_mean = lag(rollmean(x = var_name, k = 7, fill = NA, align = "right")))
                     )
  ) %>% 
  unnest(cols = data)
try(log("not a number"), silent = TRUE)
print("errors can't stop me")
library(ggplot2)
library("ggpubr")
theme_set(
  theme_bw() +
    theme(legend.position = "top")
  )
#Outlier Removal Function
remove_outliers <- function(dataframe, column, na.rm = TRUE) {
  Q1 <- quantile(column, .25)
  Q3 <- quantile(column, .75)
  IQR <- IQR(column)
  dataframe <- subset(dataframe, column> (Q1 - 1.5*IQR) & column< (Q3 + 1.5*IQR))
}

#Not looping
df_1 <-remove_outliers(df_1,df_1$Column1)

#Loop through a list of column names
for(i in Column_List){
  df_1 <- remove_outliers(df_1, df_1[[i]])
}  
# Account
statement <- read.csv("statement.csv", stringsAsFactors = F, sep = ";")

# Get rid of irrelevant columns
statement <- statement[2:13]

# Statement money flows
rahavoog <- statement[statement$Reatüüp == 20,]
rahavoog$Summa <- as.numeric(gsub(",", ".", rahavoog$Summa))

str_order(
  x,
  decreasing = FALSE,
  na_last = TRUE,
  locale = "en",
  numeric = FALSE,
  ...
)
  
# Examples 
  
  str_order(letters)
#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
#> [26] 26
str_sort(letters)
#>  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s"
#> [20] "t" "u" "v" "w" "x" "y" "z"

str_order(letters, locale = "haw")
#>  [1]  1  5  9 15 21  2  3  4  6  7  8 10 11 12 13 14 16 17 18 19 20 22 23 24 25
#> [26] 26
str_sort(letters, locale = "haw")
#>  [1] "a" "e" "i" "o" "u" "b" "c" "d" "f" "g" "h" "j" "k" "l" "m" "n" "p" "q" "r"
#> [20] "s" "t" "v" "w" "x" "y" "z"

x <- c("100a10", "100a5", "2b", "2a")
str_sort(x)
#> [1] "100a10" "100a5"  "2a"     "2b"    
str_sort(x, numeric = TRUE)
#> [1] "2a"     "2b"     "100a5"  "100a10"
dir <- "directory of archives"

directories <- as.list(file.path(dir, list.files(dir, pattern = ".*.csv")))

dataframes <- map(directories , rio::import)

dataset <- plyr::ldply(dataframes, data.frame)
var <- "mpg"
#Doesn't work
mtcars$var
#These both work, but note that what they return is different
# the first is a vector, the second is a data.frame
mtcars[[var]]
mtcars[var]
# make data frame 
a <- data.frame( x =  c(1,2,3,4)) 
b <- data.frame( y =  c(1,2,3,4,5,6))
subset(b, !(y %in% a$x)) #Pulls values from frame b that are not in frame a
library(tidyverse)

df$column <- df$column str_to_title(df$column) #Standardize caps

#Two options 
df$column <- str_replace(df$column, "old", "new") #Option 1, uses str_replace
df$column <- gsub("old", "new", df$column) #Option 2, uses str_replace  
x <- data$variable_xaxis
y <- data$variable_yaxis

plot(x, y, main = "Main title",
     xlab = "X axis title", ylab = "Y axis title",
     pch = 19, frame = FALSE)
abline(lm(y ~ x, data = mtcars), col = "blue")
g <- ggplot(data_set, aes(x-variable, y-variable) 

g + geom_point()
qplot(x-coordinate, y-coordinate, data=data_frame, geom = c("point", "smooth")
x <- data$variable_xaxis
y <- data$variable_yaxis

plot(x, y, main = "Main title",
     xlab = "X axis title", ylab = "Y axis title",
     pch = 19, frame = FALSE)
abline(lm(y ~ x, data = mtcars), col = "blue")
star

Sun Sep 04 2022 11:30:17 GMT+0000 (UTC) https://rdatatable.gitlab.io/data.table/reference/data.table.html#examples

#r
star

Fri Aug 12 2022 14:31:36 GMT+0000 (UTC)

#r
star

Sun Jun 19 2022 11:56:18 GMT+0000 (UTC)

#r #ggplot2
star

Thu Jun 02 2022 07:55:15 GMT+0000 (UTC)

#r #tidyr
star

Thu Jun 02 2022 07:35:31 GMT+0000 (UTC)

#r #tidyr
star

Wed Jun 01 2022 12:46:14 GMT+0000 (UTC)

#r
star

Wed Jun 01 2022 11:47:36 GMT+0000 (UTC)

#r
star

Wed Jun 01 2022 11:18:43 GMT+0000 (UTC)

#r
star

Wed Jun 01 2022 11:02:35 GMT+0000 (UTC)

#r
star

Mon May 30 2022 12:30:42 GMT+0000 (UTC)

#r #sql
star

Sun May 22 2022 16:32:49 GMT+0000 (UTC) https://www.infoworld.com/article/3637083/never-look-up-tidyrs-pivotwider-or-pivotlonger-again.html

#r
star

Tue May 17 2022 09:27:19 GMT+0000 (UTC) https://www.r-bloggers.com/2020/03/how-to-do-a-t-test-or-anova-for-many-variables-at-once-in-r-and-communicate-the-results-in-a-better-way/

#matrinem #r
star

Sun May 15 2022 17:04:56 GMT+0000 (UTC) http://datacamp-community-prod.s3.amazonaws.com/c1fae72f-d2d7-4646-9dce-dd0f8fb5c5e8

#r #tidyverse
star

Sat Apr 02 2022 07:05:37 GMT+0000 (UTC) https://stackoverflow.com/questions/17735859/for-each-row-return-the-column-name-of-the-largest-value

#r
star

Wed Feb 02 2022 22:06:40 GMT+0000 (UTC) https://github.com/jimbrig/dotfiles-wsl/blob/main/scripts/dev/scripts/install-R.sh

#r #installation #linux #bash #wsl
star

Thu Nov 25 2021 00:44:48 GMT+0000 (UTC) https://stackoverflow.com/questions/42417948/how-to-use-size-and-decay-in-nnet

#r
star

Thu Nov 25 2021 00:00:30 GMT+0000 (UTC) https://rstudio-pubs-static.s3.amazonaws.com/473000_082e869ca8ca44dbb9f8dda1b2a251d2.html

#r
star

Wed Nov 24 2021 23:59:48 GMT+0000 (UTC) https://cran.r-project.org/web/packages/caretEnsemble/vignettes/caretEnsemble-intro.html

#r
star

Wed Nov 24 2021 23:54:51 GMT+0000 (UTC) https://stackoverflow.com/questions/45156400/r-caretensemble-passing-a-fit-param-to-one-specific-model-in-caretlist

#r
star

Fri Nov 12 2021 15:28:24 GMT+0000 (UTC) https://stackoverflow.com/questions/59395540/reducing-time-series-data-from-half-hour-to-hourly-in-r

#r
star

Wed Nov 10 2021 15:55:10 GMT+0000 (UTC) https://stackoverflow.com/questions/15624656/label-points-in-geom-point

#r
star

Wed Sep 01 2021 00:49:08 GMT+0000 (UTC)

#r
star

Fri Aug 27 2021 15:46:23 GMT+0000 (UTC)

#r
star

Thu Aug 26 2021 09:54:51 GMT+0000 (UTC) https://stackoverflow.com/questions/33927766/logit-binomial-regression-with-clustered-standard-errors

#r
star

Wed Aug 25 2021 16:24:09 GMT+0000 (UTC) https://towardsdatascience.com/10-tips-for-choosing-the-optimal-number-of-clusters-277e93d72d92

#r
star

Thu Aug 19 2021 14:03:39 GMT+0000 (UTC) https://stackoverflow.com/questions/40024029/plotly-updating-data-with-dropdown-selection

#r
star

Wed Aug 18 2021 21:02:15 GMT+0000 (UTC) https://uc-r.github.io/kmeans_clustering

#r
star

Tue Jun 01 2021 15:21:20 GMT+0000 (UTC) https://stackoverflow.com/questions/32066402/how-to-perform-multiple-left-joins-using-dplyr-in-r/32066419

#r
star

Tue Jun 01 2021 15:00:09 GMT+0000 (UTC) https://readr.tidyverse.org/reference/read_delim.html

#r
star

Thu May 20 2021 21:04:33 GMT+0000 (UTC)

#r
star

Thu May 20 2021 10:07:18 GMT+0000 (UTC)

#r
star

Fri May 14 2021 20:53:58 GMT+0000 (UTC)

#r
star

Sat May 08 2021 22:27:18 GMT+0000 (UTC) https://r4ds.had.co.nz/data-visualisation.html

#r #ggplot2 #visualization
star

Mon Apr 26 2021 17:39:49 GMT+0000 (UTC)

#r
star

Fri Apr 23 2021 20:35:43 GMT+0000 (UTC) https://stackoverflow.com/questions/66630759/model-formula-for-two-way-interactions-between-one-set-of-variables-and-another/66630789#66630789

#r
star

Mon Apr 12 2021 03:38:09 GMT+0000 (UTC) https://www.tidyverse.org/

#r
star

Thu Mar 25 2021 16:57:42 GMT+0000 (UTC)

#r
star

Wed Mar 24 2021 22:17:16 GMT+0000 (UTC)

#r
star

Wed Mar 24 2021 14:24:49 GMT+0000 (UTC)

#r
star

Wed Mar 24 2021 14:22:53 GMT+0000 (UTC)

#r
star

Sun Mar 21 2021 10:33:11 GMT+0000 (UTC) https://www.datanovia.com/en/lessons/combine-multiple-ggplots-into-a-figure/

#r
star

Mon Mar 15 2021 05:12:08 GMT+0000 (UTC) https://stringr.tidyverse.org/reference/str_order.html

#r
star

Fri Mar 12 2021 19:23:49 GMT+0000 (UTC)

#r
star

Thu Mar 11 2021 14:57:05 GMT+0000 (UTC) https://stackoverflow.com/questions/18222286/dynamically-select-data-frame-columns-using-and-a-character-value

#r
star

Sun Mar 07 2021 02:05:35 GMT+0000 (UTC) https://stackoverflow.com/questions/5812478/how-i-can-select-rows-from-a-dataframe-that-do-not-match

#r
star

Sun Mar 07 2021 02:01:46 GMT+0000 (UTC)

#r
star

Fri Mar 05 2021 12:52:40 GMT+0000 (UTC)

#r
star

Sun Sep 27 2020 20:07:34 GMT+0000 (UTC)

#r
star

Sun Sep 27 2020 14:58:51 GMT+0000 (UTC)

#r
star

Sun Sep 13 2020 17:36:11 GMT+0000 (UTC)

#r

Save snippets that work with our extensions

Available in the Chrome Web Store Get Firefox Add-on Get VS Code extension