Tidy Tuesday is a weekly data project aimed at the R ecosystem. An emphasis is placed on understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.
The data came from Bird Studies Canada. It includes bird count information such as year, bird species, and number of birds counted.
# Install any missing packages, then attach every package in `packages`.
#
# packages: character vector of package names.
# Returns NULL invisibly; called for its side effects (installing/attaching).
#
# requireNamespace() is used for the "is it installed?" check instead of
# require(), so a missing package does not emit a warning before it is
# installed, and every package is attached through the same library() call
# (which raises an error if the installation did not succeed).
libraries <- function(packages) {
  for (package in packages) {
    # Install only when the package cannot be found
    if (!requireNamespace(package, quietly = TRUE)) {
      install.packages(package, dependencies = TRUE)
    }
    # Attach the package, whether it was just installed or already present
    library(package, character.only = TRUE)
  }
  invisible(NULL)
}
# Packages used by this analysis
packages <- c(
  "data.table", "tidyverse", "visdat",
  "zoo", "extrafont", "stringr"
)
libraries(packages)

# Use the classic ggplot2 theme for every plot that follows
theme_set(theme_classic())

# Read the bird counts from the TidyTuesday repository; strings become factors
df <- fread(
  input = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-18/bird_counts.csv",
  stringsAsFactors = TRUE
)

# Column names, types, and the first few values
glimpse(df)
## Observations: 18,706
## Variables: 6
## $ year <int> 1921, 1921, 1921, 1921, 1921, 1921, 1...
## $ species <fct> American Bittern, American Black Duck...
## $ species_latin <fct> Botaurus lentiginosus, Anas rubripes,...
## $ how_many_counted <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 12, 0, 0, ...
## $ total_hours <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8...
## $ how_many_counted_by_hour <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0....
# Preview the first rows of the data
head(df)
## year species species_latin how_many_counted
## 1: 1921 American Bittern Botaurus lentiginosus 0
## 2: 1921 American Black Duck Anas rubripes 0
## 3: 1921 American Coot Fulica americana 0
## 4: 1921 American Crow Corvus brachyrhynchos 0
## 5: 1921 American Goldfinch Spinus tristis 0
## 6: 1921 American Kestrel Falco sparverius 0
## total_hours how_many_counted_by_hour
## 1: 8 0
## 2: 8 0
## 3: 8 0
## 4: 8 0
## 5: 8 0
## 6: 8 0
# Summary statistics for every column (also reports the NA count per column)
summary(df)
## year species species_latin
## Min. :1921 American Bittern : 94 Acanthis flammea : 94
## 1st Qu.:1947 American Black Duck: 94 Acanthis hornemanni: 94
## Median :1970 American Coot : 94 Accipiter cooperii : 94
## Mean :1970 American Crow : 94 Accipiter gentilis : 94
## 3rd Qu.:1994 American Goldfinch : 94 Accipiter striatus : 94
## Max. :2017 American Kestrel : 94 Actitis macularius : 94
## (Other) :18142 (Other) :18142
## how_many_counted total_hours how_many_counted_by_hour
## Min. : 0.0 Min. : 8.0 Min. : 0.000
## 1st Qu.: 0.0 1st Qu.:149.5 1st Qu.: 0.000
## Median : 0.0 Median :171.0 Median : 0.000
## Mean : 193.5 Mean :170.8 Mean : 1.336
## 3rd Qu.: 5.0 3rd Qu.:203.8 3rd Qu.: 0.051
## Max. :73000.0 Max. :251.0 Max. :439.024
## NA's :3781 NA's :3781
# Number of distinct values in each column, smallest first.
# vapply() guarantees exactly one integer per column; sapply() would silently
# change its return type (e.g. to a list) on empty or unexpected input.
vapply(df, n_distinct, integer(1)) %>% sort()
## total_hours year species
## 62 94 199
## species_latin how_many_counted how_many_counted_by_hour
## 199 1163 3556
View missing values in more detail.
# Visualize where values are missing in each column of df
# (sort_miss=TRUE is passed to vis_miss to order the columns by missingness)
vis_miss(df, sort_miss=TRUE)
# Count the missing values in each column of `df`.
# Returns a named vector sorted from most to fewest missing values,
# containing only the columns that have at least one NA.
na_values <- function(df) {
  missing_counts <- sort(colSums(is.na(df)), decreasing = TRUE)
  missing_counts[missing_counts > 0]
}
# Missing-value counts per column before any cleaning
na_values(df)
## total_hours how_many_counted_by_hour
## 3781 3781
# Birds counted per hour of observation, drawn as one stacked bar per year
ggplot(df, aes(x = year, y = how_many_counted / total_hours)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Christmas Bird Counts Per Hour Over Time",
    x = "Year",
    y = "Count per Hour"
  )
# Birds counted against hours of observation; points are jittered and
# semi-transparent so overlapping observations stay visible
ggplot(df, aes(x = total_hours, y = how_many_counted)) +
  geom_jitter(size = 1, alpha = 0.3) +
  labs(
    title = "Total Christmas Bird Counts",
    x = "Hours",
    y = "Count"
  )
Remove/replace missing values and drop unnecessary columns
# Keep a row when birds were counted or the hours are known; this drops the
# rows that have both a zero count and missing total_hours
df <- df[df$how_many_counted != 0 | !is.na(df$total_hours), ]
# Re-check the remaining missing values
na_values(df)
## total_hours how_many_counted_by_hour
## 753 753
# Replace the remaining NA total_hours with linearly interpolated values,
# computed separately for each species.
# - x = year: interpolate along the survey year rather than the row position,
#   because rows were removed earlier and the remaining rows of a species are
#   no longer evenly spaced in time.
# - na.rm = FALSE: keep the result the same length as the group. With the
#   default (TRUE) a leading or trailing NA is dropped, which would make
#   mutate() fail on a length mismatch; here such values simply stay NA.
df <- df %>%
  group_by(species) %>%
  mutate(total_hours = na.approx(total_hours, x = year, na.rm = FALSE)) %>%
  ungroup()
# view missing data again
na_values(df)
## how_many_counted_by_hour
## 753
# Drop how_many_counted_by_hour; it can be recalculated later from
# how_many_counted and total_hours. The column is removed by name so the
# step does not depend on the column order of the input.
df <- df %>% select(-how_many_counted_by_hour)
# How many owls were seen: total count per owl species over all years.
df_owl <- df %>%
  filter(str_detect(species, "Owl")) %>%
  group_by(species) %>%
  summarize(counted = sum(how_many_counted)) %>%
  mutate(
    # Strip the trailing " Owl" (including the space before it) so the axis
    # labels carry no trailing whitespace; names not ending in "Owl" are
    # left untouched instead of losing their last three characters.
    species = str_remove(as.character(species), "\\s*Owl$"),
    # Order the species by total count so the bars are sorted in the plot
    species = fct_reorder(species, counted)
  )
# Horizontal bar chart of the total sightings per owl species, with the count
# printed next to each bar.
# NOTE(review): assumes the "Bodoni MT" font is available to the graphics
# device (e.g. registered through extrafont) — confirm on the target machine.
ggplot(df_owl, aes(x = species, y = counted)) +
  geom_col(fill = "steelblue") +
  # Negative hjust pushes the label just past the end of the bar
  geom_text(
    aes(label = counted),
    hjust = -0.1,
    vjust = 0.5,
    family = "Bodoni MT",
    fontface = "bold"
  ) +
  # Flip the axes so species names read horizontally
  coord_flip() +
  labs(
    title = "Christmas Owl Spottings",
    subtitle = "From 1921 - 2017",
    y = "Number Spotted"
  ) +
  theme(
    legend.position = "none",
    text = element_text(family = "Bodoni MT", size = 14),
    axis.title.y = element_blank(),
    plot.title = element_text(face = "bold", size = 24, hjust = 0.175),
    plot.subtitle = element_text(hjust = 0.325),
    plot.background = element_rect(fill = "#fff7e6"),
    panel.background = element_rect(fill = "#fff7e6")
  )
# No plot is passed, so ggsave writes the most recently displayed plot
ggsave("christmas_owls.png", limitsize = FALSE)
## Saving 7 x 5 in image