1 Project Description

Tidy Tuesday is a weekly data project aimed at the R ecosystem. It emphasizes understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.

2 Dataset

Data came from Bird Studies Canada. The data includes bird count information such as year, bird species, and number of birds counted.

3 Setup

3.1 Load Libraries

# Make sure every package in `packages` is installed and attached.
#
# Args:
#   packages: character vector of package names.
#
# Returns:
#   NULL, invisibly. Called for its side effects: missing packages are
#   installed, and every listed package is attached to the search path.
libraries <- function(packages) {
  for (package in packages) {
    # requireNamespace() only tests availability: unlike require(), it does
    # not attach the package and does not warn when the package is missing.
    if (!requireNamespace(package, quietly = TRUE)) {
      install.packages(package, dependencies = TRUE)
    }
    # library() stops with an error if the package still cannot be loaded,
    # so a failed install surfaces here rather than further down the script.
    library(package, character.only = TRUE)
  }
  invisible(NULL)
}

# Packages this analysis depends on
packages <- c(
  "data.table",
  "tidyverse",
  "visdat",
  "zoo",
  "extrafont",
  "stringr"
)

# Install whatever is missing, then attach everything
libraries(packages)

# Make the classic theme the default for every ggplot that follows
theme_set(theme_classic())

3.2 Import Data

# Read the bird-count CSV directly from the Tidy Tuesday repository;
# text columns are converted to factors on import
bird_counts_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-18/bird_counts.csv"
df <- fread(bird_counts_url, stringsAsFactors = TRUE)

4 Exploratory Data Analysis

  • There are 18,706 observations and 6 variables.
  • how_many_counted_by_hour is how_many_counted divided by total_hours.
  • total_hours contains many NA values, which in turn produce NAs in how_many_counted_by_hour.

4.1 View Data

# Compact overview: one line per column showing its type and first values
glimpse(df)
## Observations: 18,706
## Variables: 6
## $ year                     <int> 1921, 1921, 1921, 1921, 1921, 1921, 1...
## $ species                  <fct> American Bittern, American Black Duck...
## $ species_latin            <fct> Botaurus lentiginosus, Anas rubripes,...
## $ how_many_counted         <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 12, 0, 0, ...
## $ total_hours              <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8...
## $ how_many_counted_by_hour <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0....
# Print the first six rows
head(df)
##    year             species         species_latin how_many_counted
## 1: 1921    American Bittern Botaurus lentiginosus                0
## 2: 1921 American Black Duck         Anas rubripes                0
## 3: 1921       American Coot      Fulica americana                0
## 4: 1921       American Crow Corvus brachyrhynchos                0
## 5: 1921  American Goldfinch        Spinus tristis                0
## 6: 1921    American Kestrel      Falco sparverius                0
##    total_hours how_many_counted_by_hour
## 1:           8                        0
## 2:           8                        0
## 3:           8                        0
## 4:           8                        0
## 5:           8                        0
## 6:           8                        0
# Per-column summaries: quartiles and mean for numeric columns, level counts
# for factor columns, and the number of NAs where any exist
summary(df)
##       year                     species                  species_latin  
##  Min.   :1921   American Bittern   :   94   Acanthis flammea   :   94  
##  1st Qu.:1947   American Black Duck:   94   Acanthis hornemanni:   94  
##  Median :1970   American Coot      :   94   Accipiter cooperii :   94  
##  Mean   :1970   American Crow      :   94   Accipiter gentilis :   94  
##  3rd Qu.:1994   American Goldfinch :   94   Accipiter striatus :   94  
##  Max.   :2017   American Kestrel   :   94   Actitis macularius :   94  
##                 (Other)            :18142   (Other)            :18142  
##  how_many_counted   total_hours    how_many_counted_by_hour
##  Min.   :    0.0   Min.   :  8.0   Min.   :  0.000         
##  1st Qu.:    0.0   1st Qu.:149.5   1st Qu.:  0.000         
##  Median :    0.0   Median :171.0   Median :  0.000         
##  Mean   :  193.5   Mean   :170.8   Mean   :  1.336         
##  3rd Qu.:    5.0   3rd Qu.:203.8   3rd Qu.:  0.051         
##  Max.   :73000.0   Max.   :251.0   Max.   :439.024         
##                    NA's   :3781    NA's   :3781
# Number of distinct values in each column, smallest first.
# vapply() pins the result to exactly one integer per column, whereas
# sapply() silently changes its return type on unexpected input
# (for example, a table with no columns).
vapply(df, n_distinct, integer(1)) %>% sort()
##              total_hours                     year                  species 
##                       62                       94                      199 
##            species_latin         how_many_counted how_many_counted_by_hour 
##                      199                     1163                     3556

4.2 Missing Values

View missing values in more detail.

# Plot the missingness pattern, with the columns that have the most NAs first
df %>% vis_miss(sort_miss = TRUE)

# Count the missing values in each column of `df`.
# Returns a named numeric vector ordered from most to fewest NAs, restricted
# to the columns that contain at least one NA.
na_values <- function(df) {
  missing_per_column <- colSums(is.na(df))
  ranked <- sort(missing_per_column, decreasing = TRUE)
  ranked[ranked > 0]
}

# NA counts per column for the data as imported
na_values(df)
##              total_hours how_many_counted_by_hour 
##                     3781                     3781

5 Data Wrangling

Remove/replace missing values and drop unnecessary columns

# Keep a row if any birds were counted or if its total_hours is known;
# equivalently, drop rows that have both a zero count and missing hours
df <- df[df$how_many_counted != 0 | !is.na(df$total_hours), ]

# Re-check the per-column NA counts now that zero-count rows without
# total_hours have been removed
na_values(df)
##              total_hours how_many_counted_by_hour 
##                      753                      753
# Fill the remaining NA total_hours by linear interpolation within each species.
# x = year interpolates along the calendar instead of along row position: rows
# were dropped in the previous step, so neighbouring rows of a species are not
# evenly spaced in time.
# na.rm = FALSE keeps the result the same length as the group, which mutate()
# requires; leading or trailing NAs, which cannot be interpolated, stay NA.
df <- df %>%
  group_by(species) %>%
  mutate(total_hours = na.approx(total_hours, x = year, na.rm = FALSE)) %>%
  ungroup()

# Re-check the per-column NA counts after interpolating total_hours
na_values(df)
## how_many_counted_by_hour 
##                      753
# Drop how_many_counted_by_hour; it can be recalculated later as
# how_many_counted / total_hours. Removing it by name rather than by position
# keeps this step correct if the column order ever changes.
df <- df %>% select(-how_many_counted_by_hour)

6 Final Visualization

# Total number of each owl species counted across all years
df_owl <- df %>%
  filter(str_detect(species, "Owl")) %>%
  group_by(species) %>%
  summarize(counted = sum(how_many_counted)) %>%
  mutate(
    # Strip the trailing "Owl" together with the space or hyphen in front of
    # it. Cutting a fixed three characters with str_sub(end = -4) left a
    # dangling " " or "-" at the end of every label.
    species = str_remove(as.character(species), "[ -]Owl$"),
    # Order the bars by total count
    species = fct_reorder(species, counted)
  )

# Horizontal bar chart with the count printed just past the end of each bar
df_owl %>%
  ggplot(aes(species, counted)) +
  geom_col(fill = "steelblue") +
  geom_text(
    aes(label = counted),
    hjust = -0.1,
    vjust = 0.5,
    family = "Bodoni MT",
    fontface = "bold"
  ) +
  coord_flip() +
  labs(
    title = "Christmas Owl Spottings",
    subtitle = "From 1921 - 2017",
    y = "Number Spotted"
  ) +
  theme(
    legend.position = "none",
    text = element_text(family = "Bodoni MT", size = 14),
    axis.title.y = element_blank(),
    plot.title = element_text(face = "bold", size = 24, hjust = 0.175),
    plot.subtitle = element_text(hjust = 0.325),
    plot.background = element_rect(fill = "#fff7e6"),
    panel.background = element_rect(fill = "#fff7e6")
  )

# Write the chart to christmas_owls.png. No plot object is passed, so ggsave()
# falls back on its default plot (presumably the one drawn just above), and no
# width/height is given, so the size comes from the current graphics device.
ggsave("christmas_owls.png", limitsize=FALSE)
## Saving 7 x 5 in image