1 Project Description

Tidy Tuesday has a weekly data project aimed at the R ecosystem. An emphasis is placed on understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.

2 Dataset

Data came from The National UFO Reporting Center. The data includes UFO sighting information such as date/time it was observed, location, UFO shape, and a description of the event.

3 Setup

3.1 Load Libraries

if (!require("pacman")) install.packages("pacman")
pacman::p_load("data.table","tidyverse","visdat","lubridate","rworldmap","sp","rworldxtra","maps","countrycode","ggridges")

theme_set(theme_classic())

3.2 Import Data

df <- fread("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-25/ufo_sightings.csv")

4 Exploratory Data Analysis

  • There are 80,332 observations and 11 variables.
  • Most of the missing values are with state and country which could be looked up based on the coordinates.

4.1 View Data

glimpse(df)
## Observations: 80,332
## Variables: 11
## $ date_time                  <chr> "10/10/1949 20:30", "10/10/1949 21:...
## $ city_area                  <chr> "san marcos", "lackland afb", "ches...
## $ state                      <chr> "tx", "tx", NA, "tx", "hi", "tn", N...
## $ country                    <chr> "us", NA, "gb", "us", "us", "us", "...
## $ ufo_shape                  <chr> "cylinder", "light", "circle", "cir...
## $ encounter_length           <dbl> 2700, 7200, 20, 20, 900, 300, 180, ...
## $ described_encounter_length <chr> "45 minutes", "1-2 hrs", "20 second...
## $ description                <chr> "This event took place in early fal...
## $ date_documented            <chr> "4/27/2004", "12/16/2005", "1/21/20...
## $ latitude                   <dbl> 29.88306, 29.38421, 53.20000, 28.97...
## $ longitude                  <dbl> -97.941111, -98.581082, -2.916667, ...
head(df)
##           date_time            city_area state country ufo_shape
## 1: 10/10/1949 20:30           san marcos    tx      us  cylinder
## 2: 10/10/1949 21:00         lackland afb    tx    <NA>     light
## 3: 10/10/1955 17:00 chester (uk/england)  <NA>      gb    circle
## 4: 10/10/1956 21:00                 edna    tx      us    circle
## 5: 10/10/1960 20:00              kaneohe    hi      us     light
## 6: 10/10/1961 19:00              bristol    tn      us    sphere
##    encounter_length described_encounter_length
## 1:             2700                 45 minutes
## 2:             7200                    1-2 hrs
## 3:               20                 20 seconds
## 4:               20                   1/2 hour
## 5:              900                 15 minutes
## 6:              300                  5 minutes
##                                                                                                                                                   description
## 1:                    This event took place in early fall around 1949-50. It occurred after a Boy Scout meeting in the Baptist Church. The Baptist Church sit
## 2:                                                            1949 Lackland AFB&#44 TX.  Lights racing across the sky &amp; making 90 degree turns on a dime.
## 3:                                                                                                        Green/Orange circular disc over Chester&#44 England
## 4:                 My older brother and twin sister were leaving the only Edna theater at about 9 PM&#44...we had our bikes and I took a different route home
## 5: AS a Marine 1st Lt. flying an FJ4B fighter/attack aircraft on a solo night exercise&#44 I was at 50&#44000&#39 in a &quot;clean&quot; aircraft (no ordinan
## 6:                 My father is now 89 my brother 52 the girl with us now 51 myself 49 and the other fellow which worked with my father if he&#39s still livi
##    date_documented latitude   longitude
## 1:       4/27/2004 29.88306  -97.941111
## 2:      12/16/2005 29.38421  -98.581082
## 3:       1/21/2008 53.20000   -2.916667
## 4:       1/17/2004 28.97833  -96.645833
## 5:       1/22/2004 21.41806 -157.803611
## 6:       4/27/2007 36.59500  -82.188889
summary(df)
##   date_time          city_area            state          
##  Length:80332       Length:80332       Length:80332      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    country           ufo_shape         encounter_length  
##  Length:80332       Length:80332       Min.   :       0  
##  Class :character   Class :character   1st Qu.:      30  
##  Mode  :character   Mode  :character   Median :     180  
##                                        Mean   :    9017  
##                                        3rd Qu.:     600  
##                                        Max.   :97836000  
##                                        NA's   :3         
##  described_encounter_length description        date_documented   
##  Length:80332               Length:80332       Length:80332      
##  Class :character           Class :character   Class :character  
##  Mode  :character           Mode  :character   Mode  :character  
##                                                                  
##                                                                  
##                                                                  
##                                                                  
##     latitude        longitude      
##  Min.   :-82.86   Min.   :-176.66  
##  1st Qu.: 34.13   1st Qu.:-112.07  
##  Median : 39.41   Median : -87.90  
##  Mean   : 38.12   Mean   : -86.77  
##  3rd Qu.: 42.79   3rd Qu.: -78.75  
##  Max.   : 72.70   Max.   : 178.44  
##  NA's   :1
sapply(df, function(x) n_distinct(x)) %>% sort()
##                    country                  ufo_shape 
##                          6                         30 
##                      state            date_documented 
##                         68                        317 
##           encounter_length described_encounter_length 
##                        534                       8349 
##                   latitude                  longitude 
##                      18421                      19455 
##                  city_area                  date_time 
##                      19900                      69586 
##                description 
##                      79997
df %>% count(ufo_shape, sort=TRUE)
## # A tibble: 30 x 2
##    ufo_shape     n
##    <chr>     <int>
##  1 light     16565
##  2 triangle   7865
##  3 circle     7608
##  4 fireball   6208
##  5 other      5649
##  6 unknown    5584
##  7 sphere     5387
##  8 disk       5213
##  9 oval       3733
## 10 formation  2457
## # ... with 20 more rows

4.2 Missing Values

View missing values in more detail.

#Visualize missing values
vis_miss(df, sort_miss=TRUE)

#see count of missing values
na_values <- function(df){
  na <- colSums(is.na(df)) %>% sort(decreasing=TRUE)
  na[na>0]
}

na_values(df)
##          country            state        ufo_shape      description 
##             9670             5797             1932               15 
## encounter_length         latitude 
##                3                1

5 Data Wrangling

Remove/replace missing values and drop unnecessary columns

df$latitude[is.na(df$latitude)] <- 33.2001

# The single argument to this function, points, is a data.frame in which:
#   - column 1 contains the longitude in degrees
#   - column 2 contains the latitude in degrees
coords2country = function(points){  
  countriesSP <- getMap(resolution='high') # uses high res map from rworldxtra

  #setting CRS directly to that from rworldmap
  pointsSP = SpatialPoints(points, proj4string=CRS(proj4string(countriesSP)))  

  # use 'over' to get indices of the Polygons object containing each point 
  indices = over(pointsSP, countriesSP)

  # return the names of each country
  indices$ISO3
}

df$country <- coords2country(df[,c(11,10)])

#function to find if any string value matches, then make replacement 
find_string <- function(x, patterns, replacements=patterns, fill=NA, ...){
  stopifnot(length(patterns) == length(replacements))

  ans = rep_len(as.character(fill), length(x))    
  empty = seq_along(x)

  for(i in seq_along(patterns)) {
      greps = grepl(patterns[[i]], x[empty], ...)
      ans[empty[greps]] = replacements[[i]]  
      empty = empty[!greps]
  }
  return(ans)
}

#prepare to match on country names found in city_area column
country <- map("world")

country <- country$names %>% as_data_frame()
country$iso <- country$value %>% countrycode(origin="country.name",destination="iso3c")

country <- separate_rows(country, value, sep=":")
country <- country %>% filter(str_detect(value, "[:alpha:]+") & !is.na(iso)) %>% distinct()

df$country <- find_string(df$city_area, country$value, country$iso, df$country, ignore.case = TRUE)


#prepare to match on state abbreviations for countries in the USA
data(state.fips)
state.fips <- state.fips %>% select(abb) %>% distinct()
state.fips <- add_row(state.fips, abb = c("AK","HI"))
state.fips <- add_column(state.fips, country = "USA")

df$country <- find_string(df$state, state.fips$abb, state.fips$country, df$country, ignore.case = TRUE)

#sort out country values that are still NA
df[is.na(df$country),] %>% select(city_area) %>% distinct()
##                                                     city_area
##  1:                 pacific ocean (1500mi.sw of u.s.mainland)
##  2:                              split (in former yugoslavia)
##  3:                                            tyrrhenian sea
##  4:                           atlantic ocean (virgin islands)
##  5:                                   cam rahn bay (viet nam)
##  6:                     norman island (britsh virgin islands)
##  7:                                            atlantic ocean
##  8:                             atlantic ocean (mid-atlantic)
##  9:                                atlantic ocean (in flight)
## 10:                           atlantic ocean (on cruise ship)
## 11:                           cruz bay (u. s. virgin islands)
## 12:                                             pacific ocean
## 13:                          st.thomas (u. s. virgin islands)
## 14:                     playa del cura (grand canaria island)
## 15:                            caribbean sea (u.s. navy ship)
## 16:                                             caribbean sea
## 17: pacific ocean (in-flight; from tokyo airport to honolulu)
## 18:                      atlantic ocean (troopship/ uss rose)
## 19:                                  pacific ocean (no state)
## 20:                             mediterranean sea (in flight)
## 21:                               atlantic ocean (off africa)
## 22:                                         mediterranean sea
## 23:                                                     split
## 24:                                   atlantic ocean (at sea)
## 25:                                                    europe
## 26:                             midway island - pacific ocean
## 27:                                              persian gulf
## 28:                             caribbean sea (uss w.s. sims)
## 29:                      pacific ocean (approx. 250 mi. over)
## 30:                                   philippine sea (at sea)
## 31:                                    costa adeje (tenerife)
## 32:                           atlantic ocean (cv-67 u.s.navy)
## 33:                                         faliraki (rhodes)
## 34:                                          whitehouse beach
## 35:                                                 gibraltar
## 36:                                 atlantic ocean (inflight)
## 37:                                   atlantic ocean (middle)
##                                                     city_area
df <- df %>% 
  mutate(date_time = parse_date_time(date_time, 'mdy_hm'),
         country = as.character(country),
         country = case_when(is.na(country) & str_detect(city_area, "u\\.|uss|whitehouse") ~ "USA",
                             is.na(country) & str_detect(city_area, "viet nam") ~ "VNM", 
                             is.na(country) & str_detect(city_area, "britsh virgin islands") ~ "VGB",
                             is.na(country) & str_detect(city_area, "virgin") ~ "VIR",
                             is.na(country) & str_detect(city_area, "playa del cura|costa adeje|gibraltar") ~ "ESP",
                             is.na(country) & str_detect(city_area, "faliraki") ~ "GRC",
                             TRUE ~ country),
        ufo_shape = case_when(ufo_shape %in% "changed" ~ "changing",
                              ufo_shape %in% "round" ~ "circle",
                              ufo_shape %in% "flare" ~ "light",
                              ufo_shape %in% c("pyramid","delta") ~ "triangle",
                              is.na(ufo_shape) ~ "unknown",
                              TRUE ~ ufo_shape),
         country = replace_na(country, "Ocean"))

#view missing data again
na_values(df)
##            state      description encounter_length 
##             5797               15                3
df$continent <- df$country %>% countrycode(origin="iso3c",destination="continent")
df$continent[is.na(df$continent)] <- "Other"

6 Visualizations

df_country <- df %>% group_by(country) %>% summarize(count=n()) %>% arrange(count) %>% tail(10)
df_top_country <- df %>% filter(country %in% df_country$country)


df_country %>% mutate(country = fct_reorder(country, count)) %>% 
  ggplot(aes(country, log(count), fill=count))+
  geom_col()+
  coord_flip()+
  labs(title="Top Countries for UFO Sightings")

df_top_country %>% ggplot(aes(hour(date_time),country,fill=country))+
  geom_density_ridges()+
  scale_x_continuous(limits=c(0,24),breaks=c(0,4,8,12,16,20,24),labels=c("12am","4am","8am","noon","4pm","8pm","12am"))+
  labs(title="UFO Sightings by Time of Day",x="Time",y="Country")+
  theme(legend.position="none")

df %>% ggplot(aes(as.factor(month(date_time))))+
  geom_bar(fill="olivedrab3")+
  facet_wrap(~continent, scales="free_y")+
  labs(title="UFO Sightings by Month & Continent",subtitle="From November 1906 - April 2014",x="Month",y="Number of sightings")+
  scale_x_discrete(breaks=c(1,3,5,7,9,11),labels=c("Jan","Mar","May","Jul","Sep","Nov"))+
  theme(panel.background=element_rect(fill="gray16"),
        plot.title=element_text(hjust=0.5, size=18, face="bold"),
        plot.subtitle=element_text(hjust=0.5))

ggsave("ufo_sightings.png")