1 Project Description

Tidy Tuesday has a weekly data project aimed at the R ecosystem. An emphasis is placed on understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.

2 Dataset

Data came from The National UFO Reporting Center. The data includes UFO sighting information such as date/time it was observed, location, UFO shape, and a description of the event.

3 Setup

3.1 Load Libraries

if (!require("pacman")) install.packages("pacman")
pacman::p_load("data.table","tidyverse","visdat","lubridate","rworldmap","sp","rworldxtra","maps","countrycode","ggridges")

theme_set(theme_classic())

3.2 Import Data

df <- fread("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-25/ufo_sightings.csv")

4 Exploratory Data Analysis

There are 80,332 observations and 11 variables.
Most of the missing values are with state and country which could be looked up based on the coordinates.

4.1 View Data

glimpse(df)

## Observations: 80,332
## Variables: 11
## $ date_time                  <chr> "10/10/1949 20:30", "10/10/1949 21:...
## $ city_area                  <chr> "san marcos", "lackland afb", "ches...
## $ state                      <chr> "tx", "tx", NA, "tx", "hi", "tn", N...
## $ country                    <chr> "us", NA, "gb", "us", "us", "us", "...
## $ ufo_shape                  <chr> "cylinder", "light", "circle", "cir...
## $ encounter_length           <dbl> 2700, 7200, 20, 20, 900, 300, 180, ...
## $ described_encounter_length <chr> "45 minutes", "1-2 hrs", "20 second...
## $ description                <chr> "This event took place in early fal...
## $ date_documented            <chr> "4/27/2004", "12/16/2005", "1/21/20...
## $ latitude                   <dbl> 29.88306, 29.38421, 53.20000, 28.97...
## $ longitude                  <dbl> -97.941111, -98.581082, -2.916667, ...

head(df)

##           date_time            city_area state country ufo_shape
## 1: 10/10/1949 20:30           san marcos    tx      us  cylinder
## 2: 10/10/1949 21:00         lackland afb    tx    <NA>     light
## 3: 10/10/1955 17:00 chester (uk/england)  <NA>      gb    circle
## 4: 10/10/1956 21:00                 edna    tx      us    circle
## 5: 10/10/1960 20:00              kaneohe    hi      us     light
## 6: 10/10/1961 19:00              bristol    tn      us    sphere
##    encounter_length described_encounter_length
## 1:             2700                 45 minutes
## 2:             7200                    1-2 hrs
## 3:               20                 20 seconds
## 4:               20                   1/2 hour
## 5:              900                 15 minutes
## 6:              300                  5 minutes
##                                                                                                                                                   description
## 1:                    This event took place in early fall around 1949-50. It occurred after a Boy Scout meeting in the Baptist Church. The Baptist Church sit
## 2:                                                            1949 Lackland AFB&#44 TX.  Lights racing across the sky &amp; making 90 degree turns on a dime.
## 3:                                                                                                        Green/Orange circular disc over Chester&#44 England
## 4:                 My older brother and twin sister were leaving the only Edna theater at about 9 PM&#44...we had our bikes and I took a different route home
## 5: AS a Marine 1st Lt. flying an FJ4B fighter/attack aircraft on a solo night exercise&#44 I was at 50&#44000&#39 in a &quot;clean&quot; aircraft (no ordinan
## 6:                 My father is now 89 my brother 52 the girl with us now 51 myself 49 and the other fellow which worked with my father if he&#39s still livi
##    date_documented latitude   longitude
## 1:       4/27/2004 29.88306  -97.941111
## 2:      12/16/2005 29.38421  -98.581082
## 3:       1/21/2008 53.20000   -2.916667
## 4:       1/17/2004 28.97833  -96.645833
## 5:       1/22/2004 21.41806 -157.803611
## 6:       4/27/2007 36.59500  -82.188889

summary(df)

##   date_time          city_area            state          
##  Length:80332       Length:80332       Length:80332      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    country           ufo_shape         encounter_length  
##  Length:80332       Length:80332       Min.   :       0  
##  Class :character   Class :character   1st Qu.:      30  
##  Mode  :character   Mode  :character   Median :     180  
##                                        Mean   :    9017  
##                                        3rd Qu.:     600  
##                                        Max.   :97836000  
##                                        NA's   :3         
##  described_encounter_length description        date_documented   
##  Length:80332               Length:80332       Length:80332      
##  Class :character           Class :character   Class :character  
##  Mode  :character           Mode  :character   Mode  :character  
##                                                                  
##                                                                  
##                                                                  
##                                                                  
##     latitude        longitude      
##  Min.   :-82.86   Min.   :-176.66  
##  1st Qu.: 34.13   1st Qu.:-112.07  
##  Median : 39.41   Median : -87.90  
##  Mean   : 38.12   Mean   : -86.77  
##  3rd Qu.: 42.79   3rd Qu.: -78.75  
##  Max.   : 72.70   Max.   : 178.44  
##  NA's   :1

sapply(df, function(x) n_distinct(x)) %>% sort()

##                    country                  ufo_shape 
##                          6                         30 
##                      state            date_documented 
##                         68                        317 
##           encounter_length described_encounter_length 
##                        534                       8349 
##                   latitude                  longitude 
##                      18421                      19455 
##                  city_area                  date_time 
##                      19900                      69586 
##                description 
##                      79997

df %>% count(ufo_shape, sort=TRUE)

## # A tibble: 30 x 2
##    ufo_shape     n
##    <chr>     <int>
##  1 light     16565
##  2 triangle   7865
##  3 circle     7608
##  4 fireball   6208
##  5 other      5649
##  6 unknown    5584
##  7 sphere     5387
##  8 disk       5213
##  9 oval       3733
## 10 formation  2457
## # ... with 20 more rows

4.2 Missing Values

View missing values in more detail.

#Visualize missing values
vis_miss(df, sort_miss=TRUE)

#see count of missing values
na_values <- function(df){
  na <- colSums(is.na(df)) %>% sort(decreasing=TRUE)
  na[na>0]
}

na_values(df)

##          country            state        ufo_shape      description 
##             9670             5797             1932               15 
## encounter_length         latitude 
##                3                1

5 Data Wrangling

Remove/replace missing values and drop unnecessary columns

df$latitude[is.na(df$latitude)] <- 33.2001

# The single argument to this function, points, is a data.frame in which:
#   - column 1 contains the longitude in degrees
#   - column 2 contains the latitude in degrees
coords2country = function(points){  
  countriesSP <- getMap(resolution='high') # uses high res map from rworldxtra

  #setting CRS directly to that from rworldmap
  pointsSP = SpatialPoints(points, proj4string=CRS(proj4string(countriesSP)))  

  # use 'over' to get indices of the Polygons object containing each point 
  indices = over(pointsSP, countriesSP)

  # return the names of each country
  indices$ISO3
}

df$country <- coords2country(df[,c(11,10)])

#function to find if any string value matches, then make replacement 
find_string <- function(x, patterns, replacements=patterns, fill=NA, ...){
  stopifnot(length(patterns) == length(replacements))

  ans = rep_len(as.character(fill), length(x))    
  empty = seq_along(x)

  for(i in seq_along(patterns)) {
      greps = grepl(patterns[[i]], x[empty], ...)
      ans[empty[greps]] = replacements[[i]]  
      empty = empty[!greps]
  }
  return(ans)
}

#prepare to match on country names found in city_area column
country <- map("world")

country <- country$names %>% as_data_frame()
country$iso <- country$value %>% countrycode(origin="country.name",destination="iso3c")

country <- separate_rows(country, value, sep=":")
country <- country %>% filter(str_detect(value, "[:alpha:]+") & !is.na(iso)) %>% distinct()

df$country <- find_string(df$city_area, country$value, country$iso, df$country, ignore.case = TRUE)


#prepare to match on state abbreviations for countries in the USA
data(state.fips)
state.fips <- state.fips %>% select(abb) %>% distinct()
state.fips <- add_row(state.fips, abb = c("AK","HI"))
state.fips <- add_column(state.fips, country = "USA")

df$country <- find_string(df$state, state.fips$abb, state.fips$country, df$country, ignore.case = TRUE)

#sort out country values that are still NA
df[is.na(df$country),] %>% select(city_area) %>% distinct()

##                                                     city_area
##  1:                 pacific ocean (1500mi.sw of u.s.mainland)
##  2:                              split (in former yugoslavia)
##  3:                                            tyrrhenian sea
##  4:                           atlantic ocean (virgin islands)
##  5:                                   cam rahn bay (viet nam)
##  6:                     norman island (britsh virgin islands)
##  7:                                            atlantic ocean
##  8:                             atlantic ocean (mid-atlantic)
##  9:                                atlantic ocean (in flight)
## 10:                           atlantic ocean (on cruise ship)
## 11:                           cruz bay (u. s. virgin islands)
## 12:                                             pacific ocean
## 13:                          st.thomas (u. s. virgin islands)
## 14:                     playa del cura (grand canaria island)
## 15:                            caribbean sea (u.s. navy ship)
## 16:                                             caribbean sea
## 17: pacific ocean (in-flight; from tokyo airport to honolulu)
## 18:                      atlantic ocean (troopship/ uss rose)
## 19:                                  pacific ocean (no state)
## 20:                             mediterranean sea (in flight)
## 21:                               atlantic ocean (off africa)
## 22:                                         mediterranean sea
## 23:                                                     split
## 24:                                   atlantic ocean (at sea)
## 25:                                                    europe
## 26:                             midway island - pacific ocean
## 27:                                              persian gulf
## 28:                             caribbean sea (uss w.s. sims)
## 29:                      pacific ocean (approx. 250 mi. over)
## 30:                                   philippine sea (at sea)
## 31:                                    costa adeje (tenerife)
## 32:                           atlantic ocean (cv-67 u.s.navy)
## 33:                                         faliraki (rhodes)
## 34:                                          whitehouse beach
## 35:                                                 gibraltar
## 36:                                 atlantic ocean (inflight)
## 37:                                   atlantic ocean (middle)
##                                                     city_area

df <- df %>% 
  mutate(date_time = parse_date_time(date_time, 'mdy_hm'),
         country = as.character(country),
         country = case_when(is.na(country) & str_detect(city_area, "u\\.|uss|whitehouse") ~ "USA",
                             is.na(country) & str_detect(city_area, "viet nam") ~ "VNM", 
                             is.na(country) & str_detect(city_area, "britsh virgin islands") ~ "VGB",
                             is.na(country) & str_detect(city_area, "virgin") ~ "VIR",
                             is.na(country) & str_detect(city_area, "playa del cura|costa adeje|gibraltar") ~ "ESP",
                             is.na(country) & str_detect(city_area, "faliraki") ~ "GRC",
                             TRUE ~ country),
        ufo_shape = case_when(ufo_shape %in% "changed" ~ "changing",
                              ufo_shape %in% "round" ~ "circle",
                              ufo_shape %in% "flare" ~ "light",
                              ufo_shape %in% c("pyramid","delta") ~ "triangle",
                              is.na(ufo_shape) ~ "unknown",
                              TRUE ~ ufo_shape),
         country = replace_na(country, "Ocean"))

#view missing data again
na_values(df)

##            state      description encounter_length 
##             5797               15                3

df$continent <- df$country %>% countrycode(origin="iso3c",destination="continent")
df$continent[is.na(df$continent)] <- "Other"

6 Visualizations

df_country <- df %>% group_by(country) %>% summarize(count=n()) %>% arrange(count) %>% tail(10)
df_top_country <- df %>% filter(country %in% df_country$country)


df_country %>% mutate(country = fct_reorder(country, count)) %>% 
  ggplot(aes(country, log(count), fill=count))+
  geom_col()+
  coord_flip()+
  labs(title="Top Countries for UFO Sightings")

df_top_country %>% ggplot(aes(hour(date_time),country,fill=country))+
  geom_density_ridges()+
  scale_x_continuous(limits=c(0,24),breaks=c(0,4,8,12,16,20,24),labels=c("12am","4am","8am","noon","4pm","8pm","12am"))+
  labs(title="UFO Sightings by Time of Day",x="Time",y="Country")+
  theme(legend.position="none")

df %>% ggplot(aes(as.factor(month(date_time))))+
  geom_bar(fill="olivedrab3")+
  facet_wrap(~continent, scales="free_y")+
  labs(title="UFO Sightings by Month & Continent",subtitle="From November 1906 - April 2014",x="Month",y="Number of sightings")+
  scale_x_discrete(breaks=c(1,3,5,7,9,11),labels=c("Jan","Mar","May","Jul","Sep","Nov"))+
  theme(panel.background=element_rect(fill="gray16"),
        plot.title=element_text(hjust=0.5, size=18, face="bold"),
        plot.subtitle=element_text(hjust=0.5))

ggsave("ufo_sightings.png")

UFO Sightings

Catherine Williams

June 26, 2019

Tidy Tuesday