Tidy Tuesday has a weekly data project aimed at the R ecosystem. An emphasis is placed on understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.
Data came from The National UFO Reporting Center. The data includes UFO sighting information such as date/time it was observed, location, UFO shape, and a description of the event.
if (!require("pacman")) install.packages("pacman")
pacman::p_load("data.table","tidyverse","visdat","lubridate","rworldmap","sp","rworldxtra","maps","countrycode","ggridges")
theme_set(theme_classic())
df <- fread("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-25/ufo_sightings.csv")
glimpse(df)
## Observations: 80,332
## Variables: 11
## $ date_time <chr> "10/10/1949 20:30", "10/10/1949 21:...
## $ city_area <chr> "san marcos", "lackland afb", "ches...
## $ state <chr> "tx", "tx", NA, "tx", "hi", "tn", N...
## $ country <chr> "us", NA, "gb", "us", "us", "us", "...
## $ ufo_shape <chr> "cylinder", "light", "circle", "cir...
## $ encounter_length <dbl> 2700, 7200, 20, 20, 900, 300, 180, ...
## $ described_encounter_length <chr> "45 minutes", "1-2 hrs", "20 second...
## $ description <chr> "This event took place in early fal...
## $ date_documented <chr> "4/27/2004", "12/16/2005", "1/21/20...
## $ latitude <dbl> 29.88306, 29.38421, 53.20000, 28.97...
## $ longitude <dbl> -97.941111, -98.581082, -2.916667, ...
head(df)
## date_time city_area state country ufo_shape
## 1: 10/10/1949 20:30 san marcos tx us cylinder
## 2: 10/10/1949 21:00 lackland afb tx <NA> light
## 3: 10/10/1955 17:00 chester (uk/england) <NA> gb circle
## 4: 10/10/1956 21:00 edna tx us circle
## 5: 10/10/1960 20:00 kaneohe hi us light
## 6: 10/10/1961 19:00 bristol tn us sphere
## encounter_length described_encounter_length
## 1: 2700 45 minutes
## 2: 7200 1-2 hrs
## 3: 20 20 seconds
## 4: 20 1/2 hour
## 5: 900 15 minutes
## 6: 300 5 minutes
## description
## 1: This event took place in early fall around 1949-50. It occurred after a Boy Scout meeting in the Baptist Church. The Baptist Church sit
## 2: 1949 Lackland AFB, TX. Lights racing across the sky & making 90 degree turns on a dime.
## 3: Green/Orange circular disc over Chester, England
## 4: My older brother and twin sister were leaving the only Edna theater at about 9 PM,...we had our bikes and I took a different route home
## 5: AS a Marine 1st Lt. flying an FJ4B fighter/attack aircraft on a solo night exercise, I was at 50ꯠ' in a "clean" aircraft (no ordinan
## 6: My father is now 89 my brother 52 the girl with us now 51 myself 49 and the other fellow which worked with my father if he's still livi
## date_documented latitude longitude
## 1: 4/27/2004 29.88306 -97.941111
## 2: 12/16/2005 29.38421 -98.581082
## 3: 1/21/2008 53.20000 -2.916667
## 4: 1/17/2004 28.97833 -96.645833
## 5: 1/22/2004 21.41806 -157.803611
## 6: 4/27/2007 36.59500 -82.188889
summary(df)
## date_time city_area state
## Length:80332 Length:80332 Length:80332
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## country ufo_shape encounter_length
## Length:80332 Length:80332 Min. : 0
## Class :character Class :character 1st Qu.: 30
## Mode :character Mode :character Median : 180
## Mean : 9017
## 3rd Qu.: 600
## Max. :97836000
## NA's :3
## described_encounter_length description date_documented
## Length:80332 Length:80332 Length:80332
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## latitude longitude
## Min. :-82.86 Min. :-176.66
## 1st Qu.: 34.13 1st Qu.:-112.07
## Median : 39.41 Median : -87.90
## Mean : 38.12 Mean : -86.77
## 3rd Qu.: 42.79 3rd Qu.: -78.75
## Max. : 72.70 Max. : 178.44
## NA's :1
sapply(df, function(x) n_distinct(x)) %>% sort()
## country ufo_shape
## 6 30
## state date_documented
## 68 317
## encounter_length described_encounter_length
## 534 8349
## latitude longitude
## 18421 19455
## city_area date_time
## 19900 69586
## description
## 79997
df %>% count(ufo_shape, sort=TRUE)
## # A tibble: 30 x 2
## ufo_shape n
## <chr> <int>
## 1 light 16565
## 2 triangle 7865
## 3 circle 7608
## 4 fireball 6208
## 5 other 5649
## 6 unknown 5584
## 7 sphere 5387
## 8 disk 5213
## 9 oval 3733
## 10 formation 2457
## # ... with 20 more rows
View missing values in more detail.
#Visualize missing values
vis_miss(df, sort_miss=TRUE)
#see count of missing values
na_values <- function(df){
na <- colSums(is.na(df)) %>% sort(decreasing=TRUE)
na[na>0]
}
na_values(df)
## country state ufo_shape description
## 9670 5797 1932 15
## encounter_length latitude
## 3 1
Remove/replace missing values and drop unnecessary columns
df$latitude[is.na(df$latitude)] <- 33.2001
# The single argument to this function, points, is a data.frame in which:
# - column 1 contains the longitude in degrees
# - column 2 contains the latitude in degrees
coords2country = function(points){
countriesSP <- getMap(resolution='high') # uses high res map from rworldxtra
#setting CRS directly to that from rworldmap
pointsSP = SpatialPoints(points, proj4string=CRS(proj4string(countriesSP)))
# use 'over' to get indices of the Polygons object containing each point
indices = over(pointsSP, countriesSP)
# return the names of each country
indices$ISO3
}
df$country <- coords2country(df[,c(11,10)])
#function to find if any string value matches, then make replacement
find_string <- function(x, patterns, replacements=patterns, fill=NA, ...){
stopifnot(length(patterns) == length(replacements))
ans = rep_len(as.character(fill), length(x))
empty = seq_along(x)
for(i in seq_along(patterns)) {
greps = grepl(patterns[[i]], x[empty], ...)
ans[empty[greps]] = replacements[[i]]
empty = empty[!greps]
}
return(ans)
}
#prepare to match on country names found in city_area column
country <- map("world")
country <- country$names %>% as_data_frame()
country$iso <- country$value %>% countrycode(origin="country.name",destination="iso3c")
country <- separate_rows(country, value, sep=":")
country <- country %>% filter(str_detect(value, "[:alpha:]+") & !is.na(iso)) %>% distinct()
df$country <- find_string(df$city_area, country$value, country$iso, df$country, ignore.case = TRUE)
#prepare to match on state abbreviations for countries in the USA
data(state.fips)
state.fips <- state.fips %>% select(abb) %>% distinct()
state.fips <- add_row(state.fips, abb = c("AK","HI"))
state.fips <- add_column(state.fips, country = "USA")
df$country <- find_string(df$state, state.fips$abb, state.fips$country, df$country, ignore.case = TRUE)
#sort out country values that are still NA
df[is.na(df$country),] %>% select(city_area) %>% distinct()
## city_area
## 1: pacific ocean (1500mi.sw of u.s.mainland)
## 2: split (in former yugoslavia)
## 3: tyrrhenian sea
## 4: atlantic ocean (virgin islands)
## 5: cam rahn bay (viet nam)
## 6: norman island (britsh virgin islands)
## 7: atlantic ocean
## 8: atlantic ocean (mid-atlantic)
## 9: atlantic ocean (in flight)
## 10: atlantic ocean (on cruise ship)
## 11: cruz bay (u. s. virgin islands)
## 12: pacific ocean
## 13: st.thomas (u. s. virgin islands)
## 14: playa del cura (grand canaria island)
## 15: caribbean sea (u.s. navy ship)
## 16: caribbean sea
## 17: pacific ocean (in-flight; from tokyo airport to honolulu)
## 18: atlantic ocean (troopship/ uss rose)
## 19: pacific ocean (no state)
## 20: mediterranean sea (in flight)
## 21: atlantic ocean (off africa)
## 22: mediterranean sea
## 23: split
## 24: atlantic ocean (at sea)
## 25: europe
## 26: midway island - pacific ocean
## 27: persian gulf
## 28: caribbean sea (uss w.s. sims)
## 29: pacific ocean (approx. 250 mi. over)
## 30: philippine sea (at sea)
## 31: costa adeje (tenerife)
## 32: atlantic ocean (cv-67 u.s.navy)
## 33: faliraki (rhodes)
## 34: whitehouse beach
## 35: gibraltar
## 36: atlantic ocean (inflight)
## 37: atlantic ocean (middle)
## city_area
df <- df %>%
mutate(date_time = parse_date_time(date_time, 'mdy_hm'),
country = as.character(country),
country = case_when(is.na(country) & str_detect(city_area, "u\\.|uss|whitehouse") ~ "USA",
is.na(country) & str_detect(city_area, "viet nam") ~ "VNM",
is.na(country) & str_detect(city_area, "britsh virgin islands") ~ "VGB",
is.na(country) & str_detect(city_area, "virgin") ~ "VIR",
is.na(country) & str_detect(city_area, "playa del cura|costa adeje|gibraltar") ~ "ESP",
is.na(country) & str_detect(city_area, "faliraki") ~ "GRC",
TRUE ~ country),
ufo_shape = case_when(ufo_shape %in% "changed" ~ "changing",
ufo_shape %in% "round" ~ "circle",
ufo_shape %in% "flare" ~ "light",
ufo_shape %in% c("pyramid","delta") ~ "triangle",
is.na(ufo_shape) ~ "unknown",
TRUE ~ ufo_shape),
country = replace_na(country, "Ocean"))
#view missing data again
na_values(df)
## state description encounter_length
## 5797 15 3
df$continent <- df$country %>% countrycode(origin="iso3c",destination="continent")
df$continent[is.na(df$continent)] <- "Other"
df_country <- df %>% group_by(country) %>% summarize(count=n()) %>% arrange(count) %>% tail(10)
df_top_country <- df %>% filter(country %in% df_country$country)
df_country %>% mutate(country = fct_reorder(country, count)) %>%
ggplot(aes(country, log(count), fill=count))+
geom_col()+
coord_flip()+
labs(title="Top Countries for UFO Sightings")
df_top_country %>% ggplot(aes(hour(date_time),country,fill=country))+
geom_density_ridges()+
scale_x_continuous(limits=c(0,24),breaks=c(0,4,8,12,16,20,24),labels=c("12am","4am","8am","noon","4pm","8pm","12am"))+
labs(title="UFO Sightings by Time of Day",x="Time",y="Country")+
theme(legend.position="none")
df %>% ggplot(aes(as.factor(month(date_time))))+
geom_bar(fill="olivedrab3")+
facet_wrap(~continent, scales="free_y")+
labs(title="UFO Sightings by Month & Continent",subtitle="From November 1906 - April 2014",x="Month",y="Number of sightings")+
scale_x_discrete(breaks=c(1,3,5,7,9,11),labels=c("Jan","Mar","May","Jul","Sep","Nov"))+
theme(panel.background=element_rect(fill="gray16"),
plot.title=element_text(hjust=0.5, size=18, face="bold"),
plot.subtitle=element_text(hjust=0.5))
ggsave("ufo_sightings.png")