Tidy Tuesday has a weekly data project aimed at the R ecosystem. An emphasis is placed on understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.
Data came from FAA. The data consists of voluntarily reported airplane wildlife strikes since 1990. The data was pre-filtered to include only the major airlines in the USA: American Airlines, Delta, Southwest, and United. The information obtained may include date, time, location, animal species, type of damage, weather conditions, airplane information, and repair costs.
if (!require("ggradar")) devtools::install_github("ricardo-bion/ggradar", dependencies = TRUE)
if (!require("pacman")) install.packages("pacman")
pacman::p_load("data.table","tidyverse","visdat","lubridate","modeest","ggradar","scales")
theme_set(theme_classic())
df <- fread("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-23/wildlife_impacts.csv", stringsAsFactors=TRUE)
#remove duplicate rows
df <- df %>% distinct()
glimpse(df)
## Observations: 56,449
## Variables: 21
## $ incident_date <fct> 2018-12-31T00:00:00Z, 2018-12-29T00:00:0...
## $ state <fct> FL, IN, N/A, N/A, N/A, FL, FL, N/A, N/A,...
## $ airport_id <fct> KMIA, KIND, ZZZZ, ZZZZ, ZZZZ, KMIA, KMCO...
## $ airport <fct> MIAMI INTL, INDIANAPOLIS INTL ARPT, UNKN...
## $ operator <fct> AMERICAN AIRLINES, AMERICAN AIRLINES, AM...
## $ atype <fct> B-737-800, B-737-800, UNKNOWN, B-737-900...
## $ type_eng <fct> D, D, NA, D, D, D, D, D, D, D, D, D, D, ...
## $ species_id <fct> UNKBL, R, R2004, N5205, J2139, UNKB, UNK...
## $ species <fct> Unknown bird - large, Owls, Short-eared ...
## $ damage <fct> M?, N, NA, M?, M?, N, N, N, N, N, N, N, ...
## $ num_engs <int> 2, 2, NA, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ incident_month <int> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, ...
## $ incident_year <int> 2018, 2018, 2018, 2018, 2018, 2018, 2018...
## $ time_of_day <fct> Day, Night, NA, NA, NA, Day, Night, NA, ...
## $ time <int> 1207, 2355, NA, NA, NA, 955, 948, NA, NA...
## $ height <int> 700, 0, NA, NA, NA, NA, 600, NA, NA, 0, ...
## $ speed <int> 200, NA, NA, NA, NA, NA, 145, NA, NA, 13...
## $ phase_of_flt <fct> Climb, Landing Roll, NA, NA, NA, Approac...
## $ sky <fct> Some Cloud, NA, NA, NA, NA, NA, Some Clo...
## $ precip <fct> None, NA, NA, NA, NA, NA, None, NA, NA, ...
## $ cost_repairs_infl_adj <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
head(df)
## incident_date state airport_id airport
## 1: 2018-12-31T00:00:00Z FL KMIA MIAMI INTL
## 2: 2018-12-29T00:00:00Z IN KIND INDIANAPOLIS INTL ARPT
## 3: 2018-12-29T00:00:00Z N/A ZZZZ UNKNOWN
## 4: 2018-12-27T00:00:00Z N/A ZZZZ UNKNOWN
## 5: 2018-12-27T00:00:00Z N/A ZZZZ UNKNOWN
## 6: 2018-12-27T00:00:00Z FL KMIA MIAMI INTL
## operator atype type_eng species_id species
## 1: AMERICAN AIRLINES B-737-800 D UNKBL Unknown bird - large
## 2: AMERICAN AIRLINES B-737-800 D R Owls
## 3: AMERICAN AIRLINES UNKNOWN <NA> R2004 Short-eared owl
## 4: AMERICAN AIRLINES B-737-900 D N5205 Southern lapwing
## 5: AMERICAN AIRLINES B-737-800 D J2139 Lesser scaup
## 6: AMERICAN AIRLINES A-319 D UNKB Unknown bird
## damage num_engs incident_month incident_year time_of_day time height
## 1: M? 2 12 2018 Day 1207 700
## 2: N 2 12 2018 Night 2355 0
## 3: <NA> NA 12 2018 <NA> NA NA
## 4: M? 2 12 2018 <NA> NA NA
## 5: M? 2 12 2018 <NA> NA NA
## 6: N 2 12 2018 Day 955 NA
## speed phase_of_flt sky precip cost_repairs_infl_adj
## 1: 200 Climb Some Cloud None NA
## 2: NA Landing Roll <NA> <NA> NA
## 3: NA <NA> <NA> <NA> NA
## 4: NA <NA> <NA> <NA> NA
## 5: NA <NA> <NA> <NA> NA
## 6: NA Approach <NA> <NA> NA
df %>% mutate_if(is.character, as.factor) %>% summary()
## incident_date state airport_id
## 2018-10-12T00:00:00Z: 56 N/A :12168 ZZZZ :10543
## 2018-10-21T00:00:00Z: 39 TX : 7143 KDFW : 2453
## 2018-09-29T00:00:00Z: 38 CA : 5779 KDEN : 2243
## 2018-10-11T00:00:00Z: 38 FL : 3685 KORD : 1963
## 2018-10-30T00:00:00Z: 38 IL : 2739 KSMF : 1651
## 2017-10-16T00:00:00Z: 36 CO : 2435 KMCO : 1066
## (Other) :56204 (Other):22500 (Other):36530
## airport operator
## UNKNOWN :10543 AMERICAN AIRLINES :14874
## DALLAS/FORT WORTH INTL ARPT: 2453 DELTA AIR LINES : 9000
## DENVER INTL AIRPORT : 2243 SOUTHWEST AIRLINES:17915
## CHICAGO O'HARE INTL ARPT : 1963 UNITED AIRLINES :14660
## SACRAMENTO INTL : 1651
## ORLANDO INTL : 1066
## (Other) :36530
## atype type_eng species_id
## B-737-700: 9911 A : 2 UNKBM :14943
## B-737-300: 7526 C : 34 UNKBS :14723
## B-737-800: 5225 D :56178 UNKB : 3939
## B-757-200: 4148 F : 3 NE1 : 1504
## A-320 : 3685 NA's: 232 O2205 : 1339
## A-319 : 3018 YI005 : 1170
## (Other) :22936 (Other):18831
## species damage num_engs
## Unknown bird - medium:14943 M : 1888 Min. :1.000
## Unknown bird - small :14723 M? : 1085 1st Qu.:2.000
## Unknown bird : 3939 N :48133 Median :2.000
## Gulls : 1504 S : 1026 Mean :2.059
## Mourning dove : 1339 NA's: 4317 3rd Qu.:2.000
## Barn swallow : 1170 Max. :4.000
## (Other) :18831 NA's :231
## incident_month incident_year time_of_day time
## Min. : 1.00 Min. :1990 Dawn : 1269 Min. : -84
## 1st Qu.: 5.00 1st Qu.:2002 Day :25113 1st Qu.: 930
## Median : 8.00 Median :2010 Dusk : 1717 Median :1426
## Mean : 7.23 Mean :2008 Night:12731 Mean :1428
## 3rd Qu.: 9.00 3rd Qu.:2015 NA's :15619 3rd Qu.:1950
## Max. :12.00 Max. :2018 Max. :2359
## NA's :25607
## height speed phase_of_flt sky
## Min. : 0.0 Min. : 0.0 Approach :20592 No Cloud :18926
## 1st Qu.: 0.0 1st Qu.:130.0 Take-off run: 8226 Overcast : 5445
## Median : 50.0 Median :140.0 Climb : 7656 Some Cloud:12102
## Mean : 984.2 Mean :154.6 Landing Roll: 7240 NA's :19976
## 3rd Qu.: 1000.0 3rd Qu.:170.0 Descent : 661
## Max. :25000.0 Max. :354.0 (Other) : 1110
## NA's :17526 NA's :29522 NA's :10964
## precip cost_repairs_infl_adj
## None :32922 Min. : 11
## Rain : 1686 1st Qu.: 5128
## Fog : 587 Median : 26783
## Snow : 85 Mean : 242388
## Fog, Rain: 58 3rd Qu.: 93124
## (Other) : 8 Max. :16380000
## NA's :21103 NA's :55834
sapply(df, function(x) n_distinct(x)) %>% sort()
## operator sky type_eng
## 4 4 5
## damage num_engs time_of_day
## 5 5 5
## precip incident_month phase_of_flt
## 8 12 25
## incident_year state atype
## 29 59 91
## speed height airport_id
## 166 340 423
## airport species species_id
## 423 527 528
## cost_repairs_infl_adj time incident_date
## 529 1331 9880
View missing values in more detail.
#Visualize missing values
vis_miss(df, sort_miss=TRUE, warn_large_data = FALSE)
#see count of missing values
na_values <- function(df){
na <- colSums(is.na(df)) %>% sort(decreasing=TRUE)
na[na>0]
}
na_values(df)
## cost_repairs_infl_adj speed time
## 55834 29522 25607
## precip sky height
## 21103 19976 17526
## time_of_day phase_of_flt damage
## 15619 10964 4317
## type_eng num_engs
## 232 231
#remove repair costs since there are too many missing values
df <- df %>% select(-cost_repairs_infl_adj,-incident_month,-incident_year)
#mutate date and time columns to join into a single column
df <- df %>% mutate(time = str_replace(time, "^\\d{1,2}$", paste0(time,"00")),
time = as.double(time),
time = case_when(time_of_day == "Night" & time < 1700 ~ time+1200,
time_of_day == "Dusk" & time < 1200 ~ time+1200,
is.na(time) ~ 0,
TRUE ~ time),
time = str_pad(time, 4, side="left", pad="0"),
time = if_else(time_of_day == "Night" & time > 2400,
str_pad(time, 6, side="left", pad="0"),
str_pad(time, 6, side="right", pad="0")),
incident_date = str_sub(incident_date,1,10),
datetime = paste(incident_date,time),
#this will convert to date/time
datetime = as_datetime(datetime),
#need to put it back to character in order to make proper NA replacement
#it picks a random time instead of inserting 0s if this is not done
datetime = as.character(datetime),
#replace NA values generated from illegal time formats
datetime = replace(datetime,is.na(datetime),incident_date[is.na(datetime)]),
#convert back to date/time
datetime = as_datetime(datetime)) %>%
select(-time,-incident_date)
#complete time_of_day based on time of year, day, and location
df <- df %>% mutate(month = month(datetime),
hour = hour(datetime)) %>%
group_by(month,hour,state) %>%
mutate(time_of_day = replace_na(time_of_day, mfv1(time_of_day, na.rm=T))) %>%
ungroup()
na_values(df) #there are still 151 missing time_of_day values
## speed precip sky height phase_of_flt
## 29522 21103 19976 17526 10964
## damage type_eng num_engs time_of_day
## 4317 232 231 151
#fill in values with educated guesses based on time of day
df <- df %>% mutate(time_of_day = as.character(time_of_day),
time_of_day = case_when(is.na(time_of_day) & hour > 5 & hour < 7 ~ "Dawn",
is.na(time_of_day) & hour > 17 & hour < 19 ~ "Dusk",
is.na(time_of_day) & hour >= 7 & hour <= 17 ~ "Day",
is.na(time_of_day) & hour >=19 | hour <=5 ~ "Night",
TRUE ~ time_of_day))
na_values(df)
## speed precip sky height phase_of_flt
## 29522 21103 19976 17526 10964
## damage type_eng num_engs
## 4317 232 231
#make airport names more readable
df <- df %>% mutate(airport = str_replace_all(airport," ARPT| INTL",""),
airport = str_to_title(airport) %>% as.factor())
#clean phase_of_flt
df <- df %>% mutate(phase_of_flt = str_to_lower(phase_of_flt),
phase_of_flt = replace_na(phase_of_flt,"unknown"),
phase_of_flt = str_replace(phase_of_flt, "local", "unknown"),
phase_of_flt = as.factor(phase_of_flt),
phase_of_flt = fct_relevel(phase_of_flt, c("parked","taxi","take-off run","departure","climb","en route","descent","approach","landing roll","arrival","unknown")))
summary(as.factor(df$phase_of_flt))
## parked taxi take-off run departure climb
## 12 76 8264 470 7672
## en route descent approach landing roll arrival
## 81 661 20642 7278 190
## unknown
## 11103
The first three visualizations are misleading because there is no information on number of total flights so per capita amounts cannot be established.
The FAA Color Scheme was used for these visualizations.
#this is misrepresenting the data because there are more day and night hours in a day
df %>% ggplot(aes(time_of_day))+
geom_bar(fill="#0f4d92")+
labs(title="What part of the day do most bird strikes happen?")
#this is misrepresenting the data because some times are more popular to fly than others
df %>% filter(hour != 0) %>%
ggplot(aes(hour, fill=time_of_day))+
geom_bar()+
scale_fill_manual(values=c("#eedd82","#0f4d92","#b8860b","#008000"))+
labs(title="What time of day do most bird strikes happen?")
#this could be misrepresenting the data because some airports are busier than others
df %>% filter(airport_id != "ZZZZ") %>% count(airport) %>% filter(n > 700) %>% mutate(airport = fct_reorder(airport,n)) %>%
ggplot(aes(airport, n))+
geom_col(fill="#0f4d92")+
coord_flip()+
labs(title="Which airport has the most bird strikes?")
#final visualization
df_flt_phase <- df %>% filter(phase_of_flt != "unknown") %>% count(phase_of_flt)
df_flt_phase <- df_flt_phase %>% mutate_at(vars(n), rescale)
df_flt_phase %>% spread(phase_of_flt,n) %>%
ggradar(axis.label.size = 4,
background.circle.colour = "#ffffff",
values.radar = c("","",""),
gridline.min.colour = "#b8860b",
gridline.mid.colour = "#eedd82",
gridline.max.colour = "#b8860b",
axis.line.colour = "#008000",
group.point.size = 4,
group.colours = "#0f4d92",
plot.title = "When is an airplane most likely to strike a bird?")+
labs(caption = "Data: FAA Wildlife Strike Database | Graphic: Cat Williams @catrwilliams")+
theme(legend.position = "none",
plot.title = element_text(size=17, hjust=0.5, face="bold"),
plot.caption = element_text(size=6))
ggsave("wildlife-impacts.png")