Tidy Tuesday has a weekly data project aimed at the R ecosystem. An emphasis is placed on understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.
Data came from NASA. The data includes meteorite information such as the class, mass, and location when meteorites were found/fallen.
#function to check if packages are installed, if not then install them, and load all packages
libraries <- function(packages){
for(package in packages){
#checks if package is installed
if(!require(package, character.only = TRUE)){
#If package does not exist, then it will install
install.packages(package, dependencies = TRUE)
#Loads package
library(package, character.only = TRUE)
}
}
}
packages <- c("tidyverse","readxl","visdat","modeest","maps","ggthemes","RColorBrewer")
libraries(packages)
theme_set(theme_classic()) #applies classic theme to all charts
df <- read.csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-11/meteorites.csv")
glimpse(df)
## Observations: 45,716
## Variables: 10
## $ name <fct> Aachen, Aarhus, Abee, Acapulco, Achiras, Adhi Kot,...
## $ id <int> 1, 2, 6, 10, 370, 379, 390, 392, 398, 417, 423, 42...
## $ name_type <fct> Valid, Valid, Valid, Valid, Valid, Valid, Valid, V...
## $ class <fct> "L5", "H6", "EH4", "Acapulcoite", "L6", "EH4", "LL...
## $ mass <dbl> 21, 720, 107000, 1914, 780, 4239, 910, 30000, 1620...
## $ fall <fct> Fell, Fell, Fell, Fell, Fell, Fell, Fell, Fell, Fe...
## $ year <int> 1880, 1951, 1952, 1976, 1902, 1919, 1949, 1814, 19...
## $ lat <dbl> 50.77500, 56.18333, 54.21667, 16.88333, -33.16667,...
## $ long <dbl> 6.08333, 10.23333, -113.00000, -99.90000, -64.9500...
## $ geolocation <fct> "(50.775, 6.08333)", "(56.18333, 10.23333)", "(54....
head(df)
## name id name_type class mass fall year lat long
## 1 Aachen 1 Valid L5 21 Fell 1880 50.77500 6.08333
## 2 Aarhus 2 Valid H6 720 Fell 1951 56.18333 10.23333
## 3 Abee 6 Valid EH4 107000 Fell 1952 54.21667 -113.00000
## 4 Acapulco 10 Valid Acapulcoite 1914 Fell 1976 16.88333 -99.90000
## 5 Achiras 370 Valid L6 780 Fell 1902 -33.16667 -64.95000
## 6 Adhi Kot 379 Valid EH4 4239 Fell 1919 32.10000 71.80000
## geolocation
## 1 (50.775, 6.08333)
## 2 (56.18333, 10.23333)
## 3 (54.21667, -113.0)
## 4 (16.88333, -99.9)
## 5 (-33.16667, -64.95)
## 6 (32.1, 71.8)
summary(df)
## name id name_type class
## Österplana 002: 1 Min. : 1 Relict: 75 L6 : 8339
## Österplana 003: 1 1st Qu.:12689 Valid :45641 H5 : 7164
## Österplana 004: 1 Median :24262 L5 : 4817
## Österplana 005: 1 Mean :26890 H6 : 4529
## Österplana 006: 1 3rd Qu.:40657 H4 : 4222
## Österplana 007: 1 Max. :57458 LL5 : 2766
## (Other) :45710 (Other):13879
## mass fall year lat
## Min. : 0 Fell : 1107 Min. : 860 Min. :-87.37
## 1st Qu.: 7 Found:44609 1st Qu.:1987 1st Qu.:-76.71
## Median : 33 Median :1998 Median :-71.50
## Mean : 13278 Mean :1992 Mean :-39.12
## 3rd Qu.: 203 3rd Qu.:2003 3rd Qu.: 0.00
## Max. :60000000 Max. :2101 Max. : 81.17
## NA's :131 NA's :291 NA's :7315
## long geolocation
## Min. :-165.43 (0.0, 0.0) : 6214
## 1st Qu.: 0.00 (-71.5, 35.66667) : 4761
## Median : 35.67 (-84.0, 168.0) : 3040
## Mean : 61.07 (-72.0, 26.0) : 1505
## 3rd Qu.: 157.17 (-79.68333, 159.75): 657
## Max. : 354.47 (Other) :22224
## NA's :7315 NA's : 7315
sapply(df, function(x) n_distinct(x)) %>% sort()
## name_type fall year class mass lat
## 2 2 266 455 12577 12739
## long geolocation name id
## 14641 17101 45716 45716
View missing values in more detail and remove or fix them.
#Visualize missing values
vis_miss(df[,colSums(is.na(df)|df==0) >0], sort_miss=TRUE)
#see count of missing values
na_values <- function(df){
na <- colSums(is.na(df)|df==0) %>% sort(decreasing=TRUE)
na[na>0]
}
na_values(df)
## lat long geolocation year mass
## 13753 13529 7315 291 150
#remove NA coordinates
df <- df %>% drop_na(geolocation)
#remove missing coordinates
df <- subset(df, lat!=0 & long!=0)
Meteorite classification data came from The Meteorite Market. The table includes merged cells so it came over a lot cleaner after copy/pasting into Excel and then unmerging all cells.
#import classification table to match class and determine category
df_class <- read_xlsx("classifications.xlsx", skip=3)
df_class <- df_class %>% select(category=Category,class=`Letter Designation`,comp=`Composition Type`)
#remove rows with NA in class
df_class <- df_class %>% drop_na(class)
#set category to the correct values and clean data to make the class column optimal for matching
df_class <- df_class %>%
mutate(category = case_when((row_number() <= which(df_class[,2] == "R")) ~ "Chondrites",
(row_number() >= which(df_class[,2] == "HOW") &
row_number() <= which(df_class[,2] == "WIN")) ~ "Achondrites",
(row_number() >= which(df_class[,2] == "H") &
row_number() <= which(df_class[,2] == "D")) ~ "Irons (structural)",
(row_number() >= which(df_class[,2] == "IAB") &
row_number() <= which(df_class[,2] == "Anom")) ~ "Irons (chemical)",
(row_number() >= which(df_class[,2] == "PAL")) ~ "Stony Irons",
TRUE ~ category),
comp = str_replace(comp,"([:graph:]+\\s+[:graph:]+)\\s.*",""),
class = str_split(class, ",\\s|.\\s|-")) %>% unnest(class)
#replace blank values with NAs
df_class$comp[df_class$comp == ""]<-NA
df_class <- df_class %>% mutate(comp = str_split(comp, "\\s\\(")) %>% unnest(comp)
df_class <- df_class %>% mutate(comp = str_replace(comp, "\\*|\\)", ""),
comp = str_extract(comp, ".*[^s$]"))
#combine the comp and class column since the data uses both variations as its classification
df_class <- df_class %>% gather(column, class, comp:class) %>% select(-column) %>% distinct() %>% drop_na()
#make sure the original df has clean class columns for matching
df <- df %>% mutate(class = str_replace_all(class, "[:punct:]+|~+", " "))
#function to find if any string value in class matches in df_class
find_cat <- function(x, patterns, replacements=patterns, fill=NA, ...){
stopifnot(length(patterns) == length(replacements))
ans = rep_len(as.character(fill), length(x))
empty = seq_along(x)
for(i in seq_along(patterns)) {
greps = grepl(patterns[[i]], x[empty], ...)
ans[empty[greps]] = replacements[[i]]
empty = empty[!greps]
}
return(ans)
}
df$category <- find_cat(df$class, df_class$class, df_class$category, NA, ignore.case = TRUE)
#view the class values that still need to be worked on
df[!df$class %in% df_class$class & df$category %in% NA,] %>% select(class,category) %>% distinct()
## class category
## 1 L <NA>
## 2 CV3 <NA>
## 3 Stone uncl <NA>
## 4 OC <NA>
## 5 C2 ung <NA>
## 6 C3 ung <NA>
## 7 LL <NA>
## 8 C <NA>
## 9 L 6 <NA>
## 10 L 3 <NA>
## 11 L LL 3 5 3 7 <NA>
## 12 CV3 an <NA>
## 13 C6 <NA>
## 14 C4 <NA>
## 15 L imp melt <NA>
## 16 L 5 <NA>
## 17 C4 ung <NA>
## 18 LL 6 <NA>
## 19 L LL 3 <NA>
## 20 CM an <NA>
## 21 L LL 6 <NA>
## 22 L LL 5 <NA>
## 23 L 4 <NA>
## 24 L LL 4 <NA>
## 25 LL L 3 <NA>
## 26 C2 <NA>
## 27 C4 5 <NA>
## 28 E <NA>
## 29 L metal <NA>
## 30 L LL 3 05 <NA>
## 31 C5 6 ung <NA>
## 32 E an <NA>
## 33 LL 5 <NA>
## 34 LL 3 <NA>
## 35 Stone ung <NA>
## 36 C1 2 ung <NA>
## 37 L LL <NA>
#classify the remaining classes
#according to https://en.wikipedia.org/wiki/Ordinary_chondrite OC is a Chondrite
df <- df %>% mutate(category = case_when(str_detect(class, "^C|LL|L|E|OC") ~ "Chondrites",
TRUE ~ category),
category = replace_na(category, mfv(category, na.rm=T))) #the remaining values cannot be classified. Using mode to give them a value
Convert characters to factors
#finally change the data types back to factors and remove duplicates that were made above
df <- df %>% mutate_if(is.character,as.factor)
world <- map_data("world")
#practice using geom_polygon
ggplot(world, aes(x=long, y=lat, group=group))+
geom_polygon(fill="#f2e6d9",color = "white")+
geom_point(df, mapping=aes(group=category, color=category), alpha=0.5, size=0.5)+
coord_map(xlim=c(-180,180))+
theme_map()+
scale_color_brewer(palette="Set1")+
labs(title="Meteorite Categories Across The World", color="Category")+
guides(color=guide_legend(override.aes=list(size=4)))+
theme(plot.background=element_rect(fill="#ccefff"),
legend.position=c(0, 0.09),
legend.background=element_rect(fill="#ccefff"),
legend.key=element_rect(fill="#ccefff"),
plot.title=element_text(size=14, hjust=0.5),
legend.title.align=0.5,
legend.title=element_text(face="bold"))
state <- map_data("state")
usa <- map_data("usa")
#filter out points that do not belong in the usa
x=usa$long
y=usa$lat
long=df$long
lat=df$lat
#define the points and polygon
points <- cbind(long,lat)
polypoints <- cbind(x,y)
#plot the polygon and all points to be checked
plot(rbind(polypoints,points))
polygon(polypoints,col="red")
#check which points fall within the polygon
out <- pnt.in.poly(points,polypoints)
summary(out)
## long lat pip
## Min. :-165.43 Min. :-87.37 Min. :0.00000
## 1st Qu.: 26.00 1st Qu.:-79.68 1st Qu.:0.00000
## Median : 56.87 Median :-72.00 Median :0.00000
## Mean : 73.13 Mean :-47.00 Mean :0.05175
## 3rd Qu.: 159.39 3rd Qu.: 18.57 3rd Qu.:0.00000
## Max. : 354.47 Max. : 81.17 Max. :1.00000
#identify points not in the polygon with an x
plot(rbind(polypoints,points))
polygon(polypoints,col='#99999990')
points(out[which(out$pip==0),1:2],pch=4,col="red")
#filter df for only the relevant points
df_usa <- bind_cols(df,out) %>% select(-long1,-lat1) %>% filter(pip==1)
##practice using geom_map
ggplot() +
geom_map(data=state, aes(x=long, y=lat, group=group, map_id=region), fill="#f8f2ec",color="grey", map=state)+
geom_map(data=usa, map=usa, aes(long, lat, map_id=region), color="black", fill=NA, size=0.5)+
geom_point(df_usa, mapping=aes(x=long, y=lat, group=category, color=category), alpha=0.8, size=0.9)+
coord_map("polyconic", xlim=c(-124.7, -67.1), ylim = c(25.2, 49.4)) +
theme_map()+
labs(title="Meteorite Categories Across The United States", color="Category")+
scale_color_brewer(palette="Set1")+
guides(color=guide_legend(override.aes=list(size=4)))+
theme(plot.background=element_rect(fill="#e6f7ff"),
legend.position=c(0.83, 0.02),
legend.background=element_rect(fill="#e6f7ff"),
legend.key=element_rect(fill="#e6f7ff"),
plot.title=element_text(size=14, hjust=0.5),
legend.title.align=0.5,
legend.title=element_text(face="bold"))