1 Project Description

Tidy Tuesday has a weekly data project aimed at the R ecosystem. An emphasis is placed on understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.

2 Dataset

Data came from NASA. The data includes meteorite information such as the class, mass, and location when meteorites were found/fallen.

3 Setup

3.1 Load Libraries

#function to check if packages are installed, if not then install them, and load all packages
libraries <- function(packages){
  for(package in packages){
    #checks if package is installed
    if(!require(package, character.only = TRUE)){
      #If package does not exist, then it will install
      install.packages(package, dependencies = TRUE)
      #Loads package
      library(package, character.only = TRUE)
    }
  }
}

packages <- c("tidyverse","readxl","visdat","modeest","maps","ggthemes","RColorBrewer")

libraries(packages)

theme_set(theme_classic()) #applies classic theme to all charts

3.2 Import Data

df <- read.csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-11/meteorites.csv")

4 Data Wrangling

There are 45,716 observations and 10 variables.
There are several NA values that will be looked at in more detail later.
name and id are both unique identifiers
geolocation is just the summary of both lat and long
there a bunch of coordinates entered as 0 - these are basically NA

4.1 View Data

glimpse(df)

## Observations: 45,716
## Variables: 10
## $ name        <fct> Aachen, Aarhus, Abee, Acapulco, Achiras, Adhi Kot,...
## $ id          <int> 1, 2, 6, 10, 370, 379, 390, 392, 398, 417, 423, 42...
## $ name_type   <fct> Valid, Valid, Valid, Valid, Valid, Valid, Valid, V...
## $ class       <fct> "L5", "H6", "EH4", "Acapulcoite", "L6", "EH4", "LL...
## $ mass        <dbl> 21, 720, 107000, 1914, 780, 4239, 910, 30000, 1620...
## $ fall        <fct> Fell, Fell, Fell, Fell, Fell, Fell, Fell, Fell, Fe...
## $ year        <int> 1880, 1951, 1952, 1976, 1902, 1919, 1949, 1814, 19...
## $ lat         <dbl> 50.77500, 56.18333, 54.21667, 16.88333, -33.16667,...
## $ long        <dbl> 6.08333, 10.23333, -113.00000, -99.90000, -64.9500...
## $ geolocation <fct> "(50.775, 6.08333)", "(56.18333, 10.23333)", "(54....

head(df)

##       name  id name_type       class   mass fall year       lat       long
## 1   Aachen   1     Valid          L5     21 Fell 1880  50.77500    6.08333
## 2   Aarhus   2     Valid          H6    720 Fell 1951  56.18333   10.23333
## 3     Abee   6     Valid         EH4 107000 Fell 1952  54.21667 -113.00000
## 4 Acapulco  10     Valid Acapulcoite   1914 Fell 1976  16.88333  -99.90000
## 5  Achiras 370     Valid          L6    780 Fell 1902 -33.16667  -64.95000
## 6 Adhi Kot 379     Valid         EH4   4239 Fell 1919  32.10000   71.80000
##            geolocation
## 1    (50.775, 6.08333)
## 2 (56.18333, 10.23333)
## 3   (54.21667, -113.0)
## 4    (16.88333, -99.9)
## 5  (-33.16667, -64.95)
## 6         (32.1, 71.8)

summary(df)

##               name             id         name_type         class      
##  Ãâsterplana 002:    1   Min.   :    1   Relict:   75   L6     : 8339  
##  Ãâsterplana 003:    1   1st Qu.:12689   Valid :45641   H5     : 7164  
##  Ãâsterplana 004:    1   Median :24262                  L5     : 4817  
##  Ãâsterplana 005:    1   Mean   :26890                  H6     : 4529  
##  Ãâsterplana 006:    1   3rd Qu.:40657                  H4     : 4222  
##  Ãâsterplana 007:    1   Max.   :57458                  LL5    : 2766  
##  (Other)        :45710                                  (Other):13879  
##       mass             fall            year           lat        
##  Min.   :       0   Fell : 1107   Min.   : 860   Min.   :-87.37  
##  1st Qu.:       7   Found:44609   1st Qu.:1987   1st Qu.:-76.71  
##  Median :      33                 Median :1998   Median :-71.50  
##  Mean   :   13278                 Mean   :1992   Mean   :-39.12  
##  3rd Qu.:     203                 3rd Qu.:2003   3rd Qu.:  0.00  
##  Max.   :60000000                 Max.   :2101   Max.   : 81.17  
##  NA's   :131                      NA's   :291    NA's   :7315    
##       long                      geolocation   
##  Min.   :-165.43   (0.0, 0.0)         : 6214  
##  1st Qu.:   0.00   (-71.5, 35.66667)  : 4761  
##  Median :  35.67   (-84.0, 168.0)     : 3040  
##  Mean   :  61.07   (-72.0, 26.0)      : 1505  
##  3rd Qu.: 157.17   (-79.68333, 159.75):  657  
##  Max.   : 354.47   (Other)            :22224  
##  NA's   :7315      NA's               : 7315

sapply(df, function(x) n_distinct(x)) %>% sort()

##   name_type        fall        year       class        mass         lat 
##           2           2         266         455       12577       12739 
##        long geolocation        name          id 
##       14641       17101       45716       45716

4.2 Missing Values

View missing values in more detail and remove or fix them.

#Visualize missing values
vis_miss(df[,colSums(is.na(df)|df==0) >0], sort_miss=TRUE)

#see count of missing values
na_values <- function(df){
  na <- colSums(is.na(df)|df==0) %>% sort(decreasing=TRUE)
  na[na>0]
}

na_values(df)

##         lat        long geolocation        year        mass 
##       13753       13529        7315         291         150

#remove NA coordinates
df <- df %>% drop_na(geolocation)

#remove missing coordinates
df <- subset(df, lat!=0 & long!=0)

4.3 Feature Engineering

Meteorite classification data came from The Meteorite Market. The table includes merged cells so it came over a lot cleaner after copy/pasting into Excel and then unmerging all cells.

#import classification table to match class and determine category
df_class <- read_xlsx("classifications.xlsx", skip=3)
df_class <- df_class %>% select(category=Category,class=`Letter Designation`,comp=`Composition Type`)

#remove rows with NA in class
df_class <- df_class %>% drop_na(class)

#set category to the correct values and clean data to make the class column optimal for matching
df_class <- df_class %>% 
  mutate(category = case_when((row_number() <= which(df_class[,2] == "R")) ~ "Chondrites",
                             (row_number() >= which(df_class[,2] == "HOW") & 
                                row_number() <= which(df_class[,2] == "WIN")) ~ "Achondrites",
                             (row_number() >= which(df_class[,2] == "H") & 
                                row_number() <= which(df_class[,2] == "D")) ~ "Irons (structural)",
                             (row_number() >= which(df_class[,2] == "IAB") & 
                                row_number() <= which(df_class[,2] == "Anom")) ~ "Irons (chemical)",
                             (row_number() >= which(df_class[,2] == "PAL")) ~ "Stony Irons",
                              TRUE ~ category),
        comp = str_replace(comp,"([:graph:]+\\s+[:graph:]+)\\s.*",""),
        class = str_split(class, ",\\s|.\\s|-")) %>% unnest(class)

#replace blank values with NAs
df_class$comp[df_class$comp == ""]<-NA

df_class <- df_class %>% mutate(comp = str_split(comp, "\\s\\(")) %>% unnest(comp)
df_class <- df_class %>% mutate(comp = str_replace(comp, "\\*|\\)", ""),
                                comp = str_extract(comp, ".*[^s$]"))

#combine the comp and class column since the data uses both variations as its classification
df_class <- df_class %>% gather(column, class, comp:class) %>% select(-column) %>% distinct() %>% drop_na()

#make sure the original df has clean class columns for matching
df <- df %>% mutate(class = str_replace_all(class, "[:punct:]+|~+", " "))

#function to find if any string value in class matches in df_class 
find_cat <- function(x, patterns, replacements=patterns, fill=NA, ...){
  stopifnot(length(patterns) == length(replacements))

  ans = rep_len(as.character(fill), length(x))    
  empty = seq_along(x)

  for(i in seq_along(patterns)) {
      greps = grepl(patterns[[i]], x[empty], ...)
      ans[empty[greps]] = replacements[[i]]  
      empty = empty[!greps]
  }
  return(ans)
}

df$category <- find_cat(df$class, df_class$class, df_class$category, NA, ignore.case = TRUE)

#view the class values that still need to be worked on
df[!df$class %in% df_class$class & df$category %in% NA,] %>% select(class,category) %>% distinct()

##           class category
## 1             L     <NA>
## 2           CV3     <NA>
## 3    Stone uncl     <NA>
## 4            OC     <NA>
## 5        C2 ung     <NA>
## 6        C3 ung     <NA>
## 7            LL     <NA>
## 8             C     <NA>
## 9           L 6     <NA>
## 10          L 3     <NA>
## 11 L LL 3 5 3 7     <NA>
## 12       CV3 an     <NA>
## 13           C6     <NA>
## 14           C4     <NA>
## 15   L imp melt     <NA>
## 16          L 5     <NA>
## 17       C4 ung     <NA>
## 18         LL 6     <NA>
## 19       L LL 3     <NA>
## 20        CM an     <NA>
## 21       L LL 6     <NA>
## 22       L LL 5     <NA>
## 23          L 4     <NA>
## 24       L LL 4     <NA>
## 25       LL L 3     <NA>
## 26           C2     <NA>
## 27         C4 5     <NA>
## 28            E     <NA>
## 29      L metal     <NA>
## 30    L LL 3 05     <NA>
## 31     C5 6 ung     <NA>
## 32         E an     <NA>
## 33         LL 5     <NA>
## 34         LL 3     <NA>
## 35    Stone ung     <NA>
## 36     C1 2 ung     <NA>
## 37         L LL     <NA>

#classify the remaining classes
#according to https://en.wikipedia.org/wiki/Ordinary_chondrite OC is a Chondrite
df <- df %>% mutate(category = case_when(str_detect(class, "^C|LL|L|E|OC") ~ "Chondrites",
                                        TRUE ~ category),
                    category = replace_na(category, mfv(category, na.rm=T))) #the remaining values cannot be classified. Using mode to give them a value

4.4 Data Types

Convert characters to factors

#finally change the data types back to factors and remove duplicates that were made above
df <- df %>% mutate_if(is.character,as.factor)

5 Visualizations

5.1 World

world <- map_data("world")

#practice using geom_polygon
ggplot(world, aes(x=long, y=lat, group=group))+
  geom_polygon(fill="#f2e6d9",color = "white")+
  geom_point(df, mapping=aes(group=category, color=category), alpha=0.5, size=0.5)+
  coord_map(xlim=c(-180,180))+
  theme_map()+
  scale_color_brewer(palette="Set1")+
  labs(title="Meteorite Categories Across The World", color="Category")+
  guides(color=guide_legend(override.aes=list(size=4)))+
  theme(plot.background=element_rect(fill="#ccefff"),
        legend.position=c(0, 0.09),
        legend.background=element_rect(fill="#ccefff"),
        legend.key=element_rect(fill="#ccefff"),
        plot.title=element_text(size=14, hjust=0.5),
        legend.title.align=0.5,
        legend.title=element_text(face="bold"))

5.2 United States

5.2.1 Prepare Data Frame

state <- map_data("state")
usa <- map_data("usa")

#filter out points that do not belong in the usa
x=usa$long
y=usa$lat
long=df$long
lat=df$lat

#define the points and polygon
points <- cbind(long,lat)
polypoints <- cbind(x,y)

#plot the polygon and all points to be checked
plot(rbind(polypoints,points))
polygon(polypoints,col="red")

#check which points fall within the polygon
out <- pnt.in.poly(points,polypoints)
summary(out)

##       long              lat              pip         
##  Min.   :-165.43   Min.   :-87.37   Min.   :0.00000  
##  1st Qu.:  26.00   1st Qu.:-79.68   1st Qu.:0.00000  
##  Median :  56.87   Median :-72.00   Median :0.00000  
##  Mean   :  73.13   Mean   :-47.00   Mean   :0.05175  
##  3rd Qu.: 159.39   3rd Qu.: 18.57   3rd Qu.:0.00000  
##  Max.   : 354.47   Max.   : 81.17   Max.   :1.00000

#identify points not in the polygon with an x
plot(rbind(polypoints,points))
polygon(polypoints,col='#99999990')
points(out[which(out$pip==0),1:2],pch=4,col="red")

#filter df for only the relevant points
df_usa <- bind_cols(df,out) %>% select(-long1,-lat1) %>% filter(pip==1)

5.2.2 Visualization

##practice using geom_map
ggplot() +
  geom_map(data=state, aes(x=long, y=lat, group=group, map_id=region), fill="#f8f2ec",color="grey", map=state)+
  geom_map(data=usa, map=usa, aes(long, lat, map_id=region), color="black", fill=NA, size=0.5)+
  geom_point(df_usa, mapping=aes(x=long, y=lat, group=category, color=category), alpha=0.8, size=0.9)+
  coord_map("polyconic", xlim=c(-124.7, -67.1), ylim = c(25.2, 49.4)) +
  theme_map()+
  labs(title="Meteorite Categories Across The United States", color="Category")+
  scale_color_brewer(palette="Set1")+
  guides(color=guide_legend(override.aes=list(size=4)))+
  theme(plot.background=element_rect(fill="#e6f7ff"),
        legend.position=c(0.83, 0.02),
        legend.background=element_rect(fill="#e6f7ff"),
        legend.key=element_rect(fill="#e6f7ff"),
        plot.title=element_text(size=14, hjust=0.5),
        legend.title.align=0.5,
        legend.title=element_text(face="bold"))

Meteorites

Catherine Williams

June 12, 2019

Tidy Tuesday