1 Project Description

Tidy Tuesday has a weekly data project aimed at the R ecosystem. An emphasis is placed on understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.

2 Dataset

Data came from NASA. The data includes meteorite information such as the class, mass, and location when meteorites were found/fallen.

3 Setup

3.1 Load Libraries

#function to check if packages are installed, if not then install them, and load all packages
libraries <- function(packages){
  for(package in packages){
    #checks if package is installed
    if(!require(package, character.only = TRUE)){
      #If package does not exist, then it will install
      install.packages(package, dependencies = TRUE)
      #Loads package
      library(package, character.only = TRUE)
    }
  }
}

packages <- c("tidyverse","readxl","visdat","modeest","maps","ggthemes","RColorBrewer")

libraries(packages)

theme_set(theme_classic()) #applies classic theme to all charts

3.2 Import Data

df <- read.csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-11/meteorites.csv")

4 Data Wrangling

  • There are 45,716 observations and 10 variables.
  • There are several NA values that will be looked at in more detail later.
  • name and id are both unique identifiers
  • geolocation is just the summary of both lat and long
  • there a bunch of coordinates entered as 0 - these are basically NA

4.1 View Data

glimpse(df)
## Observations: 45,716
## Variables: 10
## $ name        <fct> Aachen, Aarhus, Abee, Acapulco, Achiras, Adhi Kot,...
## $ id          <int> 1, 2, 6, 10, 370, 379, 390, 392, 398, 417, 423, 42...
## $ name_type   <fct> Valid, Valid, Valid, Valid, Valid, Valid, Valid, V...
## $ class       <fct> "L5", "H6", "EH4", "Acapulcoite", "L6", "EH4", "LL...
## $ mass        <dbl> 21, 720, 107000, 1914, 780, 4239, 910, 30000, 1620...
## $ fall        <fct> Fell, Fell, Fell, Fell, Fell, Fell, Fell, Fell, Fe...
## $ year        <int> 1880, 1951, 1952, 1976, 1902, 1919, 1949, 1814, 19...
## $ lat         <dbl> 50.77500, 56.18333, 54.21667, 16.88333, -33.16667,...
## $ long        <dbl> 6.08333, 10.23333, -113.00000, -99.90000, -64.9500...
## $ geolocation <fct> "(50.775, 6.08333)", "(56.18333, 10.23333)", "(54....
head(df)
##       name  id name_type       class   mass fall year       lat       long
## 1   Aachen   1     Valid          L5     21 Fell 1880  50.77500    6.08333
## 2   Aarhus   2     Valid          H6    720 Fell 1951  56.18333   10.23333
## 3     Abee   6     Valid         EH4 107000 Fell 1952  54.21667 -113.00000
## 4 Acapulco  10     Valid Acapulcoite   1914 Fell 1976  16.88333  -99.90000
## 5  Achiras 370     Valid          L6    780 Fell 1902 -33.16667  -64.95000
## 6 Adhi Kot 379     Valid         EH4   4239 Fell 1919  32.10000   71.80000
##            geolocation
## 1    (50.775, 6.08333)
## 2 (56.18333, 10.23333)
## 3   (54.21667, -113.0)
## 4    (16.88333, -99.9)
## 5  (-33.16667, -64.95)
## 6         (32.1, 71.8)
summary(df)
##               name             id         name_type         class      
##  Österplana 002:    1   Min.   :    1   Relict:   75   L6     : 8339  
##  Österplana 003:    1   1st Qu.:12689   Valid :45641   H5     : 7164  
##  Österplana 004:    1   Median :24262                  L5     : 4817  
##  Österplana 005:    1   Mean   :26890                  H6     : 4529  
##  Österplana 006:    1   3rd Qu.:40657                  H4     : 4222  
##  Österplana 007:    1   Max.   :57458                  LL5    : 2766  
##  (Other)        :45710                                  (Other):13879  
##       mass             fall            year           lat        
##  Min.   :       0   Fell : 1107   Min.   : 860   Min.   :-87.37  
##  1st Qu.:       7   Found:44609   1st Qu.:1987   1st Qu.:-76.71  
##  Median :      33                 Median :1998   Median :-71.50  
##  Mean   :   13278                 Mean   :1992   Mean   :-39.12  
##  3rd Qu.:     203                 3rd Qu.:2003   3rd Qu.:  0.00  
##  Max.   :60000000                 Max.   :2101   Max.   : 81.17  
##  NA's   :131                      NA's   :291    NA's   :7315    
##       long                      geolocation   
##  Min.   :-165.43   (0.0, 0.0)         : 6214  
##  1st Qu.:   0.00   (-71.5, 35.66667)  : 4761  
##  Median :  35.67   (-84.0, 168.0)     : 3040  
##  Mean   :  61.07   (-72.0, 26.0)      : 1505  
##  3rd Qu.: 157.17   (-79.68333, 159.75):  657  
##  Max.   : 354.47   (Other)            :22224  
##  NA's   :7315      NA's               : 7315
sapply(df, function(x) n_distinct(x)) %>% sort()
##   name_type        fall        year       class        mass         lat 
##           2           2         266         455       12577       12739 
##        long geolocation        name          id 
##       14641       17101       45716       45716

4.2 Missing Values

View missing values in more detail and remove or fix them.

#Visualize missing values
vis_miss(df[,colSums(is.na(df)|df==0) >0], sort_miss=TRUE)

#see count of missing values
na_values <- function(df){
  na <- colSums(is.na(df)|df==0) %>% sort(decreasing=TRUE)
  na[na>0]
}

na_values(df)
##         lat        long geolocation        year        mass 
##       13753       13529        7315         291         150
#remove NA coordinates
df <- df %>% drop_na(geolocation)

#remove missing coordinates
df <- subset(df, lat!=0 & long!=0)

4.3 Feature Engineering

Meteorite classification data came from The Meteorite Market. The table includes merged cells so it came over a lot cleaner after copy/pasting into Excel and then unmerging all cells.

#import classification table to match class and determine category
df_class <- read_xlsx("classifications.xlsx", skip=3)
df_class <- df_class %>% select(category=Category,class=`Letter Designation`,comp=`Composition Type`)

#remove rows with NA in class
df_class <- df_class %>% drop_na(class)

#set category to the correct values and clean data to make the class column optimal for matching
df_class <- df_class %>% 
  mutate(category = case_when((row_number() <= which(df_class[,2] == "R")) ~ "Chondrites",
                             (row_number() >= which(df_class[,2] == "HOW") & 
                                row_number() <= which(df_class[,2] == "WIN")) ~ "Achondrites",
                             (row_number() >= which(df_class[,2] == "H") & 
                                row_number() <= which(df_class[,2] == "D")) ~ "Irons (structural)",
                             (row_number() >= which(df_class[,2] == "IAB") & 
                                row_number() <= which(df_class[,2] == "Anom")) ~ "Irons (chemical)",
                             (row_number() >= which(df_class[,2] == "PAL")) ~ "Stony Irons",
                              TRUE ~ category),
        comp = str_replace(comp,"([:graph:]+\\s+[:graph:]+)\\s.*",""),
        class = str_split(class, ",\\s|.\\s|-")) %>% unnest(class)

#replace blank values with NAs
df_class$comp[df_class$comp == ""]<-NA

df_class <- df_class %>% mutate(comp = str_split(comp, "\\s\\(")) %>% unnest(comp)
df_class <- df_class %>% mutate(comp = str_replace(comp, "\\*|\\)", ""),
                                comp = str_extract(comp, ".*[^s$]"))

#combine the comp and class column since the data uses both variations as its classification
df_class <- df_class %>% gather(column, class, comp:class) %>% select(-column) %>% distinct() %>% drop_na()

#make sure the original df has clean class columns for matching
df <- df %>% mutate(class = str_replace_all(class, "[:punct:]+|~+", " "))

#function to find if any string value in class matches in df_class 
find_cat <- function(x, patterns, replacements=patterns, fill=NA, ...){
  stopifnot(length(patterns) == length(replacements))

  ans = rep_len(as.character(fill), length(x))    
  empty = seq_along(x)

  for(i in seq_along(patterns)) {
      greps = grepl(patterns[[i]], x[empty], ...)
      ans[empty[greps]] = replacements[[i]]  
      empty = empty[!greps]
  }
  return(ans)
}

df$category <- find_cat(df$class, df_class$class, df_class$category, NA, ignore.case = TRUE)

#view the class values that still need to be worked on
df[!df$class %in% df_class$class & df$category %in% NA,] %>% select(class,category) %>% distinct()
##           class category
## 1             L     <NA>
## 2           CV3     <NA>
## 3    Stone uncl     <NA>
## 4            OC     <NA>
## 5        C2 ung     <NA>
## 6        C3 ung     <NA>
## 7            LL     <NA>
## 8             C     <NA>
## 9           L 6     <NA>
## 10          L 3     <NA>
## 11 L LL 3 5 3 7     <NA>
## 12       CV3 an     <NA>
## 13           C6     <NA>
## 14           C4     <NA>
## 15   L imp melt     <NA>
## 16          L 5     <NA>
## 17       C4 ung     <NA>
## 18         LL 6     <NA>
## 19       L LL 3     <NA>
## 20        CM an     <NA>
## 21       L LL 6     <NA>
## 22       L LL 5     <NA>
## 23          L 4     <NA>
## 24       L LL 4     <NA>
## 25       LL L 3     <NA>
## 26           C2     <NA>
## 27         C4 5     <NA>
## 28            E     <NA>
## 29      L metal     <NA>
## 30    L LL 3 05     <NA>
## 31     C5 6 ung     <NA>
## 32         E an     <NA>
## 33         LL 5     <NA>
## 34         LL 3     <NA>
## 35    Stone ung     <NA>
## 36     C1 2 ung     <NA>
## 37         L LL     <NA>
#classify the remaining classes
#according to https://en.wikipedia.org/wiki/Ordinary_chondrite OC is a Chondrite
df <- df %>% mutate(category = case_when(str_detect(class, "^C|LL|L|E|OC") ~ "Chondrites",
                                        TRUE ~ category),
                    category = replace_na(category, mfv(category, na.rm=T))) #the remaining values cannot be classified. Using mode to give them a value

4.4 Data Types

Convert characters to factors

#finally change the data types back to factors and remove duplicates that were made above
df <- df %>% mutate_if(is.character,as.factor)

5 Visualizations

5.1 World

world <- map_data("world")

#practice using geom_polygon
ggplot(world, aes(x=long, y=lat, group=group))+
  geom_polygon(fill="#f2e6d9",color = "white")+
  geom_point(df, mapping=aes(group=category, color=category), alpha=0.5, size=0.5)+
  coord_map(xlim=c(-180,180))+
  theme_map()+
  scale_color_brewer(palette="Set1")+
  labs(title="Meteorite Categories Across The World", color="Category")+
  guides(color=guide_legend(override.aes=list(size=4)))+
  theme(plot.background=element_rect(fill="#ccefff"),
        legend.position=c(0, 0.09),
        legend.background=element_rect(fill="#ccefff"),
        legend.key=element_rect(fill="#ccefff"),
        plot.title=element_text(size=14, hjust=0.5),
        legend.title.align=0.5,
        legend.title=element_text(face="bold"))

5.2 United States

5.2.1 Prepare Data Frame

state <- map_data("state")
usa <- map_data("usa")

#filter out points that do not belong in the usa
x=usa$long
y=usa$lat
long=df$long
lat=df$lat

#define the points and polygon
points <- cbind(long,lat)
polypoints <- cbind(x,y)

#plot the polygon and all points to be checked
plot(rbind(polypoints,points))
polygon(polypoints,col="red")

#check which points fall within the polygon
out <- pnt.in.poly(points,polypoints)
summary(out)
##       long              lat              pip         
##  Min.   :-165.43   Min.   :-87.37   Min.   :0.00000  
##  1st Qu.:  26.00   1st Qu.:-79.68   1st Qu.:0.00000  
##  Median :  56.87   Median :-72.00   Median :0.00000  
##  Mean   :  73.13   Mean   :-47.00   Mean   :0.05175  
##  3rd Qu.: 159.39   3rd Qu.: 18.57   3rd Qu.:0.00000  
##  Max.   : 354.47   Max.   : 81.17   Max.   :1.00000
#identify points not in the polygon with an x
plot(rbind(polypoints,points))
polygon(polypoints,col='#99999990')
points(out[which(out$pip==0),1:2],pch=4,col="red")

#filter df for only the relevant points
df_usa <- bind_cols(df,out) %>% select(-long1,-lat1) %>% filter(pip==1)

5.2.2 Visualization

##practice using geom_map
ggplot() +
  geom_map(data=state, aes(x=long, y=lat, group=group, map_id=region), fill="#f8f2ec",color="grey", map=state)+
  geom_map(data=usa, map=usa, aes(long, lat, map_id=region), color="black", fill=NA, size=0.5)+
  geom_point(df_usa, mapping=aes(x=long, y=lat, group=category, color=category), alpha=0.8, size=0.9)+
  coord_map("polyconic", xlim=c(-124.7, -67.1), ylim = c(25.2, 49.4)) +
  theme_map()+
  labs(title="Meteorite Categories Across The United States", color="Category")+
  scale_color_brewer(palette="Set1")+
  guides(color=guide_legend(override.aes=list(size=4)))+
  theme(plot.background=element_rect(fill="#e6f7ff"),
        legend.position=c(0.83, 0.02),
        legend.background=element_rect(fill="#e6f7ff"),
        legend.key=element_rect(fill="#e6f7ff"),
        plot.title=element_text(size=14, hjust=0.5),
        legend.title.align=0.5,
        legend.title=element_text(face="bold"))