1 Project Description

Tidy Tuesday is a weekly data project aimed at the R ecosystem. It emphasizes learning how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.

2 Dataset

The data came from The Ramen Rater. It includes ratings for different ramen brands, varieties, and styles by country.

3 Setup

3.1 Load Libraries

# Make sure every package in `packages` is installed and attached.
#
# packages: character vector of package names.
#
# Each package is installed (with its dependencies) only when it is not
# already available, and is then attached with library(), which stops with an
# error if the package still cannot be loaded.
#
# Returns `packages` invisibly.
libraries <- function(packages){
  for(package in packages){
    # requireNamespace() tests availability without attaching the package and
    # without the warning that require() emits for a missing package
    if(!requireNamespace(package, quietly = TRUE)){
      install.packages(package, dependencies = TRUE)
    }
    # Attach in every case so the package is on the search path afterwards
    library(package, character.only = TRUE)
  }
  invisible(packages)
}

# Packages this analysis depends on
packages <- c(
  "tidyverse", "naniar", "rvest", "textstem",
  "modeest", "countrycode", "ggridges", "viridis"
)

# Install whatever is missing, then attach everything
libraries(packages)

# Use the classic theme as the default for every chart that follows
theme_set(theme_classic())

3.2 Import Data

# Read the ramen ratings CSV from the TidyTuesday repository
df <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-04/ramen_ratings.csv"
)

4 Data Wrangling

  • There are 3180 observations and 6 variables.
  • 1 review_number is missing (1 NA value), and the column has only 3178 distinct values (counting NA) across 3180 rows, so two rows must repeat a review number that already appears elsewhere.
  • stars has 14 NA values.
  • style has 2 NA values.

4.1 View Data

# Column names, types and the first values of each column
df %>% glimpse()
## Observations: 3,180
## Variables: 6
## $ review_number <dbl> 3180, 3179, 3178, 3177, 3176, 3175, 3174, 3173, ...
## $ brand         <chr> "Yum Yum", "Nagatanien", "Acecook", "Maison de C...
## $ variety       <chr> "Tem Tem Tom Yum Moo Deng", "tom Yum Kung Rice V...
## $ style         <chr> "Cup", "Pack", "Cup", "Cup", "Tray", "Cup", "Pac...
## $ country       <chr> "Thailand", "Japan", "Japan", "France", "Japan",...
## $ stars         <dbl> 3.75, 2.00, 2.50, 3.75, 5.00, 3.50, 3.75, 5.00, ...
# First rows of the data
df %>% head()
## # A tibble: 6 x 6
##   review_number brand          variety                  style country stars
##           <dbl> <chr>          <chr>                    <chr> <chr>   <dbl>
## 1          3180 Yum Yum        Tem Tem Tom Yum Moo Deng Cup   Thaila~  3.75
## 2          3179 Nagatanien     tom Yum Kung Rice Vermi~ Pack  Japan    2   
## 3          3178 Acecook        Kelp Broth Shio Ramen    Cup   Japan    2.5 
## 4          3177 Maison de Cor~ Ramen Gout Coco Poulet   Cup   France   3.75
## 5          3176 Maruchan       Gotsumori Shio Yakisoba  Tray  Japan    5   
## 6          3175 Myojo          Chukazanmai Tantanmen    Cup   Japan    3.5
# Per-column summary, including NA counts for the numeric columns
df %>% summary()
##  review_number       brand             variety             style          
##  Min.   :   1.0   Length:3180        Length:3180        Length:3180       
##  1st Qu.: 795.5   Class :character   Class :character   Class :character  
##  Median :1590.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1590.1                                                           
##  3rd Qu.:2384.5                                                           
##  Max.   :3180.0                                                           
##  NA's   :1                                                                
##    country              stars      
##  Length:3180        Min.   :0.000  
##  Class :character   1st Qu.:3.250  
##  Mode  :character   Median :3.750  
##                     Mean   :3.688  
##                     3rd Qu.:4.500  
##                     Max.   :5.000  
##                     NA's   :14
# Number of distinct values in each column, smallest first.
# vapply() pins the result to one integer per column, whereas sapply()'s
# return type depends on its input.
vapply(df, n_distinct, integer(1)) %>% sort()
##         style         stars       country         brand       variety 
##             9            40            44           456          2971 
## review_number 
##          3178

4.2 Missing Values

The visualization confirms what the initial analysis already showed. It is still good to have a visual check.

# Plot the number of missing values per variable
df %>%
  gg_miss_var() +
  labs(title = "Missing Values")

# Count the NA values in each column, largest count first
sort(colSums(is.na(df)), decreasing = TRUE)
##         stars         style review_number         brand       variety 
##            14             2             1             0             0 
##       country 
##             0
# Store each row's position in a `rowname` column so that observations are
# easy to locate afterwards
df <- rownames_to_column(df)

# Show only the rows in which any column is NA or holds the string "NA"
df %>%
  filter_all(any_vars(is.na(.) | . == "NA"))
## # A tibble: 17 x 7
##    rowname review_number brand    variety              style country  stars
##    <chr>           <dbl> <chr>    <chr>                <chr> <chr>    <dbl>
##  1 31               3150 Hakubaku Baby Somen           Pack  Japan    NA   
##  2 32               3149 Hakubaku Baby Udon            Pack  Japan    NA   
##  3 190                NA Big Bon  Chicken & Salsaa Sa~ Pack  Russia    3.25
##  4 540              2641 Nanoblo~ Ramen Bokki          Pack  Japan    NA   
##  5 569              2612 Nona Lim Laksa Rice Noodles ~ Pack  United ~ NA   
##  6 570              2611 Nona Lim Tokyo Ramen + Miso ~ Pack  United ~ NA   
##  7 571              2610 Nona Lim Whole Wheat Ramen +~ Pack  United ~ NA   
##  8 572              2609 Nona Lim Pad Thai Rice Noodl~ Pack  United ~ NA   
##  9 573              2608 Nona Lim Hakata Ramen + Thai~ Pack  United ~ NA   
## 10 574              2607 Nona Lim Hakata Ramen + Miso~ Pack  United ~ NA   
## 11 575              2606 Nona Lim Pad See Ew Rice Noo~ Pack  United ~ NA   
## 12 576              2605 Nona Lim Pad See Ew Rice Noo~ Pack  United ~ NA   
## 13 633              2548 Ottogi   Plain Instant Noodl~ Pack  South K~ NA   
## 14 723              2458 Samyang~ Sari Ramen           Pack  South K~ NA   
## 15 1594             1587 Mi E-Zee Plain Noodles        Pack  Malaysia NA   
## 16 2753              428 Kamfen   E Menm Chicken       <NA>  China     3.75
## 17 3043              138 Unif     100 Furong Shrimp    <NA>  Taiwan    3
# Look at the rows on either side of the missing review_number (row 190)
df %>% slice(189:191)
## # A tibble: 3 x 7
##   rowname review_number brand     variety               style country stars
##   <chr>           <dbl> <chr>     <chr>                 <chr> <chr>   <dbl>
## 1 189              2992 Nissin M~ Cremosa Sabor Pizza   Pack  Brazil   3.5 
## 2 190                NA Big Bon   Chicken & Salsaa Sau~ Pack  Russia   3.25
## 3 191              2990 Nissin M~ Turma Da Monica Sabo~ Pack  Brazil   4.25
# Rows 189 and 191 hold 2992 and 2990, so row 190 is review 2991
df$review_number[190] <- 2991

# Fill missing style and stars within each country/brand group:
# style takes the group's most frequent value (mfv1), stars the group's mean.
# A group without any observed stars has no mean to offer, so its stars
# remain missing after this step.
df <- df %>%
  group_by(country, brand) %>%
  mutate(
    style = replace_na(style, mfv1(style, na.rm = TRUE)),
    stars = replace_na(stars, mean(stars, na.rm = TRUE))
  ) %>%
  ungroup()

# Re-count the NA values in each column, largest count first
sort(colSums(is.na(df)), decreasing = TRUE)
##         stars       rowname review_number         brand       variety 
##            11             0             0             0             0 
##         style       country 
##             0             0
# Some stars are still missing, so fall back to a coarser grouping:
# fill the remaining gaps with the mean rating of the whole country
df <- df %>%
  group_by(country) %>%
  mutate(stars = replace_na(stars, mean(stars, na.rm = TRUE))) %>%
  ungroup()

4.3 Change Data Types

# Convert every character column to a factor (this includes the `rowname`
# helper column, which was added as character)
df <- mutate_if(df, is.character, as.factor)

4.4 Data Cleaning

4.4.1 Country

This uses a country list to see if country names match up with known country names. If not, they are assumed to have typos or be incorrectly labeled (such as using city names).

Country data obtained from [https://data.humdata.org/dataset/countries-and-territories-beta]

# Load the reference sheet of countries; the first line of the export is
# skipped so that the second line supplies the column names
country <- read_csv(
  "https://docs.google.com/spreadsheets/d/1NjSI2LaS3SqbgYc0HdD8oIb7lofGtiHgoKKATCpwVdY/export?format=csv&gid=1088874596",
  skip = 1
)
country %>% head()
## # A tibble: 6 x 36
##   `#meta +id` `#country +code~ `#country +code~ `#country +code~
##         <dbl>            <dbl>            <dbl>            <dbl>
## 1           1              181               13                4
## 2           2              182               14              248
## 3           3              183               15                8
## 4           4              184               16               12
## 5           5              185               17               16
## 6           6              186               18               20
## # ... with 32 more variables: `#country +code +v_fts` <dbl>, X6 <chr>,
## #   X7 <chr>, `#country +code +v_iso2` <chr>, `#country +code
## #   +v_iso3` <chr>, X10 <chr>, X11 <chr>, `#country +name
## #   +preferred` <chr>, `#country +name +alt +v_m49` <chr>, `#country +name
## #   +alt +v_iso` <chr>, `#country +name +alt +v_unterm` <chr>, `#country
## #   +name +alt +v_fts` <chr>, `#country +name +alt
## #   +v_hrinfo_country` <chr>, `#country +name +short +v_reliefweb` <chr>,
## #   `#country +name +alt +v_reliefweb` <chr>, `#country +name +i_en +alt
## #   +v_unterm` <chr>, `#country +name +i_fr +alt +v_unterm` <chr>,
## #   `#country +name +i_es +alt +v_unterm` <chr>, `#country +name +i_ru
## #   +alt +v_unterm` <chr>, `#country +name +i_zh +alt +v_unterm` <chr>,
## #   `#country +name +i_ar +alt +v_unterm` <chr>, `#geo
## #   +admin_level` <dbl>, `#geo +lat` <dbl>, `#geo +lon` <dbl>, `#region
## #   +main +code` <dbl>, `#region +main +name +preferred` <chr>, `#region
## #   +sub +code` <dbl>, `#region +sub +name +preferred` <chr>, `#region
## #   +intermediate +code` <dbl>, `#region +intermediate +name
## #   +preferred` <chr>, `#country +regex` <chr>, X36 <lgl>
# Two columns carry slightly different spellings of the country names;
# stack both into a single `name` column and drop the duplicates
country <- tibble(
  name = c(
    country$`#country +name +i_en +alt +v_unterm`,
    country$`#country +name +preferred`
  )
) %>%
  distinct()

# Collect the country values in df that are absent from the reference names,
# so that they can be corrected
known_names <- country$name
country_fix <- df %>%
  distinct(country) %>%
  filter(!country %in% known_names)
country_fix
## # A tibble: 6 x 1
##   country   
##   <fct>     
## 1 Sarawak   
## 2 USA       
## 3 Phlippines
## 4 UK        
## 5 Holland   
## 6 Dubai
# Replace abbreviations, sub-national names and typos with the country name;
# every other value is kept as it is
df <- df %>%
  mutate(
    country = as.character(country),
    country = case_when(
      country == "USA" ~ "United States",
      country == "Dubai" ~ "United Arab Emirates",
      country == "Holland" ~ "Netherlands",
      country == "Sarawak" ~ "Malaysia",
      country == "UK" ~ "United Kingdom",
      country == "Phlippines" ~ "Philippines",
      TRUE ~ country
    ),
    country = as.factor(country)
  )

4.4.2 Brand

# Normalise brand names so that spelling variants collapse into one level
df <- df %>%
  mutate(
    # Replace each run of punctuation with a single space
    brand = str_replace_all(brand, "[:punct:]+", " "),
    # Restore possessives that the previous step split apart
    # ("Lee s" -> "Lee's"). The \\b anchor limits the match to a standalone
    # "s", so words that merely begin with "s" are left alone, and
    # str_replace_all() handles every occurrence rather than only the first.
    brand = str_replace_all(brand, " s\\b", "'s"),
    brand = str_to_title(brand),
    brand = str_squish(brand),
    # NOTE(review): stem_words() is given whole, possibly multi-word names
    # here - confirm this is intended rather than per-word stemming
    brand = stem_words(brand) %>% as.factor()
  )

4.4.3 Variety

# Normalise variety names so that spelling variants collapse into one level
df <- df %>%
  mutate(
    # Replace each run of punctuation with a single space
    variety = str_replace_all(variety, "[:punct:]+", " "),
    # Restore possessives that the previous step split apart
    # ("Lee s" -> "Lee's"). The \\b anchor limits the match to a standalone
    # "s", so words that merely begin with "s" are left alone, and
    # str_replace_all() handles every occurrence rather than only the first.
    variety = str_replace_all(variety, " s\\b", "'s"),
    variety = str_to_title(variety),
    variety = str_squish(variety),
    # NOTE(review): stem_words() is given whole, possibly multi-word names
    # here - confirm this is intended rather than per-word stemming
    variety = stem_words(variety) %>% as.factor()
  )

4.4.4 Style

# Number of observations per style, most common first
count(df, style, sort = TRUE)
## # A tibble: 8 x 2
##   style          n
##   <fct>      <int>
## 1 Pack        1833
## 2 Bowl         613
## 3 Cup          559
## 4 Tray         138
## 5 Box           32
## 6 Restaurant     3
## 7 Bar            1
## 8 Can            1
# Keep only the styles that occur more than 4 times
df <- df %>%
  add_count(style) %>%
  filter(n > 4) %>%
  select(-n)

4.5 Feature Engineering

There are a lot of countries that will be hard to visualize. A region column will be added.

# Map every country to its region.
# NOTE(review): what destination = "region" returns may depend on the
# installed countrycode version - confirm it still yields the regions
# expected below.
df <- df %>%
  mutate(
    region = as.factor(
      countrycode(country, origin = "country.name", destination = "region")
    )
  )

# Keep only the regions with more than 10 observations
df <- df %>%
  group_by(region) %>%
  filter(n() > 10) %>%
  ungroup()

# Observations per region, largest first
df %>% count(region, sort = TRUE)
## # A tibble: 10 x 2
##    region                        n
##    <fct>                     <int>
##  1 Eastern Asia               1581
##  2 South-Eastern Asia          857
##  3 Northern America            426
##  4 Northern Europe              77
##  5 Southern Asia                75
##  6 Western Europe               52
##  7 Central America              28
##  8 Australia and New Zealand    26
##  9 Eastern Europe               21
## 10 South America                18
# Map every country to its continent
df <- df %>%
  mutate(
    continent = as.factor(
      countrycode(country, origin = "country.name", destination = "continent")
    )
  )

# Observations per continent, largest first
df %>% count(continent, sort = TRUE)
## # A tibble: 4 x 2
##   continent     n
##   <fct>     <int>
## 1 Asia       2513
## 2 Americas    472
## 3 Europe      150
## 4 Oceania      26

5 Exploratory Data Analysis

5.1 Country Ratings

# Mean rating per country
df_country <- df %>%
  group_by(country) %>%
  summarize(stars = mean(stars)) %>%
  ungroup()

# Horizontal bar chart, countries ordered by their mean rating
df_country %>%
  mutate(country = fct_reorder(country, stars)) %>%
  ggplot(aes(x = country, y = stars)) +
  geom_col() +
  coord_flip()

5.2 Region Ratings

# Mean rating per region
df_region <- df %>%
  group_by(region) %>%
  summarize(stars = mean(stars)) %>%
  ungroup()

# Horizontal bar chart, regions ordered by their mean rating
df_region %>%
  mutate(region = fct_reorder(region, stars)) %>%
  ggplot(aes(x = region, y = stars)) +
  geom_col() +
  coord_flip()

5.3 Continent Ratings

# Mean rating per continent
df_continent <- df %>%
  group_by(continent) %>%
  summarize(stars = mean(stars)) %>%
  ungroup()

# Horizontal bar chart, continents ordered by their mean rating
df_continent %>%
  mutate(continent = fct_reorder(continent, stars)) %>%
  ggplot(aes(x = continent, y = stars)) +
  geom_col() +
  coord_flip()

5.4 Style Ratings

# Mean rating per style
df_style <- df %>%
  group_by(style) %>%
  summarize(stars = mean(stars)) %>%
  ungroup()

# Horizontal bar chart, styles ordered by their mean rating
df_style %>%
  mutate(style = fct_reorder(style, stars)) %>%
  ggplot(aes(x = style, y = stars)) +
  geom_col() +
  coord_flip()

6 Final Visualization

# Ridgeline plot of the rating distribution in each region.
# Regions are ordered with fct_reorder() on stars; the fill colour repeats
# the region, so the legend is hidden.
df %>%
  mutate(region = fct_reorder(region, stars)) %>%
  ggplot(aes(x = stars, y = region, fill = region)) +
  geom_density_ridges() +
  scale_fill_viridis(option = "D", discrete = TRUE) +
  scale_x_continuous(breaks = 0:5) +
  labs(title = "Ramen Ratings by Region", x = "Rating", y = "Region") +
  theme(legend.position = "none")