Tidy Tuesday is a weekly data project aimed at the R ecosystem. It emphasizes understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.
The data comes from The Ramen Rater. It includes ratings for different ramen brands, varieties, and styles by country.
# Make sure every package in `packages` is installed and attached.
#
# packages: character vector of package names.
#
# A package that is not yet available is installed together with its
# dependencies; every package is then attached with library().
# Returns `packages` invisibly.
libraries <- function(packages) {
  for (package in packages) {
    # requireNamespace() only tests availability: it does not attach the
    # package and, with quietly = TRUE, stays silent when it is missing.
    if (!requireNamespace(package, quietly = TRUE)) {
      install.packages(package, dependencies = TRUE)
    }
    # library() raises an error if the package still cannot be loaded, so a
    # failed installation surfaces here rather than later in the script.
    library(package, character.only = TRUE)
  }
  invisible(packages)
}
# Packages used by this analysis; libraries() installs any that are missing
# and attaches all of them.
packages <- c(
  "tidyverse", "naniar", "rvest", "textstem",
  "modeest", "countrycode", "ggridges", "viridis"
)
libraries(packages)

# Apply the classic theme to every chart.
theme_set(theme_classic())

# Read the ramen ratings CSV and show its structure.
df <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-04/ramen_ratings.csv"
)
glimpse(df)
## Observations: 3,180
## Variables: 6
## $ review_number <dbl> 3180, 3179, 3178, 3177, 3176, 3175, 3174, 3173, ...
## $ brand <chr> "Yum Yum", "Nagatanien", "Acecook", "Maison de C...
## $ variety <chr> "Tem Tem Tom Yum Moo Deng", "tom Yum Kung Rice V...
## $ style <chr> "Cup", "Pack", "Cup", "Cup", "Tray", "Cup", "Pac...
## $ country <chr> "Thailand", "Japan", "Japan", "France", "Japan",...
## $ stars <dbl> 3.75, 2.00, 2.50, 3.75, 5.00, 3.50, 3.75, 5.00, ...
# Preview the first rows of the data.
df %>% head()
## # A tibble: 6 x 6
## review_number brand variety style country stars
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 3180 Yum Yum Tem Tem Tom Yum Moo Deng Cup Thaila~ 3.75
## 2 3179 Nagatanien tom Yum Kung Rice Vermi~ Pack Japan 2
## 3 3178 Acecook Kelp Broth Shio Ramen Cup Japan 2.5
## 4 3177 Maison de Cor~ Ramen Gout Coco Poulet Cup France 3.75
## 5 3176 Maruchan Gotsumori Shio Yakisoba Tray Japan 5
## 6 3175 Myojo Chukazanmai Tantanmen Cup Japan 3.5
# Per-column summary statistics, including NA counts.
df %>% summary()
## review_number brand variety style
## Min. : 1.0 Length:3180 Length:3180 Length:3180
## 1st Qu.: 795.5 Class :character Class :character Class :character
## Median :1590.0 Mode :character Mode :character Mode :character
## Mean :1590.1
## 3rd Qu.:2384.5
## Max. :3180.0
## NA's :1
## country stars
## Length:3180 Min. :0.000
## Class :character 1st Qu.:3.250
## Mode :character Median :3.750
## Mean :3.688
## 3rd Qu.:4.500
## Max. :5.000
## NA's :14
# Number of distinct values in each column, smallest first.
sort(sapply(df, n_distinct))
## style stars country brand variety
## 9 40 44 456 2971
## review_number
## 3178
The visualization confirms what the initial analysis already showed, but it is still good to have a visual check.
# Plot the number of missing values in each column.
gg_miss_var(df) +
  labs(title = "Missing Values")

# Tabulate the NA count of each column, largest first.
sort(colSums(is.na(df)), decreasing = TRUE)
## stars style review_number brand variety
## 14 2 1 0 0
## country
## 0
# Store the row number in a `rowname` column so flagged observations are
# easy to locate.
df <- rownames_to_column(df)

# Show every row in which at least one column is NA or holds the text "NA".
df %>%
  filter_all(any_vars(is.na(.) | . == "NA"))
## # A tibble: 17 x 7
## rowname review_number brand variety style country stars
## <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 31 3150 Hakubaku Baby Somen Pack Japan NA
## 2 32 3149 Hakubaku Baby Udon Pack Japan NA
## 3 190 NA Big Bon Chicken & Salsaa Sa~ Pack Russia 3.25
## 4 540 2641 Nanoblo~ Ramen Bokki Pack Japan NA
## 5 569 2612 Nona Lim Laksa Rice Noodles ~ Pack United ~ NA
## 6 570 2611 Nona Lim Tokyo Ramen + Miso ~ Pack United ~ NA
## 7 571 2610 Nona Lim Whole Wheat Ramen +~ Pack United ~ NA
## 8 572 2609 Nona Lim Pad Thai Rice Noodl~ Pack United ~ NA
## 9 573 2608 Nona Lim Hakata Ramen + Thai~ Pack United ~ NA
## 10 574 2607 Nona Lim Hakata Ramen + Miso~ Pack United ~ NA
## 11 575 2606 Nona Lim Pad See Ew Rice Noo~ Pack United ~ NA
## 12 576 2605 Nona Lim Pad See Ew Rice Noo~ Pack United ~ NA
## 13 633 2548 Ottogi Plain Instant Noodl~ Pack South K~ NA
## 14 723 2458 Samyang~ Sari Ramen Pack South K~ NA
## 15 1594 1587 Mi E-Zee Plain Noodles Pack Malaysia NA
## 16 2753 428 Kamfen E Menm Chicken <NA> China 3.75
## 17 3043 138 Unif 100 Furong Shrimp <NA> Taiwan 3
# Look at the row with the missing review_number (row 190) together with the
# rows directly before and after it.
df %>% slice(189:191)
## # A tibble: 3 x 7
## rowname review_number brand variety style country stars
## <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 189 2992 Nissin M~ Cremosa Sabor Pizza Pack Brazil 3.5
## 2 190 NA Big Bon Chicken & Salsaa Sau~ Pack Russia 3.25
## 3 191 2990 Nissin M~ Turma Da Monica Sabo~ Pack Brazil 4.25
# The neighbouring rows carry review numbers 2992 and 2990, so the missing
# review_number in row 190 is filled with 2991. The column is addressed by
# name so the fix does not depend on the column order.
df[190, "review_number"] <- 2991

# Impute within each country/brand group: a missing style becomes the group's
# most frequent style, a missing rating becomes the group's mean rating.
df <- df %>%
  group_by(country, brand) %>%
  mutate(
    style = replace_na(style, mfv1(style, na.rm = TRUE)),
    stars = replace_na(stars, mean(stars, na.rm = TRUE))
  ) %>%
  ungroup()

# Count the NA values per column again to see what is still missing.
df %>% is.na() %>% colSums() %>% sort(decreasing = TRUE)
## stars rowname review_number brand variety
## 11 0 0 0 0
## style country
## 0 0
# Some ratings are still missing after the country/brand imputation, so fall
# back to coarser grouping: fill them with the mean rating of the country.
df <- df %>%
  group_by(country) %>%
  mutate(stars = replace_na(stars, mean(stars, na.rm = TRUE))) %>%
  ungroup()

# Convert every character column to a factor.
df <- df %>% mutate_if(is.character, as.factor)
The country names are checked against a reference list of known country names. Names that do not match are assumed to contain typos or to be mislabeled (for example, a city name used in place of a country).
Country data obtained from <https://data.humdata.org/dataset/countries-and-territories-beta>.
# Import the reference table of countries, skipping the first line of the
# sheet, and preview it.
country <- read_csv(
  "https://docs.google.com/spreadsheets/d/1NjSI2LaS3SqbgYc0HdD8oIb7lofGtiHgoKKATCpwVdY/export?format=csv&gid=1088874596",
  skip = 1
)
country %>% head()
## # A tibble: 6 x 36
## `#meta +id` `#country +code~ `#country +code~ `#country +code~
## <dbl> <dbl> <dbl> <dbl>
## 1 1 181 13 4
## 2 2 182 14 248
## 3 3 183 15 8
## 4 4 184 16 12
## 5 5 185 17 16
## 6 6 186 18 20
## # ... with 32 more variables: `#country +code +v_fts` <dbl>, X6 <chr>,
## # X7 <chr>, `#country +code +v_iso2` <chr>, `#country +code
## # +v_iso3` <chr>, X10 <chr>, X11 <chr>, `#country +name
## # +preferred` <chr>, `#country +name +alt +v_m49` <chr>, `#country +name
## # +alt +v_iso` <chr>, `#country +name +alt +v_unterm` <chr>, `#country
## # +name +alt +v_fts` <chr>, `#country +name +alt
## # +v_hrinfo_country` <chr>, `#country +name +short +v_reliefweb` <chr>,
## # `#country +name +alt +v_reliefweb` <chr>, `#country +name +i_en +alt
## # +v_unterm` <chr>, `#country +name +i_fr +alt +v_unterm` <chr>,
## # `#country +name +i_es +alt +v_unterm` <chr>, `#country +name +i_ru
## # +alt +v_unterm` <chr>, `#country +name +i_zh +alt +v_unterm` <chr>,
## # `#country +name +i_ar +alt +v_unterm` <chr>, `#geo
## # +admin_level` <dbl>, `#geo +lat` <dbl>, `#geo +lon` <dbl>, `#region
## # +main +code` <dbl>, `#region +main +name +preferred` <chr>, `#region
## # +sub +code` <dbl>, `#region +sub +name +preferred` <chr>, `#region
## # +intermediate +code` <dbl>, `#region +intermediate +name
## # +preferred` <chr>, `#country +regex` <chr>, X36 <lgl>
# Two columns hold slightly different versions of the country names; stack
# them into a single `name` column and drop the duplicates.
country <- country %>%
  gather(
    column, name,
    c(`#country +name +i_en +alt +v_unterm`, `#country +name +preferred`)
  ) %>%
  select(name) %>%
  distinct()

# Collect the distinct country values in the ramen data that do not appear in
# the reference names, so they can be corrected.
country_fix <- distinct(select(df[!(df$country %in% country$name), ], country))
country_fix
## # A tibble: 6 x 1
## country
## <fct>
## 1 Sarawak
## 2 USA
## 3 Phlippines
## 4 UK
## 5 Holland
## 6 Dubai
# Replace the unmatched country values with proper country names; every other
# value is kept as it is. The column is converted to character for the
# comparison and back to a factor afterwards.
df <- df %>%
  mutate(
    country = as.character(country),
    country = case_when(
      country == "USA" ~ "United States",
      country == "Dubai" ~ "United Arab Emirates",
      country == "Holland" ~ "Netherlands",
      country == "Sarawak" ~ "Malaysia",
      country == "UK" ~ "United Kingdom",
      country == "Phlippines" ~ "Philippines",
      TRUE ~ country
    ),
    country = as.factor(country)
  )
# Normalise a free-text label (used for both brand and variety):
#   1. replace every run of punctuation with a single space,
#   2. restore possessives: a lone "s" left behind by a removed apostrophe
#      ("Mama s") becomes "'s" again,
#   3. convert to title case and collapse repeated whitespace,
#   4. pass the result through stem_words() and return it as a factor.
clean_label <- function(x) {
  x <- str_replace_all(x, "[:punct:]+", " ")
  # The word boundary leaves words that merely start with "s" untouched
  # (plain " s" would also turn "with sesame" into "with'sesame"), and
  # str_replace_all() repairs every possessive in a label, not only the first.
  x <- str_replace_all(x, " s\\b", "'s")
  x <- str_squish(str_to_title(x))
  # NOTE(review): stem_words() is kept from the original code; confirm that it
  # is meant to receive whole multi-word labels rather than single words.
  as.factor(stem_words(x))
}

# Apply the same cleaning to the brand and variety columns.
df <- df %>%
  mutate(
    brand = clean_label(brand),
    variety = clean_label(variety)
  )
# Number of reviews per style, most frequent first.
df %>% count(style, sort = TRUE)
## # A tibble: 8 x 2
## style n
## <fct> <int>
## 1 Pack 1833
## 2 Bowl 613
## 3 Cup 559
## 4 Tray 138
## 5 Box 32
## 6 Restaurant 3
## 7 Bar 1
## 8 Can 1
# Keep only the styles that occur more than four times.
df <- df %>%
  group_by(style) %>%
  filter(n() > 4) %>%
  ungroup()
There are a lot of countries that will be hard to visualize. A region column will be added.
# Translate each country name into its region and store it as a factor.
df$region <- as.factor(
  countrycode(df$country, origin = "country.name", destination = "region")
)

# Keep only the regions with more than ten reviews, then count the reviews
# per region, largest first.
df <- df %>%
  group_by(region) %>%
  filter(n() > 10) %>%
  ungroup()
df %>% count(region, sort = TRUE)
## # A tibble: 10 x 2
## region n
## <fct> <int>
## 1 Eastern Asia 1581
## 2 South-Eastern Asia 857
## 3 Northern America 426
## 4 Northern Europe 77
## 5 Southern Asia 75
## 6 Western Europe 52
## 7 Central America 28
## 8 Australia and New Zealand 26
## 9 Eastern Europe 21
## 10 South America 18
# Translate each country name into its continent and store it as a factor,
# then count the reviews per continent, largest first.
df$continent <- as.factor(
  countrycode(df$country, origin = "country.name", destination = "continent")
)
df %>% count(continent, sort = TRUE)
## # A tibble: 4 x 2
## continent n
## <fct> <int>
## 1 Asia 2513
## 2 Americas 472
## 3 Europe 150
## 4 Oceania 26
# Mean rating per country, drawn as horizontal bars ordered by the mean.
df_country <- df %>%
  group_by(country) %>%
  summarize(stars = mean(stars)) %>%
  ungroup()
df_country %>%
  mutate(country = fct_reorder(country, stars)) %>%
  ggplot(aes(x = country, y = stars)) +
  geom_bar(stat = "identity") +
  coord_flip()

# Mean rating per region.
df_region <- df %>%
  group_by(region) %>%
  summarize(stars = mean(stars)) %>%
  ungroup()
df_region %>%
  mutate(region = fct_reorder(region, stars)) %>%
  ggplot(aes(x = region, y = stars)) +
  geom_bar(stat = "identity") +
  coord_flip()

# Mean rating per continent.
df_continent <- df %>%
  group_by(continent) %>%
  summarize(stars = mean(stars)) %>%
  ungroup()
df_continent %>%
  mutate(continent = fct_reorder(continent, stars)) %>%
  ggplot(aes(x = continent, y = stars)) +
  geom_bar(stat = "identity") +
  coord_flip()

# Mean rating per style.
df_style <- df %>%
  group_by(style) %>%
  summarize(stars = mean(stars)) %>%
  ungroup()
df_style %>%
  mutate(style = fct_reorder(style, stars)) %>%
  ggplot(aes(x = style, y = stars)) +
  geom_bar(stat = "identity") +
  coord_flip()
# Ridgeline plot of the rating distribution in each region. Regions are
# ordered by fct_reorder() on stars, filled with the viridis "D" palette, and
# the legend is hidden.
df %>%
  mutate(region = fct_reorder(region, stars)) %>%
  ggplot(aes(x = stars, y = region, fill = region)) +
  geom_density_ridges() +
  scale_fill_viridis(option = "D", discrete = TRUE) +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = 0:5) +
  labs(
    title = "Ramen Ratings by Region",
    x = "Rating",
    y = "Region"
  )