1 Project Description

Tidy Tuesday has a weekly data project aimed at the R ecosystem. An emphasis is placed on understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.

2 Dataset

Data comes from both data.world and Wikipedia. The data includes information about each country such as total score, year, and whether they won, lost, or tied. There is also information about individual squads such as player name, age, position played, number of games played, and number of goals scored.

3 Setup

3.1 Load Libraries

if (!require("pacman")) install.packages("pacman")
pacman::p_load("data.table","tidyverse","visdat")

theme_set(theme_classic())

3.2 Import Data

df_wwc_outcomes <- fread("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/wwc_outcomes.csv")
df_squads <- fread("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/squads.csv")
df_codes <- fread("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/codes.csv")

df_wwc_outcomes <- left_join(df_wwc_outcomes, df_codes, by = "team")

4 Exploratory Data Analysis

  • There are 3 different data frames.

  • df_codes has 212 observations and 2 variables. This is just to get full country names in df_wwc_outcomes
  • df_squads has 552 observations and 9 variables.
  • df_wwc_outcomes has 568 observations and 7 variables.

4.1 View Data

#df_squads
glimpse(df_squads)
## Observations: 552
## Variables: 9
## $ squad_no <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ country  <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US",...
## $ pos      <chr> "GK", "FW", "MF", "DF", "DF", "MF", "DF", "MF", "MF",...
## $ player   <chr> "Alyssa Naeher", "Mallory Pugh", "Sam Mewis", "Becky ...
## $ dob      <chr> "1988-04-20T00:00:00Z", "1998-04-29T00:00:00Z", "1992...
## $ age      <int> 31, 21, 26, 34, 30, 26, 26, 27, 25, 36, 34, 20, 29, 2...
## $ caps     <int> 43, 50, 47, 155, 115, 82, 37, 79, 66, 271, 99, 19, 16...
## $ goals    <int> 0, 15, 9, 0, 2, 6, 0, 18, 8, 107, 1, 1, 101, 0, 44, 6...
## $ club     <chr> "Chicago Red Stars", "Washington Spirit", "North Caro...
head(df_squads)
##    squad_no country pos           player                  dob age caps
## 1:        1      US  GK    Alyssa Naeher 1988-04-20T00:00:00Z  31   43
## 2:        2      US  FW     Mallory Pugh 1998-04-29T00:00:00Z  21   50
## 3:        3      US  MF        Sam Mewis 1992-10-09T00:00:00Z  26   47
## 4:        4      US  DF Becky Sauerbrunn 1985-06-06T00:00:00Z  34  155
## 5:        5      US  DF    Kelley O'Hara 1988-08-04T00:00:00Z  30  115
## 6:        6      US  MF     Morgan Brian 1993-02-26T00:00:00Z  26   82
##    goals                   club
## 1:     0      Chicago Red Stars
## 2:    15      Washington Spirit
## 3:     9 North Carolina Courage
## 4:     0            Utah Royals
## 5:     2            Utah Royals
## 6:     6      Chicago Red Stars
summary(df_squads)
##     squad_no    country              pos               player         
##  Min.   : 1   Length:552         Length:552         Length:552        
##  1st Qu.: 6   Class :character   Class :character   Class :character  
##  Median :12   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :12                                                           
##  3rd Qu.:18                                                           
##  Max.   :23                                                           
##                                                                       
##      dob                 age             caps            goals        
##  Length:552         Min.   :16.00   Min.   :  0.00   Min.   :  0.000  
##  Class :character   1st Qu.:23.00   1st Qu.: 11.75   1st Qu.:  0.000  
##  Mode  :character   Median :26.00   Median : 29.50   Median :  1.500  
##                     Mean   :26.05   Mean   : 43.66   Mean   :  7.348  
##                     3rd Qu.:29.00   3rd Qu.: 62.00   3rd Qu.:  8.250  
##                     Max.   :41.00   Max.   :282.00   Max.   :181.000  
##                                     NA's   :32       NA's   :32       
##      club          
##  Length:552        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
sapply(df_squads, function(x) n_distinct(x))
## squad_no  country      pos   player      dob      age     caps    goals 
##       23       24        4      552      520       24      141       50 
##     club 
##      188
#df_wwc_outcomes
glimpse(df_wwc_outcomes)
## Observations: 568
## Variables: 8
## $ year           <int> 1991, 1991, 1991, 1991, 1991, 1991, 1991, 1991,...
## $ team           <chr> "CHN", "NOR", "DEN", "NZL", "JPN", "BRA", "GER"...
## $ score          <int> 4, 0, 3, 0, 0, 1, 4, 0, 2, 3, 0, 5, 4, 0, 2, 2,...
## $ round          <chr> "Group", "Group", "Group", "Group", "Group", "G...
## $ yearly_game_id <int> 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,...
## $ team_num       <int> 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,...
## $ win_status     <chr> "Won", "Lost", "Won", "Lost", "Lost", "Won", "W...
## $ country        <chr> "China PR", "Norway", "Denmark", "New Zealand",...
head(df_wwc_outcomes)
##   year team score round yearly_game_id team_num win_status     country
## 1 1991  CHN     4 Group              1        1        Won    China PR
## 2 1991  NOR     0 Group              1        2       Lost      Norway
## 3 1991  DEN     3 Group              2        1        Won     Denmark
## 4 1991  NZL     0 Group              2        2       Lost New Zealand
## 5 1991  JPN     0 Group              3        1       Lost       Japan
## 6 1991  BRA     1 Group              3        2        Won      Brazil
summary(df_wwc_outcomes)
##       year          team               score           round          
##  Min.   :1991   Length:568         Min.   : 0.000   Length:568        
##  1st Qu.:1999   Class :character   1st Qu.: 0.000   Class :character  
##  Median :2007   Mode  :character   Median : 1.000   Mode  :character  
##  Mean   :2007                      Mean   : 1.614                     
##  3rd Qu.:2015                      3rd Qu.: 2.000                     
##  Max.   :2019                      Max.   :13.000                     
##  yearly_game_id     team_num    win_status          country         
##  Min.   : 1.00   Min.   :1.0   Length:568         Length:568        
##  1st Qu.: 9.00   1st Qu.:1.0   Class :character   Class :character  
##  Median :18.00   Median :1.5   Mode  :character   Mode  :character  
##  Mean   :19.61   Mean   :1.5                                        
##  3rd Qu.:27.00   3rd Qu.:2.0                                        
##  Max.   :52.00   Max.   :2.0
sapply(df_wwc_outcomes, function(x) n_distinct(x))
##           year           team          score          round yearly_game_id 
##              8             36             12              6             52 
##       team_num     win_status        country 
##              2              3             36

4.2 Missing Values

Only df_squads has missing values- with caps and goals

#Visualize missing values
vis_miss(df_squads, sort_miss=TRUE)

vis_miss(df_wwc_outcomes, sort_miss=TRUE)

#see count of missing values
na_values <- function(df){
  na <- colSums(is.na(df)) %>% sort(decreasing=TRUE)
  na[na>0]
}

na_values(df_squads)
##  caps goals 
##    32    32

4.3 Data Wrangling

  • For missing caps values, a 1 was supplied since it is assumed that player was present in the recent World Cup games.
  • For missing goals values, 0 was supplied since it is assumed that if any goals were scored, they would have been recorded.
df_squads <- df_squads %>% mutate(caps = replace_na(caps, 1),
                                  goals = replace_na(goals, 0))

df_wwc_outcomes <- df_wwc_outcomes %>% mutate(country = str_replace(country, "Ivory Coast.*", "Ivory Coast"))

# see total number of games by country
df_games <- df_wwc_outcomes %>% group_by(country) %>% summarize(games = n())

# see total number of wins and losses by country
df_wins <- df_wwc_outcomes %>% filter(win_status == "Won") %>% group_by(country) %>% summarize(wins = n())
df_loss <- df_wwc_outcomes %>% filter(win_status == "Lost") %>% group_by(country) %>% summarize(loss = n())

# get percentage of games won
df_games_won <- inner_join(df_games, df_wins) %>% mutate(percent_won = (wins/games)*100)

# get net wins/losses by country
df_games <- left_join(df_games, df_wins) %>% mutate(wins = replace_na(wins, 0))
df_games <- left_join(df_games, df_loss) %>% mutate(loss = replace_na(loss, 0),
                                                    net = wins-loss)

# capitalize country to be more consistent with FIFA France 2019 theme
df_games <- df_games %>% mutate(country = str_to_upper(country))

4.4 Visualizations

#Win status per country
df_wwc_outcomes %>% ggplot(aes(country))+
  geom_bar()+
  facet_wrap(~win_status)+
  coord_flip()+
  labs(title="Game Status Per Country", y="number of games")

#Win percentage per country
df_games_won %>% mutate(country = fct_reorder(country,percent_won)) %>% ggplot(aes(country, percent_won))+
  geom_col()+
  coord_flip()+
  labs(title="Percentage of Games Won Per Country", y="percent")

#ages of players
mean <- mean(df_squads$age)
median <- median(df_squads$age)

h <- hist(df_squads$age, breaks = "FD", plot = FALSE) #histogram with Freedman-Diaconis rule for binwidth

df_squads %>% ggplot(aes(age))+
  geom_histogram(aes(y = ..density..), breaks = h$breaks, alpha = 0.5, col = "white")+
  geom_vline(xintercept=mean, color="red", size=2)+
  geom_vline(xintercept=median, color="blue", size=1.5)+
  labs(title="Distribution of Players' Ages", subtitle="mean: red, median: blue")

5 Final Visualization

Branding information was found here: - Colors - Font

# load custom fonts
windowsFonts(Elegance = windowsFont("Elegance"))
windowsFonts(OpenSans = windowsFont("Open Sans"))

df_games %>% mutate(country = fct_reorder(country,net)) %>% 
  ggplot(aes(country, net, fill=net < 0))+
    geom_col(width=0.95, color="white", size=0.3)+
    coord_flip()+
    scale_fill_manual(name = "net < 0", values = setNames(c("#D6000A","#F08C01"), c(T,F)))+
    scale_y_continuous(expand=c(0,0), limits=c(-37,37), breaks=c(-30,-20,-10,0,10,20,30))+
    labs(title="Women's World Cup - Net Wins Per Country", subtitle="From 1991 - 2019", y="NET WINS / LOSSES", fill="Net Game Status")+
    theme(plot.background = element_rect(fill="#23207C"),
          panel.background = element_rect(fill="#23207C"),
          legend.position="none",
          axis.line = element_line(color="white"),
          axis.text = element_text(color="white", family="OpenSans"),
          axis.text.y = element_text(size=7.5),
          axis.ticks = element_line(color="white"),
          axis.title = element_text(family="OpenSans", face="bold", color="white"),
          plot.title = element_text(hjust=0.5, family="Elegance", face="bold", size=16, color="#00B5ED"),
          plot.subtitle = element_text(hjust=0.5, family="Elegance", face="bold", size=13, color="#FDDB00"),
          axis.title.y = element_blank(),
          axis.title.x = element_text(size=8))

ggsave("womens-world-cup.png")