Tidy Tuesday has a weekly data project aimed at the R ecosystem. An emphasis is placed on understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.
Data comes from both data.world and Wikipedia. The data includes information about each country such as total score, year, and whether they won, lost, or tied. There is also information about individual squads such as player name, age, position played, number of games played, and number of goals scored.
if (!require("pacman")) install.packages("pacman")
pacman::p_load("data.table","tidyverse","visdat")
theme_set(theme_classic())
df_wwc_outcomes <- fread("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/wwc_outcomes.csv")
df_squads <- fread("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/squads.csv")
df_codes <- fread("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/codes.csv")
df_wwc_outcomes <- left_join(df_wwc_outcomes, df_codes, by = "team")
There are 3 different data frames.
df_wwc_outcomes has 568 observations and 7 variables.
#df_squads
glimpse(df_squads)
## Observations: 552
## Variables: 9
## $ squad_no <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ country <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US",...
## $ pos <chr> "GK", "FW", "MF", "DF", "DF", "MF", "DF", "MF", "MF",...
## $ player <chr> "Alyssa Naeher", "Mallory Pugh", "Sam Mewis", "Becky ...
## $ dob <chr> "1988-04-20T00:00:00Z", "1998-04-29T00:00:00Z", "1992...
## $ age <int> 31, 21, 26, 34, 30, 26, 26, 27, 25, 36, 34, 20, 29, 2...
## $ caps <int> 43, 50, 47, 155, 115, 82, 37, 79, 66, 271, 99, 19, 16...
## $ goals <int> 0, 15, 9, 0, 2, 6, 0, 18, 8, 107, 1, 1, 101, 0, 44, 6...
## $ club <chr> "Chicago Red Stars", "Washington Spirit", "North Caro...
head(df_squads)
## squad_no country pos player dob age caps
## 1: 1 US GK Alyssa Naeher 1988-04-20T00:00:00Z 31 43
## 2: 2 US FW Mallory Pugh 1998-04-29T00:00:00Z 21 50
## 3: 3 US MF Sam Mewis 1992-10-09T00:00:00Z 26 47
## 4: 4 US DF Becky Sauerbrunn 1985-06-06T00:00:00Z 34 155
## 5: 5 US DF Kelley O'Hara 1988-08-04T00:00:00Z 30 115
## 6: 6 US MF Morgan Brian 1993-02-26T00:00:00Z 26 82
## goals club
## 1: 0 Chicago Red Stars
## 2: 15 Washington Spirit
## 3: 9 North Carolina Courage
## 4: 0 Utah Royals
## 5: 2 Utah Royals
## 6: 6 Chicago Red Stars
summary(df_squads)
## squad_no country pos player
## Min. : 1 Length:552 Length:552 Length:552
## 1st Qu.: 6 Class :character Class :character Class :character
## Median :12 Mode :character Mode :character Mode :character
## Mean :12
## 3rd Qu.:18
## Max. :23
##
## dob age caps goals
## Length:552 Min. :16.00 Min. : 0.00 Min. : 0.000
## Class :character 1st Qu.:23.00 1st Qu.: 11.75 1st Qu.: 0.000
## Mode :character Median :26.00 Median : 29.50 Median : 1.500
## Mean :26.05 Mean : 43.66 Mean : 7.348
## 3rd Qu.:29.00 3rd Qu.: 62.00 3rd Qu.: 8.250
## Max. :41.00 Max. :282.00 Max. :181.000
## NA's :32 NA's :32
## club
## Length:552
## Class :character
## Mode :character
##
##
##
##
sapply(df_squads, function(x) n_distinct(x))
## squad_no country pos player dob age caps goals
## 23 24 4 552 520 24 141 50
## club
## 188
#df_wwc_outcomes
glimpse(df_wwc_outcomes)
## Observations: 568
## Variables: 8
## $ year <int> 1991, 1991, 1991, 1991, 1991, 1991, 1991, 1991,...
## $ team <chr> "CHN", "NOR", "DEN", "NZL", "JPN", "BRA", "GER"...
## $ score <int> 4, 0, 3, 0, 0, 1, 4, 0, 2, 3, 0, 5, 4, 0, 2, 2,...
## $ round <chr> "Group", "Group", "Group", "Group", "Group", "G...
## $ yearly_game_id <int> 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,...
## $ team_num <int> 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,...
## $ win_status <chr> "Won", "Lost", "Won", "Lost", "Lost", "Won", "W...
## $ country <chr> "China PR", "Norway", "Denmark", "New Zealand",...
head(df_wwc_outcomes)
## year team score round yearly_game_id team_num win_status country
## 1 1991 CHN 4 Group 1 1 Won China PR
## 2 1991 NOR 0 Group 1 2 Lost Norway
## 3 1991 DEN 3 Group 2 1 Won Denmark
## 4 1991 NZL 0 Group 2 2 Lost New Zealand
## 5 1991 JPN 0 Group 3 1 Lost Japan
## 6 1991 BRA 1 Group 3 2 Won Brazil
summary(df_wwc_outcomes)
## year team score round
## Min. :1991 Length:568 Min. : 0.000 Length:568
## 1st Qu.:1999 Class :character 1st Qu.: 0.000 Class :character
## Median :2007 Mode :character Median : 1.000 Mode :character
## Mean :2007 Mean : 1.614
## 3rd Qu.:2015 3rd Qu.: 2.000
## Max. :2019 Max. :13.000
## yearly_game_id team_num win_status country
## Min. : 1.00 Min. :1.0 Length:568 Length:568
## 1st Qu.: 9.00 1st Qu.:1.0 Class :character Class :character
## Median :18.00 Median :1.5 Mode :character Mode :character
## Mean :19.61 Mean :1.5
## 3rd Qu.:27.00 3rd Qu.:2.0
## Max. :52.00 Max. :2.0
sapply(df_wwc_outcomes, function(x) n_distinct(x))
## year team score round yearly_game_id
## 8 36 12 6 52
## team_num win_status country
## 2 3 36
Only df_squads has missing values- with caps and goals
#Visualize missing values
vis_miss(df_squads, sort_miss=TRUE)
vis_miss(df_wwc_outcomes, sort_miss=TRUE)
#see count of missing values
na_values <- function(df){
na <- colSums(is.na(df)) %>% sort(decreasing=TRUE)
na[na>0]
}
na_values(df_squads)
## caps goals
## 32 32
df_squads <- df_squads %>% mutate(caps = replace_na(caps, 1),
goals = replace_na(goals, 0))
df_wwc_outcomes <- df_wwc_outcomes %>% mutate(country = str_replace(country, "Ivory Coast.*", "Ivory Coast"))
# see total number of games by country
df_games <- df_wwc_outcomes %>% group_by(country) %>% summarize(games = n())
# see total number of wins and losses by country
df_wins <- df_wwc_outcomes %>% filter(win_status == "Won") %>% group_by(country) %>% summarize(wins = n())
df_loss <- df_wwc_outcomes %>% filter(win_status == "Lost") %>% group_by(country) %>% summarize(loss = n())
# get percentage of games won
df_games_won <- inner_join(df_games, df_wins) %>% mutate(percent_won = (wins/games)*100)
# get net wins/losses by country
df_games <- left_join(df_games, df_wins) %>% mutate(wins = replace_na(wins, 0))
df_games <- left_join(df_games, df_loss) %>% mutate(loss = replace_na(loss, 0),
net = wins-loss)
# capitalize country to be more consistent with FIFA France 2019 theme
df_games <- df_games %>% mutate(country = str_to_upper(country))
#Win status per country
df_wwc_outcomes %>% ggplot(aes(country))+
geom_bar()+
facet_wrap(~win_status)+
coord_flip()+
labs(title="Game Status Per Country", y="number of games")
#Win percentage per country
df_games_won %>% mutate(country = fct_reorder(country,percent_won)) %>% ggplot(aes(country, percent_won))+
geom_col()+
coord_flip()+
labs(title="Percentage of Games Won Per Country", y="percent")
#ages of players
mean <- mean(df_squads$age)
median <- median(df_squads$age)
h <- hist(df_squads$age, breaks = "FD", plot = FALSE) #histogram with Freedman-Diaconis rule for binwidth
df_squads %>% ggplot(aes(age))+
geom_histogram(aes(y = ..density..), breaks = h$breaks, alpha = 0.5, col = "white")+
geom_vline(xintercept=mean, color="red", size=2)+
geom_vline(xintercept=median, color="blue", size=1.5)+
labs(title="Distribution of Players' Ages", subtitle="mean: red, median: blue")
Branding information was found here: - Colors - Font
# load custom fonts
windowsFonts(Elegance = windowsFont("Elegance"))
windowsFonts(OpenSans = windowsFont("Open Sans"))
df_games %>% mutate(country = fct_reorder(country,net)) %>%
ggplot(aes(country, net, fill=net < 0))+
geom_col(width=0.95, color="white", size=0.3)+
coord_flip()+
scale_fill_manual(name = "net < 0", values = setNames(c("#D6000A","#F08C01"), c(T,F)))+
scale_y_continuous(expand=c(0,0), limits=c(-37,37), breaks=c(-30,-20,-10,0,10,20,30))+
labs(title="Women's World Cup - Net Wins Per Country", subtitle="From 1991 - 2019", y="NET WINS / LOSSES", fill="Net Game Status")+
theme(plot.background = element_rect(fill="#23207C"),
panel.background = element_rect(fill="#23207C"),
legend.position="none",
axis.line = element_line(color="white"),
axis.text = element_text(color="white", family="OpenSans"),
axis.text.y = element_text(size=7.5),
axis.ticks = element_line(color="white"),
axis.title = element_text(family="OpenSans", face="bold", color="white"),
plot.title = element_text(hjust=0.5, family="Elegance", face="bold", size=16, color="#00B5ED"),
plot.subtitle = element_text(hjust=0.5, family="Elegance", face="bold", size=13, color="#FDDB00"),
axis.title.y = element_blank(),
axis.title.x = element_text(size=8))
ggsave("womens-world-cup.png")