Tidy Tuesday is a weekly data project aimed at the R ecosystem. It emphasizes understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.
Data came from Wikipedia. The data includes information such as franchise name, revenue generated, year created, and owners/creators of the franchise.
# Install pacman on first use. requireNamespace() only checks that the package
# is available, so nothing gets attached as a side effect of the test; p_load()
# is called fully qualified below and does not need pacman on the search path.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# Attach (installing when missing) the packages this analysis relies on.
pacman::p_load("tidyverse", "visdat", "extrafont", "ggpubr", "png")

# Make theme_classic() the default theme for every plot drawn below.
theme_set(theme_classic())

# Tidy Tuesday media franchise data set (2019-07-02).
df <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-02/media_franchises.csv")

# Column names, types and leading values.
glimpse(df)
## Observations: 321
## Variables: 7
## $ franchise <chr> "A Song of Ice and Fire / Game of Thrones", ...
## $ revenue_category <chr> "Book sales", "Box Office", "Home Video/Enter...
## $ revenue <dbl> 0.900, 0.001, 0.280, 4.000, 0.132, 0.760, 1.0...
## $ year_created <dbl> 1996, 1996, 1996, 1996, 1996, 1992, 1992, 199...
## $ original_media <chr> "Novel", "Novel", "Novel", "Novel", "Novel", ...
## $ creators <chr> "George R. R. Martin", "George R. R. Martin",...
## $ owners <chr> "Random House WarnerMedia (AT&T)", "Random Ho...
# Preview the first rows of the data.
df %>% head()
## # A tibble: 6 x 7
## franchise revenue_category revenue year_created original_media creators
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 A Song o~ Book sales 0.9 1996 Novel George ~
## 2 A Song o~ Box Office 0.001 1996 Novel George ~
## 3 A Song o~ Home Video/Ente~ 0.28 1996 Novel George ~
## 4 A Song o~ TV 4 1996 Novel George ~
## 5 A Song o~ Video Games/Gam~ 0.132 1996 Novel George ~
## 6 Aladdin Box Office 0.76 1992 Animated film Walt Di~
## # ... with 1 more variable: owners <chr>
# Per-column summary statistics.
df %>% summary()
## franchise revenue_category revenue year_created
## Length:321 Length:321 Min. : 0.001 Min. :1924
## Class :character Class :character 1st Qu.: 0.211 1st Qu.:1977
## Mode :character Mode :character Median : 1.000 Median :1991
## Mean : 4.742 Mean :1986
## 3rd Qu.: 5.000 3rd Qu.:1999
## Max. :80.000 Max. :2013
## original_media creators owners
## Length:321 Length:321 Length:321
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
# Number of distinct values in each column, smallest first. vapply() pins the
# result to exactly one integer per column, whereas sapply() chooses its
# return type from whatever the input happens to produce.
vapply(df, n_distinct, integer(1)) %>% sort()
## revenue_category original_media year_created owners
## 8 18 52 78
## creators franchise revenue
## 101 103 174
# Rows per revenue category, most frequent first.
count(df, revenue_category, sort = TRUE)
## # A tibble: 8 x 2
## revenue_category n
## <chr> <int>
## 1 Box Office 76
## 2 Merchandise, Licensing & Retail 73
## 3 Home Video/Entertainment 64
## 4 Video Games/Games 51
## 5 Comic or Manga 27
## 6 Music 12
## 7 TV 11
## 8 Book sales 7
# Rows per original media type, most frequent first.
count(df, original_media, sort = TRUE)
## # A tibble: 18 x 2
## original_media n
## <chr> <int>
## 1 Video game 64
## 2 Manga 63
## 3 Novel 33
## 4 Comic book 29
## 5 Animated film 27
## 6 Television series 19
## 7 Film 17
## 8 Animated series 16
## 9 Anime 13
## 10 Animated cartoon 8
## 11 Digital pet 8
## 12 Book 5
## 13 Visual novel 5
## 14 Greeting card 4
## 15 Cartoon character 3
## 16 Comic strip 3
## 17 Cartoon 2
## 18 Musical theatre 2
View missing values in more detail.
# Plot the missing values of every column, with columns arranged by how much
# of each one is missing.
df %>% vis_miss(sort_miss = TRUE)
Clean fields to make them easier to read/visualize.
# Tidy the labels used in the charts:
#   * original_media   - fold four small media types into broader ones
#   * franchise        - strip the leading alias from three franchise names
#   * revenue_category - shorten the category names
df <- df %>%
  mutate(
    original_media = case_when(
      original_media == "Book" ~ "Novel",
      original_media == "Cartoon character" ~ "Cartoon",
      original_media == "Visual novel" ~ "Comic book",
      original_media == "Animated cartoon" ~ "Animated series",
      TRUE ~ original_media
    )
  ) %>%
  mutate(
    franchise = franchise %>%
      str_remove("A Song of Ice and Fire \\/ ") %>%
      str_remove("Wizarding World \\/ ") %>%
      str_remove("Middle-earth \\/ ")
  ) %>%
  mutate(
    revenue_category = revenue_category %>%
      str_remove(", Licensing & Retail") %>%
      str_remove("\\/Entertainment") %>%
      str_remove("\\/Games") %>%
      str_remove(" sales") %>%
      str_replace("Comic or Manga", "Comic\\/Manga")
  )
# Revenue stacked by revenue category, plotted against the year each franchise
# was created. The data carries no "year earned" column, only year_created, so
# the title and x label name the creation year explicitly instead of implying
# revenue earned in that year.
ggplot(df, aes(x = year_created, y = revenue, fill = revenue_category)) +
  geom_col() +
  scale_fill_viridis_d(option = "E") +
  labs(
    title = "Franchise Revenue by Year Created",
    x = "Year created",
    y = "Revenue (in billions)"
  )
# Same chart restricted to franchises created after 1970 (1970 itself is
# excluded by the strict comparison). The x axis is the creation year, not the
# year the revenue was earned, so the labels say so.
ggplot(
  filter(df, year_created > 1970),
  aes(x = year_created, y = revenue, fill = revenue_category)
) +
  geom_col() +
  scale_fill_viridis_d(option = "E") +
  labs(
    title = "Revenue of Franchises Created after 1970",
    x = "Year created",
    y = "Revenue (in billions)"
  )
# Total revenue per franchise, largest first.
df_revenue <- df %>%
  group_by(franchise) %>%
  summarize(total_revenue = sum(revenue)) %>%
  ungroup() %>%
  arrange(desc(total_revenue))

# Attach the total to every row. The key is spelled out so the join never
# silently widens to other shared columns, and a total_revenue column left
# over from an earlier run is dropped first so re-running this step does not
# produce total_revenue.x / total_revenue.y.
df <- df[setdiff(names(df), "total_revenue")] %>%
  left_join(df_revenue, by = "franchise")
# Revenue by category for the franchises created in 1996, ordered by total
# revenue. The filter is on the creation year, so the title states that rather
# than suggesting revenue earned during 1996.
ggplot(
  df %>%
    filter(year_created == 1996) %>%
    mutate(franchise = fct_reorder(franchise, total_revenue)),
  aes(x = franchise, y = revenue, fill = revenue_category)
) +
  geom_col() +
  scale_fill_viridis_d(option = "E") +
  coord_flip() +
  labs(
    title = "Revenue of Franchises Created in 1996",
    y = "Revenue (in billions)"
  ) +
  theme(axis.title.y = element_blank())
# Franchises whose total revenue exceeds 50, ordered by that total, with each
# bar split by revenue category.
ggplot(
  df %>%
    filter(total_revenue > 50) %>%
    mutate(franchise = fct_reorder(franchise, total_revenue)),
  aes(x = franchise, y = revenue, fill = revenue_category)
) +
  geom_col() +
  scale_fill_viridis_d(option = "E") +
  coord_flip() +
  labs(title = "Top Franchises' Revenue", y = "Revenue (in billions)") +
  theme(axis.title.y = element_blank())
# Total revenue per original media type, largest first.
df_revenue <- df %>%
  group_by(original_media) %>%
  summarize(media_revenue = sum(revenue)) %>%
  ungroup() %>%
  arrange(desc(media_revenue))

# Attach the total to every row. The key is spelled out so the join never
# silently widens to other shared columns, and a media_revenue column left
# over from an earlier run is dropped first so re-running this step does not
# produce media_revenue.x / media_revenue.y.
df <- df[setdiff(names(df), "media_revenue")] %>%
  left_join(df_revenue, by = "original_media")
# Revenue per original media type, ordered by the media type's total and
# split by revenue category.
ggplot(
  mutate(df, original_media = fct_reorder(original_media, media_revenue)),
  aes(x = original_media, y = revenue, fill = revenue_category)
) +
  geom_col() +
  scale_fill_viridis_d(option = "E") +
  coord_flip() +
  labs(
    title = "Revenue by Original Media Type",
    x = "Original Media",
    y = "Revenue (in billions)"
  )
# Total revenue per revenue category, largest first.
df_revenue <- df %>%
  group_by(revenue_category) %>%
  summarize(category_revenue = sum(revenue)) %>%
  ungroup() %>%
  arrange(desc(category_revenue))

# Attach the total to every row. The key is spelled out so the join never
# silently widens to other shared columns, and a category_revenue column left
# over from an earlier run is dropped first so re-running this step does not
# produce category_revenue.x / category_revenue.y.
df <- df[setdiff(names(df), "category_revenue")] %>%
  left_join(df_revenue, by = "revenue_category")
# Revenue per revenue category, ordered by the category's total and split by
# original media type.
ggplot(
  mutate(df, revenue_category = fct_reorder(revenue_category, category_revenue)),
  aes(x = revenue_category, y = revenue, fill = original_media)
) +
  geom_col() +
  scale_fill_viridis_d(option = "E") +
  coord_flip() +
  labs(
    title = "Revenue by Category",
    x = "Category",
    y = "Revenue (in billions)"
  )
Image found at http://www.greeklibrary.org/wp-content/uploads/2019/05/books.jpg
# Background picture for the final chart.
# NOTE(review): the image linked above is a .jpg, while readPNG() is pointed at
# a local "books.png" - presumably it was converted and saved by hand; confirm.
img <- readPNG("books.png")

# Revenue by category for franchises that started as novels, ordered by total
# revenue and drawn on top of the background image.
ggplot(
  filter(df, original_media == "Novel") %>%
    mutate(franchise = fct_reorder(franchise, total_revenue)),
  aes(x = franchise, y = revenue, fill = revenue_category)
) +
  background_image(img) +
  geom_col(color = "gray", size = 0.2) +
  scale_fill_brewer(palette = "Blues") +
  scale_y_continuous(expand = c(0.01, 0)) +
  coord_flip() +
  labs(
    title = "Franchised Novels' Revenue by Category",
    y = "Revenue (in billions)",
    fill = "Category"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    text = element_text(family = "Candara", size = 13),
    # Blank backgrounds let the image show through.
    plot.background = element_blank(),
    panel.background = element_blank(),
    axis.title.y = element_blank(),
    axis.text = element_text(face = "bold"),
    legend.background = element_blank(),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 10)
  )

# No plot is passed, so ggsave() writes the most recently displayed one.
ggsave("media-franchises.png")