1 Project Description

Tidy Tuesday is a weekly data project aimed at the R ecosystem. It emphasizes understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.

2 Dataset

The data comes from Wikipedia. It includes information such as franchise name, revenue generated, year created, and the owners/creators of each franchise.

3 Setup

3.1 Load Libraries

# Install pacman on first use. requireNamespace() only checks whether the
# package is installed; unlike require() it is not meant for attaching, and
# pacman is only ever called through the pacman:: prefix below.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# Load the packages used in the rest of this document.
pacman::p_load("tidyverse", "visdat", "extrafont", "ggpubr", "png")

# Use the classic ggplot2 theme as the default for every chart.
theme_set(theme_classic())

3.2 Import Data

# Read the Tidy Tuesday media-franchise CSV (2019-07-02) directly from GitHub.
df <-
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-02/media_franchises.csv" %>%
  read_csv()

4 Exploratory Data Analysis

  • There are 321 observations and 7 variables.
  • Revenue is reported in billions, so the raw numbers look small.
  • Many of the variables are character columns and appear to need cleaning.
  • There are no missing values.
  • Franchises created in 1996 have brought in the most revenue.
  • Pokémon, Hello Kitty, Winnie the Pooh, Mickey Mouse & Friends, Star Wars, and Anpanman have earned the most revenue.
  • Merchandise, Licensing & Retail is the highest earning revenue category.

4.1 View Data

# Column names, types, and the first few values of each column.
df %>% glimpse()
## Observations: 321
## Variables: 7
## $ franchise        <chr> "A Song of Ice and Fire /  Game of Thrones", ...
## $ revenue_category <chr> "Book sales", "Box Office", "Home Video/Enter...
## $ revenue          <dbl> 0.900, 0.001, 0.280, 4.000, 0.132, 0.760, 1.0...
## $ year_created     <dbl> 1996, 1996, 1996, 1996, 1996, 1992, 1992, 199...
## $ original_media   <chr> "Novel", "Novel", "Novel", "Novel", "Novel", ...
## $ creators         <chr> "George R. R. Martin", "George R. R. Martin",...
## $ owners           <chr> "Random House WarnerMedia (AT&T)", "Random Ho...
# First rows of the data.
df %>% head()
## # A tibble: 6 x 7
##   franchise revenue_category revenue year_created original_media creators
##   <chr>     <chr>              <dbl>        <dbl> <chr>          <chr>   
## 1 A Song o~ Book sales         0.9           1996 Novel          George ~
## 2 A Song o~ Box Office         0.001         1996 Novel          George ~
## 3 A Song o~ Home Video/Ente~   0.28          1996 Novel          George ~
## 4 A Song o~ TV                 4             1996 Novel          George ~
## 5 A Song o~ Video Games/Gam~   0.132         1996 Novel          George ~
## 6 Aladdin   Box Office         0.76          1992 Animated film  Walt Di~
## # ... with 1 more variable: owners <chr>
# Per-column summary statistics.
df %>% summary()
##   franchise         revenue_category      revenue        year_created 
##  Length:321         Length:321         Min.   : 0.001   Min.   :1924  
##  Class :character   Class :character   1st Qu.: 0.211   1st Qu.:1977  
##  Mode  :character   Mode  :character   Median : 1.000   Median :1991  
##                                        Mean   : 4.742   Mean   :1986  
##                                        3rd Qu.: 5.000   3rd Qu.:1999  
##                                        Max.   :80.000   Max.   :2013  
##  original_media       creators            owners         
##  Length:321         Length:321         Length:321        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
## 
# Number of distinct values in each column, smallest first. map_int() always
# returns a named integer vector, whereas sapply()'s return type depends on
# its input.
map_int(df, n_distinct) %>% sort()
## revenue_category   original_media     year_created           owners 
##                8               18               52               78 
##         creators        franchise          revenue 
##              101              103              174
# Row count per revenue category, most frequent first.
count(df, revenue_category, sort = TRUE)
## # A tibble: 8 x 2
##   revenue_category                    n
##   <chr>                           <int>
## 1 Box Office                         76
## 2 Merchandise, Licensing & Retail    73
## 3 Home Video/Entertainment           64
## 4 Video Games/Games                  51
## 5 Comic or Manga                     27
## 6 Music                              12
## 7 TV                                 11
## 8 Book sales                          7
# Row count per original media type, most frequent first.
count(df, original_media, sort = TRUE)
## # A tibble: 18 x 2
##    original_media        n
##    <chr>             <int>
##  1 Video game           64
##  2 Manga                63
##  3 Novel                33
##  4 Comic book           29
##  5 Animated film        27
##  6 Television series    19
##  7 Film                 17
##  8 Animated series      16
##  9 Anime                13
## 10 Animated cartoon      8
## 11 Digital pet           8
## 12 Book                  5
## 13 Visual novel          5
## 14 Greeting card         4
## 15 Cartoon character     3
## 16 Comic strip           3
## 17 Cartoon               2
## 18 Musical theatre       2

4.2 Missing Values

View missing values in more detail.

# Visualize missing values per column, with columns ordered by missingness.
df %>% vis_miss(sort_miss = TRUE)

4.3 Data Cleaning

Clean fields to make them easier to read/visualize.

# Tidy up the labels used in the charts below:
#   * merge a few original-media types into a related, larger type;
#   * drop the leading "<series> / " prefix from three franchise names;
#   * shorten the revenue-category labels.
df <- df %>%
  mutate(
    original_media = case_when(
      original_media %in% "Book" ~ "Novel",
      original_media %in% "Cartoon character" ~ "Cartoon",
      original_media %in% "Visual novel" ~ "Comic book",
      original_media %in% "Animated cartoon" ~ "Animated series",
      TRUE ~ original_media
    ),
    # "\\s+" consumes all whitespace after the slash. The raw data shows
    # "A Song of Ice and Fire /  Game of Thrones" with two spaces, so matching
    # a single space would leave a leading space on the cleaned name.
    franchise = franchise %>%
      str_replace("A Song of Ice and Fire \\/\\s+", "") %>%
      str_replace("Wizarding World \\/\\s+", "") %>%
      str_replace("Middle-earth \\/\\s+", ""),
    revenue_category = revenue_category %>%
      str_replace(", Licensing & Retail", "") %>%
      str_replace("\\/Entertainment", "") %>%
      str_replace("\\/Games", "") %>%
      str_replace(" sales", "") %>%
      str_replace("Comic or Manga", "Comic\\/Manga")
  )

4.4 Visualizations

# Stacked columns of revenue by the year each franchise was created, coloured
# by revenue category. The x axis is year_created, not the year the revenue
# was earned, so the labels say so explicitly.
df %>%
  ggplot(aes(x = year_created, y = revenue, fill = revenue_category)) +
  geom_col() +
  scale_fill_viridis_d(option = "E") +
  labs(
    title = "Franchise Revenue by Year Created",
    x = "Year Created",
    y = "Revenue (in billions)"
  )

# Same chart restricted to franchises created after 1970. The filter is on
# year_created, so the title and axis name the creation year rather than
# implying revenue earned in those years.
df %>%
  filter(year_created > 1970) %>%
  ggplot(aes(x = year_created, y = revenue, fill = revenue_category)) +
  geom_col() +
  scale_fill_viridis_d(option = "E") +
  labs(
    title = "Revenue of Franchises Created after 1970",
    x = "Year Created",
    y = "Revenue (in billions)"
  )

# Total revenue per franchise across all revenue categories, largest first.
df_revenue <- df %>%
  group_by(franchise) %>%
  summarize(total_revenue = sum(revenue)) %>%
  arrange(desc(total_revenue)) %>%
  ungroup()

# Attach the per-franchise total to every row. Naming the key explicitly
# avoids the "Joining, by = ..." message and stops the join from silently
# using any other column the two tables happen to share.
df <- left_join(df, df_revenue, by = "franchise")

# Horizontal stacked bars for franchises created in 1996, ordered by each
# franchise's total revenue. The revenue is not limited to what was earned
# in 1996, so the title refers to the creation year.
df %>%
  filter(year_created == 1996) %>%
  mutate(franchise = fct_reorder(franchise, total_revenue)) %>%
  ggplot(aes(x = franchise, y = revenue, fill = revenue_category)) +
  geom_col() +
  scale_fill_viridis_d(option = "E") +
  coord_flip() +
  labs(
    title = "Revenue of Franchises Created in 1996",
    y = "Revenue (in billions)"
  ) +
  theme(axis.title.y = element_blank())

# Horizontal stacked bars for franchises whose total revenue exceeds 50
# (billions), ordered by total revenue and split by revenue category.
df %>%
  filter(total_revenue > 50) %>%
  mutate(franchise = fct_reorder(franchise, total_revenue)) %>%
  ggplot(mapping = aes(x = franchise, y = revenue, fill = revenue_category)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis_d(option = "E") +
  theme(axis.title.y = element_blank()) +
  labs(
    title = "Top Franchises' Revenue",
    y = "Revenue (in billions)"
  )

# Total revenue per original media type, largest first.
df_revenue <- df %>%
  group_by(original_media) %>%
  summarize(media_revenue = sum(revenue)) %>%
  arrange(desc(media_revenue)) %>%
  ungroup()

# Attach the per-media total to every row. The key is named explicitly so the
# join does not depend on which column names the two tables share and does
# not print a "Joining, by = ..." message.
df <- left_join(df, df_revenue, by = "original_media")

# Horizontal stacked bars: revenue per original media type, ordered by each
# type's total revenue and split by revenue category.
df %>%
  mutate(original_media = fct_reorder(original_media, media_revenue)) %>%
  ggplot(mapping = aes(x = original_media, y = revenue, fill = revenue_category)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis_d(option = "E") +
  labs(
    title = "Revenue by Original Media Type",
    x = "Original Media",
    y = "Revenue (in billions)"
  )

# Total revenue per revenue category, largest first.
df_revenue <- df %>%
  group_by(revenue_category) %>%
  summarize(category_revenue = sum(revenue)) %>%
  arrange(desc(category_revenue)) %>%
  ungroup()

# Attach the per-category total to every row. The key is named explicitly so
# the join does not depend on which column names the two tables share and
# does not print a "Joining, by = ..." message.
df <- left_join(df, df_revenue, by = "revenue_category")

# Horizontal stacked bars: revenue per revenue category, ordered by each
# category's total revenue and split by original media type.
df %>%
  mutate(revenue_category = fct_reorder(revenue_category, category_revenue)) %>%
  ggplot(mapping = aes(x = revenue_category, y = revenue, fill = original_media)) +
  geom_col() +
  coord_flip() +
  scale_fill_viridis_d(option = "E") +
  labs(
    title = "Revenue by Category",
    x = "Category",
    y = "Revenue (in billions)"
  )

5 Final Visualization

Image found at http://www.greeklibrary.org/wp-content/uploads/2019/05/books.jpg

# Load the background image for the final chart from the working directory.
# NOTE(review): the source URL cited above ends in .jpg; presumably the image
# was converted to PNG and saved locally as books.png -- confirm.
img <- "books.png" %>% readPNG()

# Final chart: revenue of novel-based franchises split by revenue category,
# drawn as horizontal stacked bars on top of the image held in `img`.
df %>%
  filter(original_media == "Novel") %>%
  mutate(franchise = fct_reorder(franchise, total_revenue)) %>%
  ggplot(mapping = aes(x = franchise, y = revenue, fill = revenue_category)) +
  # The image layer comes first so the bars are drawn over it.
  background_image(img) +
  geom_col(color = "gray", size = 0.2) +
  coord_flip() +
  scale_y_continuous(expand = c(0.01, 0)) +
  scale_fill_brewer(palette = "Blues") +
  labs(
    title = "Franchised Novels' Revenue by Category",
    y = "Revenue (in billions)",
    fill = "Category"
  ) +
  theme(
    text = element_text(family = "Candara", size = 13),
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.title.y = element_blank(),
    axis.text = element_text(face = "bold"),
    # Blank backgrounds so the image is not covered by opaque fills.
    plot.background = element_blank(),
    panel.background = element_blank(),
    legend.background = element_blank(),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8)
  )

# Write the most recently displayed plot to a PNG in the working directory.
ggsave(filename = "media-franchises.png", plot = last_plot())