1 Project Description

Tidy Tuesday is a weekly data project aimed at the R ecosystem. It emphasizes understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.

2 Dataset

Data came from R4DS. The data includes the date, information about the number of members, number of messages, and the breakdown by direct messages, public, private, or shared channels.

3 Setup

3.1 Load Libraries

# Install pacman on first use, then let it install (when missing) and attach
# the packages used in this analysis. requireNamespace() is used for the
# availability check because it does not attach pacman as a side effect;
# pacman is only ever called through `pacman::` below.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load("tidyverse", "visdat", "grid", "gridExtra")

# Apply the minimal ggplot2 theme to every plot created afterwards.
theme_set(theme_minimal())

3.2 Import Data

# Read r4ds_members.csv (Tidy Tuesday, 2019-07-16) directly from GitHub.
df <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-16/r4ds_members.csv"
)

4 Exploratory Data Analysis

  • There are 678 observations and 21 variables.
  • name, guests, and messages_in_shared_channels only have one unique value in the column.
  • total_membership and full_members are duplicate columns.
  • Entries prior to 2017-09-01 have only 1 member. This is likely when R4DS was just getting started.

4.1 View Data

# Column names, types and leading values, one column per line.
df %>% glimpse()
## Observations: 678
## Variables: 21
## $ date                                 <date> 2017-08-27, 2017-08-28, ...
## $ total_membership                     <dbl> 1, 1, 1, 1, 1, 188, 284, ...
## $ full_members                         <dbl> 1, 1, 1, 1, 1, 188, 284, ...
## $ guests                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ daily_active_members                 <dbl> 1, 1, 1, 1, 1, 169, 225, ...
## $ daily_members_posting_messages       <dbl> 1, 0, 1, 0, 1, 111, 110, ...
## $ weekly_active_members                <dbl> 1, 1, 1, 1, 1, 169, 270, ...
## $ weekly_members_posting_messages      <dbl> 1, 1, 1, 1, 1, 111, 183, ...
## $ messages_in_public_channels          <dbl> 4, 0, 0, 0, 1, 252, 326, ...
## $ messages_in_private_channels         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ messages_in_shared_channels          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ messages_in_d_ms                     <dbl> 1, 0, 0, 0, 0, 119, 46, 7...
## $ percent_of_messages_public_channels  <dbl> 0.8000, 0.0000, 0.0000, 0...
## $ percent_of_messages_private_channels <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ percent_of_messages_d_ms             <dbl> 0.2000, 0.0000, 0.0000, 0...
## $ percent_of_views_public_channels     <dbl> 0.2857, 1.0000, 1.0000, 1...
## $ percent_of_views_private_channels    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ percent_of_views_d_ms                <dbl> 0.7143, 0.0000, 0.0000, 0...
## $ name                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ public_channels_single_workspace     <dbl> 10, 10, 11, 11, 12, 12, 1...
## $ messages_posted                      <dbl> 35, 35, 37, 38, 66, 1101,...
# First six rows of the data.
df %>% head()
## # A tibble: 6 x 21
##   date       total_membership full_members guests daily_active_me~
##   <date>                <dbl>        <dbl>  <dbl>            <dbl>
## 1 2017-08-27                1            1      0                1
## 2 2017-08-28                1            1      0                1
## 3 2017-08-29                1            1      0                1
## 4 2017-08-30                1            1      0                1
## 5 2017-08-31                1            1      0                1
## 6 2017-09-01              188          188      0              169
## # ... with 16 more variables: daily_members_posting_messages <dbl>,
## #   weekly_active_members <dbl>, weekly_members_posting_messages <dbl>,
## #   messages_in_public_channels <dbl>, messages_in_private_channels <dbl>,
## #   messages_in_shared_channels <dbl>, messages_in_d_ms <dbl>,
## #   percent_of_messages_public_channels <dbl>,
## #   percent_of_messages_private_channels <dbl>,
## #   percent_of_messages_d_ms <dbl>,
## #   percent_of_views_public_channels <dbl>,
## #   percent_of_views_private_channels <dbl>, percent_of_views_d_ms <dbl>,
## #   name <dbl>, public_channels_single_workspace <dbl>,
## #   messages_posted <dbl>
# Per-column summary statistics (min, quartiles, mean, max).
df %>% summary()
##       date            total_membership  full_members        guests 
##  Min.   :2017-08-27   Min.   :   1.0   Min.   :   1.0   Min.   :0  
##  1st Qu.:2018-02-12   1st Qu.: 978.2   1st Qu.: 978.2   1st Qu.:0  
##  Median :2018-07-31   Median :1605.0   Median :1605.0   Median :0  
##  Mean   :2018-07-31   Mean   :1567.8   Mean   :1567.8   Mean   :0  
##  3rd Qu.:2019-01-16   3rd Qu.:2142.8   3rd Qu.:2142.8   3rd Qu.:0  
##  Max.   :2019-07-05   Max.   :3029.0   Max.   :3029.0   Max.   :0  
##  daily_active_members daily_members_posting_messages weekly_active_members
##  Min.   :  1.00       Min.   :  0.00                 Min.   :  1.0        
##  1st Qu.: 63.00       1st Qu.:  6.00                 1st Qu.:206.0        
##  Median : 88.00       Median : 11.00                 Median :239.0        
##  Mean   : 91.39       Mean   : 13.24                 Mean   :249.7        
##  3rd Qu.:110.00       3rd Qu.: 16.00                 3rd Qu.:307.8        
##  Max.   :258.00       Max.   :111.00                 Max.   :525.0        
##  weekly_members_posting_messages messages_in_public_channels
##  Min.   :  1.00                  Min.   :  0.00             
##  1st Qu.: 35.00                  1st Qu.:  9.25             
##  Median : 48.00                  Median : 19.00             
##  Mean   : 52.16                  Mean   : 28.46             
##  3rd Qu.: 59.00                  3rd Qu.: 35.00             
##  Max.   :278.00                  Max.   :326.00             
##  messages_in_private_channels messages_in_shared_channels messages_in_d_ms
##  Min.   : 0.000               Min.   :0                   Min.   :  0.00  
##  1st Qu.: 0.000               1st Qu.:0                   1st Qu.:  1.00  
##  Median : 0.000               Median :0                   Median :  4.00  
##  Mean   : 1.718               Mean   :0                   Mean   : 13.05  
##  3rd Qu.: 0.000               3rd Qu.:0                   3rd Qu.: 12.00  
##  Max.   :75.000               Max.   :0                   Max.   :227.00  
##  percent_of_messages_public_channels percent_of_messages_private_channels
##  Min.   :0.0000                      Min.   :0.0000                      
##  1st Qu.:0.5840                      1st Qu.:0.0000                      
##  Median :0.8000                      Median :0.0000                      
##  Mean   :0.7248                      Mean   :0.0305                      
##  3rd Qu.:0.9444                      3rd Qu.:0.0000                      
##  Max.   :1.0000                      Max.   :1.0000                      
##  percent_of_messages_d_ms percent_of_views_public_channels
##  Min.   :0.0000           Min.   :0.2726                  
##  1st Qu.:0.0345           1st Qu.:0.9115                  
##  Median :0.1595           Median :0.9519                  
##  Mean   :0.2270           Mean   :0.9285                  
##  3rd Qu.:0.3478           3rd Qu.:0.9744                  
##  Max.   :1.0000           Max.   :1.0000                  
##  percent_of_views_private_channels percent_of_views_d_ms      name  
##  Min.   :0.000000                  Min.   :0.00000       Min.   :0  
##  1st Qu.:0.000000                  1st Qu.:0.02235       1st Qu.:0  
##  Median :0.000000                  Median :0.04170       Median :0  
##  Mean   :0.009773                  Mean   :0.06176       Mean   :0  
##  3rd Qu.:0.006450                  3rd Qu.:0.07433       3rd Qu.:0  
##  Max.   :0.267400                  Max.   :0.72170       Max.   :0  
##  public_channels_single_workspace messages_posted
##  Min.   :10.0                     Min.   :   35  
##  1st Qu.:15.0                     1st Qu.:20543  
##  Median :19.0                     Median :33828  
##  Mean   :17.8                     Mean   :32936  
##  3rd Qu.:21.0                     3rd Qu.:40104  
##  Max.   :27.0                     Max.   :59627
# Number of distinct values per column, sorted ascending; a count of 1 marks
# a constant column. vapply() pins the result to one integer per column,
# whereas sapply() may change its return type depending on the input.
vapply(df, n_distinct, integer(1)) %>% sort()
##                               guests          messages_in_shared_channels 
##                                    1                                    1 
##                                 name     public_channels_single_workspace 
##                                    1                                   18 
##         messages_in_private_channels       daily_members_posting_messages 
##                                   27                                   52 
##                     messages_in_d_ms          messages_in_public_channels 
##                                   83                                  108 
##      weekly_members_posting_messages percent_of_messages_private_channels 
##                                  114                                  120 
##                 daily_active_members    percent_of_views_private_channels 
##                                  168                                  192 
##                weekly_active_members             percent_of_messages_d_ms 
##                                  267                                  323 
##  percent_of_messages_public_channels                     total_membership 
##                                  330                                  443 
##                         full_members                percent_of_views_d_ms 
##                                  443                                  489 
##     percent_of_views_public_channels                      messages_posted 
##                                  520                                  674 
##                                 date 
##                                  678

4.2 Missing Values

There are no missing values in the data set.

# Visualize missing values, with columns ordered by amount of missingness
df %>% vis_miss(sort_miss = TRUE)

5 Data Wrangling

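Drop columns that carry no information (a single unique value), drop the duplicate full_members column, and keep only the entries from 2017-09-01 onward. No missing-value handling is needed because the data set has none.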

# 1. Keep only columns with more than one distinct value (constant columns
#    carry no information). vapply() guarantees one integer per column.
# 2. Drop full_members, which duplicates total_membership.
# 3. Keep rows from 2017-09-01 onward; the cutoff is converted to a Date
#    explicitly instead of relying on implicit character-to-Date coercion.
df <- df[, vapply(df, n_distinct, integer(1)) > 1] %>%
  select(-full_members) %>%
  filter(date >= as.Date("2017-09-01"))

6 Visualizations

  • When R4DS launched, the first users were very active, and then usage gradually declined.
  • Numbers dramatically increased just before the start of 2018.
  • What happened on 2018-06-19? Total messages posted dropped. Did the site go down?
# Weekly active members drawn on top of total membership as overlaid areas.
# The fill colours are unnamed, so they are matched to the fill labels in
# alphabetical order: "Active Members" -> steelblue1,
# "Total Membership" -> steelblue4.
ggplot(data = df, mapping = aes(x = date)) +
  geom_area(mapping = aes(y = total_membership, fill = "Total Membership")) +
  geom_area(mapping = aes(y = weekly_active_members, fill = "Active Members")) +
  labs(title = "Active Members vs Total Membership", fill = "") +
  scale_fill_manual(values = c("steelblue1", "steelblue4"))

# Weekly count of members who posted at least one message, over time.
ggplot(data = df, mapping = aes(x = date)) +
  geom_line(mapping = aes(y = weekly_members_posting_messages), color = "red") +
  labs(title = "Number of Members Posting Messages")

# messages_posted over time, drawn as a thick purple line.
ggplot(data = df, mapping = aes(x = date)) +
  geom_line(
    mapping = aes(y = messages_posted),
    size = 1.2,
    color = "purple4"
  ) +
  labs(title = "Total Messages Posted")

## FINAL VISUALIZATION
# Left panel: weekly active members (light blue) over total membership
# (dark blue). Fills are fixed colours, so this panel has no legend.
p1 <- ggplot(data = df, mapping = aes(x = date)) +
  geom_area(mapping = aes(y = total_membership), fill = "steelblue4") +
  geom_area(mapping = aes(y = weekly_active_members), fill = "steelblue1") +
  theme(axis.title = element_text(size = 9)) +
  labs(x = "Date", y = "Total Number")

# Right panel: the same two membership areas plus messages_posted as a line.
# Fill and colour are mapped to constant labels so that ggplot2 builds a
# legend; the fill entries are listed first and the line entry second.
p2 <- ggplot(data = df, mapping = aes(x = date)) +
  geom_area(mapping = aes(y = total_membership, fill = "Total Membership")) +
  geom_area(mapping = aes(y = weekly_active_members, fill = "Active Members")) +
  geom_line(
    mapping = aes(y = messages_posted, color = "Messages Posted"),
    size = 1.2
  ) +
  # Unnamed fill colours follow the alphabetical order of the fill labels.
  scale_fill_manual(values = c("steelblue1", "steelblue4")) +
  scale_color_manual(values = "purple3") +
  labs(x = "Date", y = "Total Number") +
  guides(
    fill = guide_legend(order = 1),
    color = guide_legend(order = 2)
  ) +
  theme(
    axis.title = element_text(size = 9),
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.spacing.x = unit(0.4, "cm")
  )

# https://github.com/hadley/ggplot2/wiki/Share-a-legend-between-two-ggplot2-graphs
#
# Extract the legend grob from a plot so it can be drawn once and shared
# between several plots.
#
# a.gplot: the plot whose legend should be extracted.
# Returns: the first grob named "guide-box" in the rendered plot table.
# Errors:  stops with an explicit message when the plot has no such grob.
g_legend <- function(a.gplot) {
  rendered <- ggplot_gtable(ggplot_build(a.gplot))

  # vapply() always yields a character vector, even if a grob has no name
  # (sapply() would silently fall back to a list in that case).
  grob_names <- vapply(
    rendered$grobs,
    function(g) if (is.null(g$name)) NA_character_ else as.character(g$name)[1],
    character(1)
  )

  legend_idx <- which(grob_names == "guide-box")
  if (length(legend_idx) == 0) {
    stop("No legend (\"guide-box\" grob) found in the plot", call. = FALSE)
  }

  # Take the first match only: `[[` with several indices would index
  # recursively into the first grob instead of selecting a grob.
  rendered$grobs[[legend_idx[1]]]
}

# Legend of the right-hand panel, reused as the shared legend of the figure.
mylegend <- g_legend(a.gplot = p2)

# Two-line figure heading: a bold title anchored to the top edge and a grey
# subtitle anchored to the bottom edge. The tree gets the extra class
# "titlegrob" so that a dedicated heightDetails() method can size it.
tg <- gTree(
  children = gList(
    textGrob(
      label = "Membership Status vs. Messages Posted",
      y = 1,
      vjust = 1,
      gp = gpar(fontface = "bold", fontsize = 16)
    ),
    textGrob(
      label = "Messages posted are rapidly increasing, despite active members staying fairly constant",
      y = 0,
      vjust = 0,
      gp = gpar(fontsize = 11, col = "grey20")
    )
  ),
  cl = "titlegrob"
)

# heightDetails() method for grobs of class "titlegrob": the height of the
# tree is the sum of the heights of its children.
heightDetails.titlegrob <- function(x) {
  child_heights <- lapply(x$children, grobHeight)
  do.call(sum, child_heights)
}

# Assemble the final figure: both panels side by side (their own legends
# hidden), the shared legend in a thin row underneath, and the title grob
# on top. grid.arrange() draws the figure and returns the arranged table.
p3 <- grid.arrange(
  arrangeGrob(
    grobs = list(
      p1 + theme(legend.position = "none"),
      p2 + theme(legend.position = "none")
    ),
    nrow = 1
  ),
  mylegend,
  top = tg,
  heights = c(10, 1),
  nrow = 2
)

# Write the assembled figure to disk.
ggsave(filename = "R4DS-community-stats.png", plot = p3)