Tidy Tuesday is a weekly data project aimed at the R ecosystem. It emphasizes understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem.
The data comes from R4DS. It includes the date, the number of members, the number of messages, and the breakdown of messages by direct messages and public, private, or shared channels.
# Install pacman on first use. requireNamespace() only checks availability,
# so pacman is not attached to the search path; it is called via `pacman::`.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# p_load() installs any missing package and then attaches all of them.
pacman::p_load("tidyverse", "visdat", "grid", "gridExtra")
# Use the minimal ggplot2 theme for every plot created below.
theme_set(theme_minimal())
# Read the R4DS membership data from the Tidy Tuesday repository.
df <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-16/r4ds_members.csv"
)
glimpse(df)
## Observations: 678
## Variables: 21
## $ date <date> 2017-08-27, 2017-08-28, ...
## $ total_membership <dbl> 1, 1, 1, 1, 1, 188, 284, ...
## $ full_members <dbl> 1, 1, 1, 1, 1, 188, 284, ...
## $ guests <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ daily_active_members <dbl> 1, 1, 1, 1, 1, 169, 225, ...
## $ daily_members_posting_messages <dbl> 1, 0, 1, 0, 1, 111, 110, ...
## $ weekly_active_members <dbl> 1, 1, 1, 1, 1, 169, 270, ...
## $ weekly_members_posting_messages <dbl> 1, 1, 1, 1, 1, 111, 183, ...
## $ messages_in_public_channels <dbl> 4, 0, 0, 0, 1, 252, 326, ...
## $ messages_in_private_channels <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ messages_in_shared_channels <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ messages_in_d_ms <dbl> 1, 0, 0, 0, 0, 119, 46, 7...
## $ percent_of_messages_public_channels <dbl> 0.8000, 0.0000, 0.0000, 0...
## $ percent_of_messages_private_channels <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ percent_of_messages_d_ms <dbl> 0.2000, 0.0000, 0.0000, 0...
## $ percent_of_views_public_channels <dbl> 0.2857, 1.0000, 1.0000, 1...
## $ percent_of_views_private_channels <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ percent_of_views_d_ms <dbl> 0.7143, 0.0000, 0.0000, 0...
## $ name <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ public_channels_single_workspace <dbl> 10, 10, 11, 11, 12, 12, 1...
## $ messages_posted <dbl> 35, 35, 37, 38, 66, 1101,...
# Preview the first six rows of the data.
head(df, n = 6)
## # A tibble: 6 x 21
## date total_membership full_members guests daily_active_me~
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2017-08-27 1 1 0 1
## 2 2017-08-28 1 1 0 1
## 3 2017-08-29 1 1 0 1
## 4 2017-08-30 1 1 0 1
## 5 2017-08-31 1 1 0 1
## 6 2017-09-01 188 188 0 169
## # ... with 16 more variables: daily_members_posting_messages <dbl>,
## # weekly_active_members <dbl>, weekly_members_posting_messages <dbl>,
## # messages_in_public_channels <dbl>, messages_in_private_channels <dbl>,
## # messages_in_shared_channels <dbl>, messages_in_d_ms <dbl>,
## # percent_of_messages_public_channels <dbl>,
## # percent_of_messages_private_channels <dbl>,
## # percent_of_messages_d_ms <dbl>,
## # percent_of_views_public_channels <dbl>,
## # percent_of_views_private_channels <dbl>, percent_of_views_d_ms <dbl>,
## # name <dbl>, public_channels_single_workspace <dbl>,
## # messages_posted <dbl>
# Per-column summary statistics (min, quartiles, mean, max).
df %>% summary()
## date total_membership full_members guests
## Min. :2017-08-27 Min. : 1.0 Min. : 1.0 Min. :0
## 1st Qu.:2018-02-12 1st Qu.: 978.2 1st Qu.: 978.2 1st Qu.:0
## Median :2018-07-31 Median :1605.0 Median :1605.0 Median :0
## Mean :2018-07-31 Mean :1567.8 Mean :1567.8 Mean :0
## 3rd Qu.:2019-01-16 3rd Qu.:2142.8 3rd Qu.:2142.8 3rd Qu.:0
## Max. :2019-07-05 Max. :3029.0 Max. :3029.0 Max. :0
## daily_active_members daily_members_posting_messages weekly_active_members
## Min. : 1.00 Min. : 0.00 Min. : 1.0
## 1st Qu.: 63.00 1st Qu.: 6.00 1st Qu.:206.0
## Median : 88.00 Median : 11.00 Median :239.0
## Mean : 91.39 Mean : 13.24 Mean :249.7
## 3rd Qu.:110.00 3rd Qu.: 16.00 3rd Qu.:307.8
## Max. :258.00 Max. :111.00 Max. :525.0
## weekly_members_posting_messages messages_in_public_channels
## Min. : 1.00 Min. : 0.00
## 1st Qu.: 35.00 1st Qu.: 9.25
## Median : 48.00 Median : 19.00
## Mean : 52.16 Mean : 28.46
## 3rd Qu.: 59.00 3rd Qu.: 35.00
## Max. :278.00 Max. :326.00
## messages_in_private_channels messages_in_shared_channels messages_in_d_ms
## Min. : 0.000 Min. :0 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.:0 1st Qu.: 1.00
## Median : 0.000 Median :0 Median : 4.00
## Mean : 1.718 Mean :0 Mean : 13.05
## 3rd Qu.: 0.000 3rd Qu.:0 3rd Qu.: 12.00
## Max. :75.000 Max. :0 Max. :227.00
## percent_of_messages_public_channels percent_of_messages_private_channels
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5840 1st Qu.:0.0000
## Median :0.8000 Median :0.0000
## Mean :0.7248 Mean :0.0305
## 3rd Qu.:0.9444 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000
## percent_of_messages_d_ms percent_of_views_public_channels
## Min. :0.0000 Min. :0.2726
## 1st Qu.:0.0345 1st Qu.:0.9115
## Median :0.1595 Median :0.9519
## Mean :0.2270 Mean :0.9285
## 3rd Qu.:0.3478 3rd Qu.:0.9744
## Max. :1.0000 Max. :1.0000
## percent_of_views_private_channels percent_of_views_d_ms name
## Min. :0.000000 Min. :0.00000 Min. :0
## 1st Qu.:0.000000 1st Qu.:0.02235 1st Qu.:0
## Median :0.000000 Median :0.04170 Median :0
## Mean :0.009773 Mean :0.06176 Mean :0
## 3rd Qu.:0.006450 3rd Qu.:0.07433 3rd Qu.:0
## Max. :0.267400 Max. :0.72170 Max. :0
## public_channels_single_workspace messages_posted
## Min. :10.0 Min. : 35
## 1st Qu.:15.0 1st Qu.:20543
## Median :19.0 Median :33828
## Mean :17.8 Mean :32936
## 3rd Qu.:21.0 3rd Qu.:40104
## Max. :27.0 Max. :59627
# Number of distinct values in each column, sorted ascending. vapply() pins
# the result to one integer per column instead of relying on sapply()'s
# input-dependent simplification.
vapply(df, n_distinct, FUN.VALUE = integer(1)) %>% sort()
## guests messages_in_shared_channels
## 1 1
## name public_channels_single_workspace
## 1 18
## messages_in_private_channels daily_members_posting_messages
## 27 52
## messages_in_d_ms messages_in_public_channels
## 83 108
## weekly_members_posting_messages percent_of_messages_private_channels
## 114 120
## daily_active_members percent_of_views_private_channels
## 168 192
## weekly_active_members percent_of_messages_d_ms
## 267 323
## percent_of_messages_public_channels total_membership
## 330 443
## full_members percent_of_views_d_ms
## 443 489
## percent_of_views_public_channels messages_posted
## 520 674
## date
## 678
There are no missing values in the data set.
# Visualize missing values, with the columns ordered by how much is missing.
df %>% vis_miss(sort_miss = TRUE)
Drop unnecessary columns (there are no missing values to remove or replace) and filter out the first days, when the workspace had a single member
# Remove columns with only 1 unique value. vapply() guarantees an integer
# count per column, so the logical mask always has one entry per column.
varying_cols <- vapply(df, n_distinct, FUN.VALUE = integer(1)) > 1
# full_members is dropped as well: its summary statistics and distinct-value
# count match total_membership in the output above, so it appears redundant.
df <- df[, varying_cols] %>% select(-full_members)
# Keep rows from 2017-09-01 onwards (the earlier rows show a membership of 1).
# Compare against an explicit Date instead of relying on character coercion.
df <- df %>% filter(date >= as.Date("2017-09-01"))
# Weekly active members drawn on top of total membership, as filled areas.
# The fill labels are constants, so they serve only as legend keys; the
# manual colours are assigned to those keys in alphabetical order.
ggplot(df, aes(x = date)) +
  geom_area(aes(y = total_membership, fill = "Total Membership")) +
  geom_area(aes(y = weekly_active_members, fill = "Active Members")) +
  scale_fill_manual(values = c("steelblue1", "steelblue4")) +
  labs(title = "Active Members vs Total Membership", fill = "")
# weekly_members_posting_messages over time, as a red line.
ggplot(df, aes(x = date)) +
  geom_line(aes(y = weekly_members_posting_messages), color = "red") +
  labs(title = "Number of Members Posting Messages")
# messages_posted over time, as a thick purple line.
ggplot(df, aes(x = date)) +
  geom_line(aes(y = messages_posted), color = "purple4", size = 1.2) +
  labs(title = "Total Messages Posted")
## FINAL VISUALIZATION
# Left panel: total membership with weekly active members layered on top.
# Fills are fixed colours (not mapped), so this panel has no legend.
p1 <- ggplot(df, aes(x = date)) +
  geom_area(aes(y = total_membership), fill = "steelblue4") +
  geom_area(aes(y = weekly_active_members), fill = "steelblue1") +
  labs(x = "Date", y = "Total Number") +
  theme(axis.title = element_text(size = 9))
# Right panel: the same two membership areas plus the messages_posted line.
# Fill and colour are mapped to constant labels so that a legend is produced.
p2 <- ggplot(df, aes(x = date)) +
  geom_area(aes(y = total_membership, fill = "Total Membership")) +
  geom_area(aes(y = weekly_active_members, fill = "Active Members")) +
  geom_line(aes(y = messages_posted, color = "Messages Posted"), size = 1.2) +
  scale_fill_manual(values = c("steelblue1", "steelblue4")) +
  scale_color_manual(values = "purple3") +
  labs(x = "Date", y = "Total Number") +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.spacing.x = unit(0.4, 'cm'),
    axis.title = element_text(size = 9)
  ) +
  # Show the fill keys before the colour key in the legend.
  guides(
    fill = guide_legend(order = 1),
    color = guide_legend(order = 2)
  )
#https://github.com/hadley/ggplot2/wiki/Share-a-legend-between-two-ggplot2-graphs
# Extract the legend grob from a ggplot so it can be shared between plots.
#
# a.gplot: a ggplot object.
# Returns the first grob in the built plot table whose name is "guide-box".
# Stops with an explicit error when the plot contains no such grob.
g_legend <- function(a.gplot) {
  plot_table <- ggplot_gtable(ggplot_build(a.gplot))
  # Collect one name per grob; anything without a single character name is NA
  # so that the result is always a character vector.
  grob_names <- vapply(
    plot_table$grobs,
    function(g) {
      if (is.character(g$name) && length(g$name) == 1L) g$name else NA_character_
    },
    FUN.VALUE = character(1)
  )
  legend_idx <- which(grob_names == "guide-box")
  if (length(legend_idx) == 0L) {
    stop("No legend (\"guide-box\" grob) found in the plot.", call. = FALSE)
  }
  # `[[` with several indices would index recursively, so use the first match.
  plot_table$grobs[[legend_idx[1L]]]
}
# Pull the legend out of p2 so it can be placed below both panels.
mylegend <- p2 %>% g_legend()
# Heading for the combined figure: a bold title anchored to the top and a grey
# subtitle anchored to the bottom, grouped in one tree of class "titlegrob".
tg <- gTree(
  children = gList(
    textGrob(
      "Membership Status vs. Messages Posted",
      y = 1,
      vjust = 1,
      gp = gpar(fontface = "bold", fontsize = 16)
    ),
    textGrob(
      "Messages posted are rapidly increasing, despite active members staying fairly constant",
      y = 0,
      vjust = 0,
      gp = gpar(fontsize = 11, col = "grey20")
    )
  ),
  cl = "titlegrob"
)
# heightDetails method for "titlegrob" objects: reports the height of the
# tree as the sum of the heights of its children.
#
# x: a grob with a `children` list.
# Returns the result of sum() applied to the children's grobHeight() units.
heightDetails.titlegrob <- function(x) {
  child_heights <- lapply(x$children, grobHeight)
  do.call(sum, child_heights)
}
# Create the final plot: both panels side by side (legends suppressed) in the
# top row, the shared legend in a thin bottom row, and the title grob on top.
p3 <- grid.arrange(
  arrangeGrob(
    p1 + theme(legend.position = "none"),
    p2 + theme(legend.position = "none"),
    nrow = 1
  ),
  mylegend,
  nrow = 2,
  heights = c(10, 1),
  top = tg
)
# Write the arranged figure to disk.
ggsave(filename = "R4DS-community-stats.png", plot = p3)