Analisis kelompok sangat populer dalam pemasaran . Popularitasnya kemungkinan besar karena kemudahan algoritma dan perhitungannya. Tidak ada konsep matematika yang serius di dasar, matematika dasar dilakukan di excel. Dari sudut pandang untuk memperoleh wawasan, analisis kelangsungan hidup jauh lebih menarik.
Namun demikian, kami percaya bahwa ada tugas seperti itu dan itu harus diselesaikan. Mencari paket dan fungsi siap pakai tidak menarik - matematikanya sederhana, ada banyak pengaturan. Di bawah ini adalah contoh implementasi yang mungkin (tanpa fiksasi khusus pada kecepatan eksekusi), seluruh kode untuk beberapa lusin baris.
Ini adalah kelanjutan dari serangkaian publikasi sebelumnya .
Beberapa kode
Saat membuat set pengujian, kami mungkin tidak terlalu fokus pada zona waktu, sama saja, datanya acak.
# 15
set.seed(42)
events_dt <- tibble(user_id = 1000:9000) %>%
mutate(birthday = Sys.Date() + as.integer(rexp(n(), 1/10))) %>%
rowwise() %>%
mutate(timestamp = list(as_datetime(birthday) + 24*60*60 * (
rexp(10^3, rate = 1/runif(1, 2, 25))))) %>%
ungroup() %>%
unnest(timestamp) %>%
#
filter(timestamp >= quantile(timestamp, probs = 0.1),
timestamp <= quantile(timestamp, probs = 0.95)) %>%
mutate(date = as_date(timestamp)) %>%
select(user_id, date) %>%
setDT(key = c("user_id", "date")) %>%
#
unique()
Mari kita lihat distribusi kumulatif yang dihasilkan
ggplot(events_dt, aes(date)) + geom_histogram()

Langkah 1. Membentuk panduan pengguna
" ", .. , . data.table
.
users_dict <- events_dt[, .(birthday = head(date, 1)), by = user_id] %>%
#
.[, week_start := floor_date(.BY[[1]], unit = "week"), by = birthday] %>%
#
.[, cohort := stri_c(
lubridate::isoyear(.BY[[1]]),
sprintf("%02d", lubridate::isoweek(.BY[[1]])),
sep = "/"), by = week_start]
# ,
as_tibble(janitor::tabyl(users_dict, birthday))

2.
.
. .
cohort_dict <- unique(users_dict[, .(cohort, week_start)])
cohort_tbl <- users_dict[events_dt, on = "user_id"] %>%
#
.[, rel_week := floor(as.numeric(difftime(date, birthday, units = "week")))] %>%
# 10
.[rel_week <= 9] %>%
#
unique(by = c("user_id", "cohort", "rel_week")) %>%
#
.[, .N, by = .(cohort, rel_week)] %>%
.[, rate := N/max(N), by = cohort]
3.
1. ggplot
# ggplot
data_tbl <- cohort_tbl %>%
#
left_join(cohort_dict)
data_tbl %>%
mutate(cohort_group = forcats::fct_reorder(cohort, week_start, .desc = TRUE)) %>%
ggplot(mapping = aes(x = rel_week, y = cohort_group, fill = rate)) +
geom_tile() +
geom_text(aes(label = N), colour = "darkgray") +
labs(x = " ",
y = " ",
fill = "\n",
title = "graph_title") +
scale_fill_viridis_c(option = "inferno") +
scale_x_continuous(breaks = scales::breaks_width(1)) +
theme_minimal() +
theme(panel.grid = element_blank())

2. gt
, .
# -
data_tbl <- cohort_tbl %>%
pivot_longer(cols = c(N, rate)) %>%
pivot_wider(names_from = rel_week, values_from = value) %>%
#
left_join(cohort_dict) %>%
arrange(week_start, desc(name))
odd_rows <- seq(1, to = nrow(data_tbl), by = 2)
even_rows <- seq(2, to = nrow(data_tbl), by = 2)
tab <- data_tbl %>%
mutate(cohort = if_else(rep(c(TRUE, FALSE), length.out = nrow(.)),
cohort, "")) %>%
select(-name, -week_start) %>%
gt(rowname_col = "cohort") %>%
fmt_percent(columns = matches("[0-9]+"),
rows = odd_rows,
decimals = 0, pattern = "<big>{x}</big>") %>%
fmt_missing(columns = everything(),
missing_text = "---") %>%
tab_stubhead(label = " ") %>%
tab_spanner(label = " ",
columns = everything()) %>%
tab_header(title = "") %>%
data_color(columns = everything(),
colors = scales::col_numeric(palette = "inferno",
domain = c(0, 1),
alpha = 0.6,
na.color = "lightgray")) %>%
tab_options(
table.font.size = "smaller",
data_row.padding = px(1),
table.width = pct(75)
) %>%
tab_style(
style = list(
cell_fill(color = "white"),
cell_text(style = "italic"),
cell_borders(sides = "bottom")
),
locations = cells_body(
columns = everything(),
rows = even_rows)
) %>%
tab_style(
style = list(
cell_borders(sides = "top")
),
locations = cells_body(
columns = everything(),
rows = odd_rows)
)
tab

, .
Publikasi sebelumnya - βR dan bekerja dengan waktu. Ada apa di balik layar? " ...