温馨提示:本文翻译自stackoverflow.com,查看原文请点击:r - How to data wrangle and barplot the proportion without undesired stripes
bar-chart dplyr grouping r ggplotly

r - 如何计算纠缠和绘制比例而没有不想要的条纹

发布于 2020-04-15 12:11:40

请在下面的屏幕截图中找到输入数据和预期输出:

预期

但是,当前图具有以下代码:

当前

我觉得我太复杂了。但是,在此过程中,我共享了输入数据和预期数据以及困难的代码。你能帮我们吗

主要有2个问题。1.如果使用了mutate,则在使用“汇总”的图上会出现不希望有的条纹,那么它就不会加到100%。2.我们如何提取出最杰出的参与者?


# Input data
df <- tibble(
  country = c(rep(c("India","USA","Germany","Africa"), each = 8)),
  type = c("sms","Other","whatsapp","web","online","shiny","whatsapp","whatsapp",
              "sms","sms","sms","web","web","Other","online","whatsapp",
              "sms","Other","whatsapp","shiny","online","shiny","whatsapp","whatsapp",
              "sms","sms","sms","shiny","online","Other","online","Other"
  ),
  cust = rep(c("google","Apple","wallmart","pg"),8),
  quantity = c(10,20,30,40,50,60,70,80,
               90,100,15,25,35,45,55,65,
               75,85,95,105,10,15,20,25,
               30,35,40,45,50,55,60,65)
)

# Without Customer
df %>% 
  group_by(country,type) %>%
  summarise(kpi_wo_cust = sum(quantity)) %>% 
  ungroup() -> df_wo_cust

# With Customer
df %>% 
  group_by(country,type,cust) %>%
  summarise(kpi_cust = sum(quantity)) %>% 
  ungroup() -> df_cust

df_combo <- left_join(df_cust, df_wo_cust, by = c("country","type"))
df_combo %>% glimpse()

# Aggregated data for certain KPIs for final plot
df_aggr <- df_combo %>% 
  group_by(country,type) %>%
  mutate(kpi_cust_total = sum(kpi_cust),
         per_kpi_cust = 100 * (kpi_cust/kpi_cust_total)) %>%
  group_by(country) %>%
  # In order to except from repeated counting, selecting unique()
  mutate(kpi_cust_uniq_total = sum(kpi_cust) %>% unique(),
         per_unq_kpi_cust = 100 * (kpi_cust/kpi_cust_uniq_total) %>% round(4)) 

#
plt = df_aggr %>% ungroup() %>%#glimpse()

# In order to obtain theTop 2 customers (Major contributor) within country and type 
# However, if this code is used, there is an error
# group_by(country, type) %>%
 # nest()  %>%
 # mutate(top_cust = purrr::map_chr(data, function(x){
 #   x %>% arrange(desc(per_kpi_cust)) %>%
 #     top_n(2,per_kpi_cust) %>%
 #     summarise(Cust = paste(cust,round(per_kpi_cust,2), collapse = "<br>")) %>%
 #     pull(cust)
 # })#,data = NULL
 # ) %>%
 # unnest(cols = data) %>%
  group_by(country, type) %>%
# If mutate is used, undesired stripes appear on the plot 
  # Summarize used, then it is not adding to 100% 
  mutate(avg_kpi_cust = per_unq_kpi_cust %>% mean()) %>%
  #summarise(avg_kpi_cust = per_unq_kpi_cust %>% mean()) %>%
  ggplot(aes(x = country, 
             y = avg_kpi_cust, 
             fill = type,  
             text = paste('<br>proportion: ', round(avg_kpi_cust,2), "%",
                          "<br>country:",country
                          ))) +
  geom_bar(stat = "identity"#, position=position_dodge()
  ) +
  coord_flip() +
  theme_bw()


ggplotly(plt)

查看更多

提问者
Abhishek
被浏览
49
Abhishek 2020-02-06 21:29

关键是在mutate()之后使用distinct()而不是summarise()。此外,mean()是较早使用的错误函数,而不是sum(),这导致了不完整的barplot。

library(tidyverse)
library(plotly)

# Input data
df <- tibble(
  country = c(rep(c("India","USA","Germany","Africa"), each = 8)),
  type = c("sms","Other","whatsapp","web","online","shiny","whatsapp","whatsapp",
              "sms","sms","sms","web","web","Other","online","whatsapp",
              "sms","Other","whatsapp","shiny","online","shiny","whatsapp","whatsapp",
              "sms","sms","sms","shiny","online","Other","online","Other"
  ),
  cust = rep(c("google","Apple","wallmart","pg"),8),
  quantity = c(10,20,30,40,50,60,70,80,
               90,100,15,25,35,45,55,65,
               75,85,95,105,10,15,20,25,
               30,35,40,45,50,55,60,65)
)

# Without Customer
df %>% 
  group_by(country,type) %>%
  summarise(kpi_wo_cust = sum(quantity)) %>% 
  ungroup() -> df_wo_cust

# With Customer
df %>% 
  group_by(country,type,cust) %>%
  summarise(kpi_cust = sum(quantity)) %>% 
  ungroup() -> df_cust

df_combo <- left_join(df_cust, df_wo_cust, by = c("country","type"))
df_combo %>% glimpse()

# Aggregated data for certain KPIs for final plot
df_aggr <- df_combo %>% 
  group_by(country,type) %>%
  mutate(kpi_cust_total = sum(kpi_cust),
         per_kpi_cust = 100 * (kpi_cust/kpi_cust_total)) %>%
  group_by(country) %>%
  # In order to except from repeated counting, selecting unique()
  mutate(kpi_cust_uniq_total = sum(kpi_cust) %>% unique(),
         per_unq_kpi_cust = 100 * (kpi_cust/kpi_cust_uniq_total) %>% round(4)) 

plt = df_aggr %>% ungroup() %>%
# In order to diplay Top 2 customers (Major contributor) within country and type 
group_by(country, type) %>%
nest()  %>%
mutate(top_cust = purrr::map_chr(data, function(x){
  x %>% arrange(desc(per_kpi_cust)) %>%
    top_n(2,per_kpi_cust) %>%
    summarise(Cust = paste(cust,round(per_kpi_cust,2), collapse = "<br>")) %>%
    pull(Cust)
})) %>%
unnest(cols = data) %>%
  group_by(country, type) %>%
# If mutate is used, undesired stripes appear on the plot 
  # Summarize used, then it is not adding to 100%.
  # So distinct was used
  mutate(avg_kpi_cust = per_unq_kpi_cust %>% sum()) %>%
  ungroup() %>%
  distinct(country, type, .keep_all = T) %>%
  ggplot(aes(x = country, 
             y = avg_kpi_cust, 
             fill = type,  
             text = top_cust
                          )) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_bw()

ggplotly(plt, tooltip = "text")