You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
130 lines
3.4 KiB
130 lines
3.4 KiB
---
|
|
title: "Analysis"
|
|
format: html
|
|
editor: visual
|
|
---
|
|
|
|
# Setup
|
|
|
|
```{r}
|
|
library(dplyr)
|
|
library(cluster)
|
|
library(tidyr)
|
|
library(highcharter)
|
|
library(ggplot2)
|
|
```
|
|
|
|
|
|
# Data
|
|
|
|
```{r}
|
|
# Load the raw purchase data. Column classes are assigned by position: one
# character column, two factor columns, then six numeric columns.
dat <- read.csv(
  "Data.csv",
  colClasses = c(
    "character", "factor", "factor",
    "numeric", "numeric", "numeric",
    "numeric", "numeric", "numeric"
  )
)

# Treat a missing numeric value as zero. Only numeric columns are filled:
# a blanket `dat[is.na(dat)] <- 0` would also write "0" into the character
# column and raise "invalid factor level" warnings on the factor columns.
dat[] <- lapply(dat, function(column) {
  if (is.numeric(column)) {
    column[is.na(column)] <- 0
  }
  column
})
|
|
```
|
|
|
|
|
|
```{r}
|
|
# Build the clustering feature matrix (scaled purchase columns plus indicator
# columns for Role and Location) and draw an elbow plot of the total
# within-cluster sum of squares for k = 1..10.

# Numeric purchase columns used as features.
purchase_cols <- c(
  "Self.Purchase1", "Self.Purchase2", "Self.Purchase3",
  "Other.Purchase1", "Other.Purchase2", "Other.Purchase3"
)

# Select the columns for clustering (including "Role" and "Location")
selected_cols <- c(purchase_cols, "Role", "Location")

# Subset the data frame to include only the selected columns
df_subset <- dat[selected_cols]

# Convert the "Role" and "Location" columns to factors (if they are not already)
df_subset$Role <- as.factor(df_subset$Role)
df_subset$Location <- as.factor(df_subset$Location)

# One-hot encode "Role" and "Location". Removing the intercept (`- 1`) on its
# own only gives the FIRST factor a full set of indicator columns; the second
# factor would still lose its reference level. Identity contrasts
# (`contrasts = FALSE`) keep one indicator column per level for both factors.
df_encoded <- model.matrix(
  ~ Role + Location - 1,
  data = df_subset,
  contrasts.arg = lapply(
    df_subset[c("Role", "Location")], contrasts,
    contrasts = FALSE
  )
)

# Centre and scale the purchase columns so they contribute comparably to the
# Euclidean distances used by k-means.
numeric_cols <- dat[, purchase_cols]
scaled_data <- scale(numeric_cols)

# model.matrix() drops rows with a missing factor value by default, which
# would misalign the two matrices; fail loudly instead of binding them.
stopifnot(nrow(df_encoded) == nrow(scaled_data))
final_data <- cbind(scaled_data, df_encoded)

# k-means starts from random centres; fix the seed so the elbow curve is
# reproducible between renders.
set.seed(123)

# Total within-cluster sum of squares for each candidate number of clusters.
k_values <- 1:10
wss <- vapply(
  k_values,
  function(k) kmeans(final_data, centers = k, nstart = 10)$tot.withinss,
  numeric(1)
)

plot(k_values, wss, type = "b", xlab = "Number of Clusters", ylab = "Within-cluster Sum of Squares")
|
|
```
|
|
|
|
|
|
```{r}
|
|
# Number of clusters used for the final segmentation.
# NOTE(review): presumably read off the elbow plot — confirm.
optimal_k <- 4

# k-means starts from random centres, so both the partition and the arbitrary
# cluster numbering can change from run to run. Later code filters on
# `Cluster == 1` and exports the labels, so fix the seed to keep them stable.
set.seed(123)
kmeans_result <- kmeans(final_data, centers = optimal_k, nstart = 10)

# Attach each row's cluster label to the original data.
dat$Cluster <- kmeans_result$cluster
|
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
# Column-wise summary of the rows assigned to cluster 1.
summary(dat[dat$Cluster == 1, ])

# Total revenue (self + other purchases) per period for the rows whose
# Location is "Curacao", reshaped to one row per period with columns
# Year and Revenue.
filter.curr <- dat |>
  filter(Location == "Curacao") |>
  summarize(
    yr.1 = sum(Self.Purchase1) + sum(Other.Purchase1),
    yr.2 = sum(Self.Purchase2) + sum(Other.Purchase2),
    yr.3 = sum(Self.Purchase3) + sum(Other.Purchase3)
  ) |>
  pivot_longer(
    cols = c(yr.1, yr.2, yr.3),
    names_to = "Year",
    values_to = "Revenue"
  )

# Column chart of the per-period totals.
# NOTE(review): the title says "self purchase" but the plotted values also
# include the Other.Purchase columns — confirm which one is intended.
hchart(filter.curr, type = "column", hcaes(x = Year, y = Revenue)) |>
  hc_title(text = "Revenue from self purchase")
|
|
|
|
# Distinct values found in the Role column.
unique(dat[["Role"]])

# Write the data frame (with whatever columns it currently holds, e.g. the
# Cluster label when the clustering chunk has run) to disk, without row names.
write.csv(dat, file = "DataSegmented.csv", row.names = FALSE)

# Overall totals of Self.Purchase3 and Self.Purchase2, labelled as the
# current and the previous year respectively.
summarise(
  dat,
  current.year = sum(Self.Purchase3),
  last.year = sum(Self.Purchase2)
)
|
|
library(ggplot2)

# Create a scatter plot to visualize the clusters.
# Each row gets its cluster's total self-purchase and other-purchase revenue.
# NOTE(review): mutate() (not summarise()) keeps one row per input row, so
# all points of a cluster share the same coordinates and are only separated
# by the jitter below — confirm this is the intended picture.
summ.dat <- dat |>
  group_by(Cluster) |>
  mutate(
    self = sum(Self.Purchase1) + sum(Self.Purchase2) + sum(Self.Purchase3),
    other = sum(Other.Purchase1) + sum(Other.Purchase2) + sum(Other.Purchase3)
  ) |>
  # Drop the grouping so it does not leak into later operations on summ.dat.
  ungroup()

hchart(
  summ.dat,
  type = "scatter",
  hcaes(x = self, y = other, color = factor(Cluster))
) |>
  hc_plotOptions(
    # Spread the coinciding points so individual markers become visible.
    scatter = list(jitter = list(x = 10000000, y = 10000000))
  ) |>
  hc_title(text = "K-means Clustering Visualization") |>
  hc_xAxis(title = list(text = "Self")) |>
  hc_yAxis(title = list(text = "Other"))
|
|
```
|
|
|
|
|
|
|
|
```{r}
|
|
# Stacked column chart of revenue per cluster: for every cluster, sum the
# three Self.Purchase columns ("self") and the three Other.Purchase columns
# ("other"), reshape to long form, and stack the two series per cluster.
dat |>
  group_by(Cluster) |>
  summarise(
    self = sum(Self.Purchase1) + sum(Self.Purchase2) + sum(Self.Purchase3),
    other = sum(Other.Purchase1) + sum(Other.Purchase2) + sum(Other.Purchase3)
  ) |>
  # Long form: one row per (Cluster, series) with columns `name` and `value`.
  pivot_longer(
    cols = c(self, other),
    names_to = "name",
    values_to = "value"
  ) |>
  hchart(
    "column",
    hcaes(x = Cluster, y = value, group = name),
    stacking = "normal"
  ) |>
  hc_colors(c("#0073C2FF", "#EFC000FF")) |>
  hc_title(text = "Revenue by segment") |>
  hc_xAxis(title = list(text = "Segment")) |>
  hc_yAxis(title = list(text = "Revenue"))
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
|
```
|
|
|
|
|
|
|
|
|