You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

130 lines
3.4 KiB

---
title: "Analysis"
format: html
editor: visual
---
# Setup
```{r}
library(dplyr)
library(cluster)
library(tidyr)
library(highcharter)
library(ggplot2)
```
# Data
```{r}
# Load the raw data. The nine columns are read as: one character column,
# two factor columns, then six numeric columns (presumably an identifier,
# Role, Location and the six purchase amounts -- confirm against Data.csv).
dat <- read.csv(
  "Data.csv",
  colClasses = c(
    "character", "factor", "factor",
    "numeric", "numeric", "numeric",
    "numeric", "numeric", "numeric"
  )
)
# Treat missing amounts as zero, but only in numeric columns. Assigning 0
# across the whole data frame would warn on factor columns ("invalid factor
# level") and would write the string "0" into the character column.
dat[] <- lapply(dat, function(column) {
  if (is.numeric(column)) {
    replace(column, is.na(column), 0)
  } else {
    column
  }
})
```
```{r}
# Build the k-means feature matrix (scaled purchase amounts plus one-hot
# encoded Role and Location) and draw an elbow plot for k = 1..10.

# Numeric purchase columns used as clustering features.
purchase_cols <- c(
  "Self.Purchase1", "Self.Purchase2", "Self.Purchase3",
  "Other.Purchase1", "Other.Purchase2", "Other.Purchase3"
)
# All columns used for clustering (purchase amounts, "Role" and "Location").
selected_cols <- c(purchase_cols, "Role", "Location")
# Subset the data frame to include only the selected columns.
df_subset <- dat[selected_cols]
# Make sure the categorical columns are factors.
df_subset$Role <- as.factor(df_subset$Role)
df_subset$Location <- as.factor(df_subset$Location)
# One-hot encode Role and Location. Dropping the intercept (`- 1`) only gives
# the FIRST factor a full set of indicator columns; later factors would still
# lose their first level. Passing identity contrasts keeps every level of both
# factors, so no category is treated as a hidden reference level when k-means
# computes distances.
df_encoded <- model.matrix(
  ~ Role + Location - 1,
  data = df_subset,
  contrasts.arg = list(
    Role = contrasts(df_subset$Role, contrasts = FALSE),
    Location = contrasts(df_subset$Location, contrasts = FALSE)
  )
)
# Centre and scale the purchase amounts so no single column dominates.
numeric_cols <- df_subset[purchase_cols]
scaled_data <- scale(numeric_cols)
final_data <- cbind(scaled_data, df_encoded)
# k-means uses random starting centres; fix the seed so the elbow curve is
# the same on every render.
set.seed(42)
max_k <- 10
# Total within-cluster sum of squares for each candidate number of clusters.
wss <- vapply(
  seq_len(max_k),
  function(k) kmeans(final_data, centers = k, nstart = 10)$tot.withinss,
  numeric(1)
)
plot(
  seq_len(max_k), wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within-cluster Sum of Squares"
)
```
```{r}
# Fit the final k-means model and attach each row's cluster label to `dat`.
# NOTE(review): k = 4 is presumably read off the elbow plot -- confirm.
optimal_k <- 4
# k-means starts from random centres, so without a fixed seed the cluster
# numbering (and the exported segmentation) can change between renders.
# Later chunks refer to clusters by number, so keep the labels reproducible.
set.seed(42)
kmeans_result <- kmeans(final_data, centers = optimal_k, nstart = 10)
dat$Cluster <- kmeans_result$cluster
```
```{r}
# Descriptive statistics for the rows assigned to cluster 1.
summary(dat[dat$Cluster == 1, ])

# Total revenue (self + other purchases) for Curacao in each of the three
# periods, reshaped to one row per period for charting.
filter.curr <- dat |>
  filter(Location == "Curacao") |>
  summarize(
    yr.1 = sum(Self.Purchase1) + sum(Other.Purchase1),
    yr.2 = sum(Self.Purchase2) + sum(Other.Purchase2),
    yr.3 = sum(Self.Purchase3) + sum(Other.Purchase3)
  ) |>
  pivot_longer(c(yr.1, yr.2, yr.3)) |>
  rename(Year = name, Revenue = value)

# Column chart of the Curacao totals. The title names what is actually
# plotted: both self and other purchases, for Curacao only.
hchart(
  filter.curr,
  "column",
  hcaes(x = Year, y = Revenue)
) |>
  hc_title(text = "Curacao revenue by year (self + other purchases)")
# Show the distinct values of the Role column.
unique(dat[["Role"]])

# Save the data frame (with the Cluster column added above) to disk,
# omitting row names.
write.csv(dat, file = "DataSegmented.csv", row.names = FALSE)

# Self-purchase totals for the third period ("current") and the second
# period ("last").
summarise(
  dat,
  current.year = sum(Self.Purchase3),
  last.year = sum(Self.Purchase2)
)
# Scatter plot of per-cluster revenue totals, coloured by cluster.
# `mutate()` (not `summarise()`) keeps one row per original observation, so
# every row in a cluster carries the same cluster-wide `self` / `other`
# totals; the large jitter spreads those otherwise coincident points.
# NOTE(review): if one point per customer was intended, drop the `sum()`
# calls and add the three columns row-wise instead -- confirm intent.
summ.dat <- dat |>
  group_by(Cluster) |>
  mutate(
    self = sum(Self.Purchase1) + sum(Self.Purchase2) + sum(Self.Purchase3),
    other = sum(Other.Purchase1) + sum(Other.Purchase2) + sum(Other.Purchase3)
  ) |>
  # Drop the grouping so later operations on summ.dat are not done per cluster.
  ungroup()

hchart(
  summ.dat,
  type = "scatter",
  hcaes(x = self, y = other, color = factor(Cluster))
) |>
  hc_plotOptions(
    scatter = list(jitter = list(x = 10000000, y = 10000000))
  ) |>
  hc_title(text = "K-means Clustering Visualization") |>
  hc_xAxis(title = list(text = "Self")) |>
  hc_yAxis(title = list(text = "Other"))
```
```{r}
# Stacked column chart of total revenue per cluster ("segment"), split into
# self-purchase and other-purchase revenue summed over the three periods.
dat |>
  group_by(Cluster) |>
  summarise(
    self = sum(Self.Purchase1) + sum(Self.Purchase2) + sum(Self.Purchase3),
    other = sum(Other.Purchase1) + sum(Other.Purchase2) + sum(Other.Purchase3)
  ) |>
  # Long format: one row per (cluster, revenue type) for grouped plotting.
  pivot_longer(c(self, other)) |>
  hchart(
    "column",
    hcaes(x = Cluster, y = value, group = name),
    stacking = "normal"
  ) |>
  hc_colors(c("#0073C2FF", "#EFC000FF")) |>
  hc_title(text = "Revenue by segment") |>
  hc_xAxis(title = list(text = "Segment")) |>
  hc_yAxis(title = list(text = "Revenue"))
```
```{r}
```