segmentation/Analysis.qmd


								---

								title: "Analysis"

								format: html

								editor: visual

								---


								# Setup


								```{r}

								library(dplyr)

								library(cluster)

								library(tidyr)

								library(highcharter)

								library(ggplot2)

								```


								# Data


								```{r}

								# Load the raw data with explicit column types: one character column,
								# two factor columns, then six numeric columns.
								dat <- read.csv(
								  "Data.csv",
								  colClasses = c(
								    "character", "factor", "factor",
								    "numeric", "numeric", "numeric",
								    "numeric", "numeric", "numeric"
								  )
								)

								# Treat missing values as zero, but only in the numeric columns.
								# A blanket `dat[is.na(dat)] <- 0` also targets the factor columns
								# (0 is not a valid level, so R warns and leaves the NA in place) and
								# the character column (NA silently becomes the string "0").
								is_num <- vapply(dat, is.numeric, logical(1))
								dat[is_num] <- lapply(dat[is_num], function(col) {
								  col[is.na(col)] <- 0
								  col
								})

								```


								```{r}

								# Build the clustering feature matrix and draw an elbow plot of the
								# total within-cluster sum of squares for a range of cluster counts.

								# Columns used for clustering: six purchase amounts plus "Role" and
								# "Location".
								purchase_cols <- c(
								  "Self.Purchase1", "Self.Purchase2", "Self.Purchase3",
								  "Other.Purchase1", "Other.Purchase2", "Other.Purchase3"
								)
								selected_cols <- c(purchase_cols, "Role", "Location")

								# Subset the data frame to include only the selected columns
								df_subset <- dat[selected_cols]

								# Make sure the categorical columns are factors (no-op if they already are)
								df_subset$Role <- as.factor(df_subset$Role)
								df_subset$Location <- as.factor(df_subset$Location)

								# Indicator (dummy) encoding for "Role" and "Location". The `- 1` drops
								# the intercept, so Role keeps one column per level; Location is still
								# encoded with treatment contrasts (its first level is the baseline).
								df_encoded <- model.matrix(~ Role + Location - 1, data = df_subset)

								# Standardise the purchase amounts so no single column dominates the
								# Euclidean distances, then append the indicator columns unscaled.
								numeric_cols <- df_subset[purchase_cols]
								scaled_data <- scale(numeric_cols)
								final_data <- cbind(scaled_data, df_encoded)

								# kmeans() uses random starting centres; fix the seed so the elbow
								# curve is the same on every render.
								set.seed(123)

								# Try up to 10 clusters, but never more than there are distinct rows:
								# kmeans() stops with an error when asked for more centres than that.
								k_values <- seq_len(min(10L, nrow(unique(final_data))))

								# tot.withinss is already a single number, so no extra sum() is needed.
								wss <- vapply(
								  k_values,
								  function(k) kmeans(final_data, centers = k, nstart = 10)$tot.withinss,
								  numeric(1)
								)

								plot(
								  k_values, wss,
								  type = "b",
								  xlab = "Number of Clusters",
								  ylab = "Within-cluster Sum of Squares"
								)

								```


								```{r}

								# Number of segments to fit.
								# NOTE(review): presumably read off the elbow plot - confirm.
								optimal_k <- 4

								# kmeans() starts from randomly chosen centres, so without a seed both
								# the assignments and the (arbitrary) cluster numbers can change from
								# one render to the next. The labels are written to disk and cluster 1
								# is inspected by number further down, so they need to be reproducible.
								set.seed(123)
								kmeans_result <- kmeans(final_data, centers = optimal_k, nstart = 10)

								# Attach each row's cluster label to the original data.
								dat$Cluster <- kmeans_result$cluster


								```


								```{r}

								# Descriptive statistics for the rows assigned to cluster 1.
								summary(dat[dat$Cluster == 1, ])

								# Revenue per period for the "Curacao" location: self plus other
								# purchases, reshaped to one row per period with columns Year / Revenue.
								filter.curr <- dat |>
								  filter(Location == "Curacao") |>
								  summarise(
								    yr.1 = sum(Self.Purchase1) + sum(Other.Purchase1),
								    yr.2 = sum(Self.Purchase2) + sum(Other.Purchase2),
								    yr.3 = sum(Self.Purchase3) + sum(Other.Purchase3)
								  ) |>
								  pivot_longer(
								    cols = everything(),
								    names_to = "Year",
								    values_to = "Revenue"
								  )

								# NOTE(review): the title says "self purchase" but Revenue above adds
								# self and other purchases - confirm which one is intended.
								filter.curr |>
								  hchart("column", hcaes(x = Year, y = Revenue)) |>
								  hc_title(text = "Revenue from self purchase")


								# Show the distinct values of Role.
								unique(dat[["Role"]])

								# Save the data frame as it stands at this point, without row names.
								write.csv(x = dat, file = "DataSegmented.csv", row.names = FALSE)

								# Total self purchases: period 3 ("current") against period 2 ("last").
								summarise(
								  dat,
								  current.year = sum(Self.Purchase3),
								  last.year = sum(Self.Purchase2)
								)

								library(ggplot2)


								# Scatter plot of self vs. other purchase totals, coloured by cluster.

								# Attach each cluster's total self and other purchases to every row,
								# then drop the grouping so summ.dat does not stay grouped for whatever
								# uses it next.
								summ.dat <- dat |>
								  group_by(Cluster) |>
								  mutate(
								    self = sum(Self.Purchase1) + sum(Self.Purchase2) + sum(Self.Purchase3),
								    other = sum(Other.Purchase1) + sum(Other.Purchase2) +
								      sum(Other.Purchase3)
								  ) |>
								  ungroup()

								# NOTE(review): the sums are per cluster, so every row in a cluster has
								# the same (self, other) point; the large jitter looks like it is there
								# to pull those points apart. Confirm whether per-row totals were meant.
								hchart(
								  summ.dat,
								  type = "scatter",
								  hcaes(x = self, y = other, color = factor(Cluster))
								) |>
								  hc_plotOptions(
								    scatter = list(jitter = list(x = 10000000, y = 10000000))
								  ) |>
								  hc_title(text = "K-means Clustering Visualization") |>
								  hc_xAxis(title = list(text = "Self")) |>
								  hc_yAxis(title = list(text = "Other"))

								```


								```{r}

								 # Stacked column chart: total revenue per cluster, split into the
								# "self" and "other" purchase totals.
								dat |>
								  group_by(Cluster) |>
								  summarise(
								    self = sum(Self.Purchase1) + sum(Self.Purchase2) + sum(Self.Purchase3),
								    other = sum(Other.Purchase1) + sum(Other.Purchase2) +
								      sum(Other.Purchase3)
								  ) |>
								  # One row per (cluster, revenue type) so the two types can be stacked.
								  pivot_longer(
								    cols = c(self, other),
								    names_to = "name",
								    values_to = "value"
								  ) |>
								  hchart(
								    "column",
								    hcaes(x = Cluster, y = value, group = name),
								    stacking = "normal"
								  ) |>
								  hc_colors(c("#0073C2FF", "#EFC000FF")) |>
								  hc_title(text = "Revenue by segment") |>
								  hc_xAxis(title = list(text = "Segment")) |>
								  hc_yAxis(title = list(text = "Revenue"))

								```


								```{r}


								```