---
title: "Analysis"
format: html
editor: visual
---

# Setup

```{r}
library(dplyr)
library(cluster)
library(tidyr)
library(highcharter)
library(ggplot2)
```


# Data

```{r}
# Load the raw data set. Column types are fixed up front: the first column
# is read as text, the next two as factors and the remaining six as numeric.
dat <- read.csv(
  "Data.csv",
  colClasses = c("character", rep("factor", 2), rep("numeric", 6))
)

# Overwrite missing cells with 0.
# NOTE(review): a factor column cannot take the value 0 (R warns and keeps
# NA), so this presumably targets the numeric columns only -- confirm.
dat <- replace(dat, is.na(dat), 0)
```


```{r}
# Build the matrix used for k-means (standardized purchase columns plus
# indicator columns for Role and Location) and draw an elbow plot of the
# total within-cluster sum of squares for k = 1..max_k.

# Columns used for clustering (including "Role" and "Location")
categorical_cols <- c("Role", "Location")
selected_cols <- c(
  "Self.Purchase1", "Self.Purchase2", "Self.Purchase3",
  "Other.Purchase1", "Other.Purchase2", "Other.Purchase3",
  categorical_cols
)

# Subset the data frame to include only the selected columns
df_subset <- dat[selected_cols]

# Make sure the categorical columns are factors (no-op if they already are)
df_subset$Role <- as.factor(df_subset$Role)
df_subset$Location <- as.factor(df_subset$Location)

# One-hot encode "Role" and "Location".
# Dropping the intercept (`- 1`) only gives the FIRST factor a full set of
# indicator columns; later factors are still treatment-coded, i.e. their
# first level is dropped. Supplying identity contrasts yields one indicator
# column per level for both factors, so every pair of levels is equally far
# apart in the distance k-means uses.
df_encoded <- model.matrix(
  ~ Role + Location - 1,
  data = df_subset,
  contrasts.arg = list(
    Role = contrasts(df_subset$Role, contrasts = FALSE),
    Location = contrasts(df_subset$Location, contrasts = FALSE)
  )
)

# Standardize the purchase columns so no single column dominates the distance
numeric_cols <- dat[, setdiff(selected_cols, categorical_cols)]
scaled_data <- scale(numeric_cols)
# A constant column has sd 0 and scales to NaN, which kmeans() rejects;
# such a column carries no information, so set it to 0.
scaled_data[is.nan(scaled_data)] <- 0
final_data <- cbind(scaled_data, df_encoded)

# kmeans() starts from random centers; fix the seed so the elbow curve is
# the same on every render.
set.seed(123)

max_k <- 10L
wss <- numeric(length = max_k)
for (i in seq_len(max_k)) {
  kmeans_result <- kmeans(final_data, centers = i, nstart = 10)
  # tot.withinss is already the total over all clusters
  wss[i] <- kmeans_result$tot.withinss
}

plot(
  seq_len(max_k), wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within-cluster Sum of Squares"
)
```


```{r}
# Fit the final k-means model and store each row's cluster label in `dat`.
# NOTE(review): k = 4 was presumably read off the elbow plot -- confirm.
optimal_k <- 4

# kmeans() starts from random centers, so without a fixed seed the cluster
# numbering (and everything derived from dat$Cluster) can change from one
# render to the next.
set.seed(123)
kmeans_result <- kmeans(final_data, centers = optimal_k, nstart = 10)

dat$Cluster <- kmeans_result$cluster

```


```{r}
# Exploratory summaries and charts on the clustered data. This chunk also
# writes the data, including the Cluster column, to "DataSegmented.csv".

# Descriptive statistics for the rows assigned to cluster 1
summary(dat[dat$Cluster == 1, ])

# Curacao only: self + other purchases summed per purchase column, reshaped
# to long form with columns Year (yr.1, yr.2, yr.3) and Revenue.
filter.curr <- dat |>
  filter(Location == "Curacao") |>
  summarize(
    yr.1 = sum(Self.Purchase1) + sum(Other.Purchase1),
    yr.2 = sum(Self.Purchase2) + sum(Other.Purchase2),
    yr.3 = sum(Self.Purchase3) + sum(Other.Purchase3)
  ) |>
  pivot_longer(c(yr.1, yr.2, yr.3)) |>
  rename(Year = name, Revenue = value)

# NOTE(review): the title says "self purchase", but the values plotted add
# self and other purchases for Curacao -- confirm which one is intended.
hchart(
  filter.curr,
  "column",
  hcaes(x = Year, y = Revenue)
) |>
  hc_title(text = "Revenue from self purchase")

# Distinct roles present in the data
unique(dat$Role)

# Export the data together with the cluster assignment
write.csv(dat, "DataSegmented.csv", row.names = FALSE)

# Overall self-purchase totals for the two most recent purchase columns
dat |>
  summarise(
    current.year = sum(Self.Purchase3),
    last.year = sum(Self.Purchase2)
  )

# Create a scatter plot to visualize the clusters.
# mutate() (not summarise()) keeps one row per observation and attaches the
# cluster-level totals to each of them, so all rows of a cluster share the
# same (self, other) coordinates. ungroup() so the result does not stay
# grouped for later operations.
summ.dat <- dat |>
  group_by(Cluster) |>
  mutate(
    self = sum(Self.Purchase1) + sum(Self.Purchase2) + sum(Self.Purchase3),
    other = sum(Other.Purchase1) + sum(Other.Purchase2) + sum(Other.Purchase3)
  ) |>
  ungroup()

# The large jitter presumably spreads the coincident points of each cluster
# apart so they are visible -- confirm against the rendered chart.
hchart(
  summ.dat,
  type = "scatter",
  hcaes(x = self, y = other, color = factor(Cluster))
) |>
  hc_plotOptions(
    scatter = list(jitter = list(x = 10000000, y = 10000000))
  ) |>
  hc_title(text = "K-means Clustering Visualization") |>
  hc_xAxis(title = list(text = "Self")) |>
  hc_yAxis(title = list(text = "Other"))
```


```{r}
 # Stacked column chart of revenue per cluster ("segment"): one series for
 # self purchases and one for other purchases.
 dat |>
   group_by(Cluster) |>
   summarize(
     self = sum(Self.Purchase1) + sum(Self.Purchase2) + sum(Self.Purchase3),
     other = sum(Other.Purchase1) + sum(Other.Purchase2) + sum(Other.Purchase3)
   ) |>
   # Long form: the totals go to `value`, labelled by `name` (self / other)
   pivot_longer(cols = c(self, other)) |>
   hchart(
     type = "column",
     mapping = hcaes(x = Cluster, y = value, group = name),
     stacking = "normal"
   ) |>
   hc_colors(colors = c("#0073C2FF", "#EFC000FF")) |>
   hc_title(text = "Revenue by segment") |>
   hc_xAxis(title = list(text = "Segment")) |>
   hc_yAxis(title = list(text = "Revenue"))
```


```{r}

```