---
title: "Analysis"
format: html
editor: visual
---

# Setup

```{r}
library(dplyr)
library(cluster)
library(tidyr)
library(highcharter)
library(ggplot2)
```

# Data

```{r}
# Column 1 is read as text, columns 2-3 as factors and the remaining six
# columns as numeric (presumably Role, Location and the six purchase
# amounts used below -- confirm against the header of Data.csv).
dat <- read.csv(
  "Data.csv",
  colClasses = c(
    "character", "factor", "factor",
    "numeric", "numeric", "numeric",
    "numeric", "numeric", "numeric"
  )
)

# Treat a missing numeric value as zero. Only numeric columns are touched:
# writing 0 into a factor column raises an "invalid factor level" warning
# and writing it into the character column would invent the value "0".
dat <- dat |>
  mutate(across(where(is.numeric), \(x) replace_na(x, 0)))
```

```{r}
# Features used for clustering: six purchase amounts plus two categoricals.
purchase_cols <- c(
  "Self.Purchase1", "Self.Purchase2", "Self.Purchase3",
  "Other.Purchase1", "Other.Purchase2", "Other.Purchase3"
)
selected_cols <- c(purchase_cols, "Role", "Location")

# Subset the data frame to the clustering features and make sure the two
# categorical columns are factors (a no-op when they already are).
df_subset <- dat[selected_cols]
df_subset$Role <- as.factor(df_subset$Role)
df_subset$Location <- as.factor(df_subset$Location)

# One-hot encode Role and Location. Dropping the intercept alone only gives
# the first factor a full set of indicator columns; the second would still
# lose its first level. Supplying identity contrasts for both factors
# yields one indicator column per level, so every level is equally far from
# every other level in the k-means distance.
df_encoded <- model.matrix(
  ~ Role + Location - 1,
  data = df_subset,
  contrasts.arg = list(
    Role = contrasts(df_subset$Role, contrasts = FALSE),
    Location = contrasts(df_subset$Location, contrasts = FALSE)
  )
)

# model.matrix() silently drops rows with a missing factor value, which
# would make the cbind() below fail with an unhelpful message.
stopifnot(
  "Role/Location must not contain NA (model.matrix drops such rows)" =
    nrow(df_encoded) == nrow(dat)
)

# Standardise the purchase amounts so that no single column dominates the
# Euclidean distance, then append the indicator columns.
numeric_cols <- dat[, purchase_cols]
scaled_data <- scale(numeric_cols)
final_data <- cbind(scaled_data, df_encoded)

# Elbow plot: total within-cluster sum of squares for k = 1..10.
# kmeans() starts from random centres, so fix the seed for reproducibility.
set.seed(123)
wss <- vapply(
  seq_len(10),
  \(k) kmeans(final_data, centers = k, nstart = 10)$tot.withinss,
  numeric(1)
)
plot(
  1:10, wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within-cluster Sum of Squares"
)
```

```{r}
# Number of segments (presumably read off the elbow plot above -- confirm).
optimal_k <- 4

# Fixed seed so that cluster labels are stable between renders; the
# label-based filters further down (e.g. Cluster == 1) depend on it.
set.seed(123)
kmeans_result <- kmeans(final_data, centers = optimal_k, nstart = 10)
dat$Cluster <- kmeans_result$cluster
```

```{r}
# Profile of the rows assigned to cluster 1.
summary(dat[dat$Cluster == 1, ])

# Yearly totals (self + other purchases) for the Curacao location, reshaped
# to one row per year for plotting.
filter.curr <- dat |>
  filter(Location == "Curacao") |>
  summarize(
    yr.1 = sum(Self.Purchase1) + sum(Other.Purchase1),
    yr.2 = sum(Self.Purchase2) + sum(Other.Purchase2),
    yr.3 = sum(Self.Purchase3) + sum(Other.Purchase3)
  ) |>
  pivot_longer(c(yr.1, yr.2, yr.3)) |>
  rename(Year = name, Revenue = value)

# NOTE(review): the title says "self purchase" but the plotted values add
# self and other purchases -- confirm which of the two is intended.
hchart(filter.curr, "column", hcaes(x = Year, y = Revenue)) |>
  hc_title(text = "Revenue from self purchase")
# Distinct roles present in the data.
unique(dat$Role)

# Persist the data together with the assigned cluster labels.
write.csv(dat, "DataSegmented.csv", row.names = FALSE)

# Self-purchase totals for period 3 vs period 2.
dat |>
  summarise(
    current.year = sum(Self.Purchase3),
    last.year = sum(Self.Purchase2)
  )

# Attach per-cluster totals to every row. Because the sums are computed per
# cluster, all rows of a cluster share the same (self, other) coordinates;
# presumably that is why the scatter plot below applies a large jitter.
# ungroup() so the result does not carry the grouping into later steps.
summ.dat <- dat |>
  group_by(Cluster) |>
  mutate(
    self = sum(Self.Purchase1) + sum(Self.Purchase2) + sum(Self.Purchase3),
    other = sum(Other.Purchase1) + sum(Other.Purchase2) + sum(Other.Purchase3)
  ) |>
  ungroup()

hchart(
  summ.dat,
  type = "scatter",
  hcaes(x = self, y = other, color = factor(Cluster))
) |>
  hc_plotOptions(
    scatter = list(jitter = list(x = 10000000, y = 10000000))
  ) |>
  hc_title(text = "K-means Clustering Visualization") |>
  hc_xAxis(title = list(text = "Self")) |>
  hc_yAxis(title = list(text = "Other"))
```

```{r}
# Stacked columns: total self vs other purchases per cluster.
dat |>
  group_by(Cluster) |>
  summarise(
    self = sum(Self.Purchase1) + sum(Self.Purchase2) + sum(Self.Purchase3),
    other = sum(Other.Purchase1) + sum(Other.Purchase2) + sum(Other.Purchase3)
  ) |>
  pivot_longer(c(self, other)) |>
  hchart(
    "column",
    hcaes(x = Cluster, y = value, group = name),
    stacking = "normal"
  ) |>
  hc_colors(c("#0073C2FF", "#EFC000FF")) |>
  hc_title(text = "Revenue by segment") |>
  hc_xAxis(title = list(text = "Segment")) |>
  hc_yAxis(title = list(text = "Revenue"))
```

```{r}
```