--- title: "Performance of three circular data clustering algorithms" author: "Tathagata Debnath and Joe Song" date: "Updated: 2021-07-27; 2020-09-05. Created: 2020-08-07" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Performance of three circular data clustering algorithms} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} bibliography: ../inst/REFERENCES.bib --- ```{r runtime-n, results='hide', message=FALSE, warning=FALSE, echo=FALSE} library(ggplot2) library(OptCirClust) library(knitr) library(reshape2) opts_chunk$set(fig.width=6, fig.height=4) FOCC <- c() BOCC <- c() HEUC <- c() number <- c() FOCC_SSQ <- c() BOCC_SSQ <- c() HEUC_SSQ <- c() First <- 1 Prev <- -1 Next <- -1 K <- 3 for(i in seq(10,100, 10 ) ) { n <- i set.seed(1) x <- c(rnorm(n, sd=0.3), rnorm(n, mean=100, sd=0.3), rnorm(n, mean=200, sd=0.3)) Data_Points <- c(x,(x + length(x))) width <- length(Data_Points)/2 Last <- length(Data_Points) - width time <- system.time(result1 <- FramedClust(Data_Points, K, width, First, Last, "linear.polylog")) FOCC <- c(FOCC,as.double(time[1])) time <- system.time(result2 <- FramedClust(Data_Points, K, width, First, Last, "Ckmeans.1d.dp")) BOCC <- c(BOCC,as.double(time[1])) time <- system.time(result3 <- FramedClust(Data_Points, K, width, First, Last, "kmeans")) HEUC <- c(HEUC,as.double(time[1])) number <- c(number, n*3) } FOCC <- FOCC * 1000 BOCC <- BOCC * 1000 HEUC <- HEUC * 1000 df1 <- data.frame("No_Points" = number, "FOCC" = FOCC, "BOCC" = BOCC, "HEUC" = HEUC) df <- reshape2::melt(df1, id.var='No_Points') plot1 <- ggplot2::ggplot(df, aes(x=No_Points, y=value, col=variable)) + geom_line() + geom_point(alpha=3) + labs(y = "Runtime (millisecond)", x = "Number of points (N)") + labs(colour = "Methods") ``` ```{r runtime-k, results='hide', message=FALSE, warning=FALSE, echo=FALSE} FOCC <- c() BOCC <- c() HEUC <- c() clusters <- c() FOCC_SSQ <- c() BOCC_SSQ <- c() HEUC_SSQ <- c() K <- 3 set.seed(1) x <- c(rnorm(50, sd=0.3), rnorm(50, mean=100, sd=0.3), rnorm(50, mean=200, sd=0.3)) Data_Points <- c(x,(x + length(x))) width <- length(Data_Points)/2 Last <- length(Data_Points) - width - 1 for(K in seq(10,100,10)) { ptm <- proc.time() result1 <- FramedClust(Data_Points, K, width, First, Last, "linear.polylog") time <- proc.time() - ptm FOCC <- c(FOCC,as.double(time[3])) FOCC_SSQ <- c(FOCC_SSQ,result1$tot.withinss) ptm <- proc.time() result2 <- FramedClust(Data_Points, K, width, First, Last, "Ckmeans.1d.dp") time <- proc.time() - ptm BOCC <- c(BOCC,as.double(time[3])) BOCC_SSQ <- c(BOCC_SSQ,result2$tot.withinss) ptm <- proc.time() result3 <- FramedClust(Data_Points, K, width, First, Last, "kmeans") time <- proc.time() - ptm HEUC <- c(HEUC,as.double(time[3])) HEUC_SSQ <- c(HEUC_SSQ,result3$tot.withinss) clusters <- c(clusters, K) } FOCC <- FOCC * 1000 BOCC <- BOCC * 1000 HEUC <- HEUC * 1000 df1 <- data.frame("No_Clusters" = clusters, "FOCC" = FOCC, "BOCC" = BOCC, "HEUC" = HEUC) df <- reshape2::melt(df1, id.var='No_Clusters') plot2 <- ggplot2::ggplot(df, aes(x=No_Clusters, y=value, col=variable)) + geom_line() + geom_point(alpha=3) + labs(y = "Runtime (milliseccond)", x = "Number of clusters (K)") + labs(colour = "Methods") ``` ```{r SSQ, results='hide', message=FALSE, warning=FALSE, echo=FALSE} df1 <- data.frame("No_Clusters" = clusters, "FOCC" = FOCC_SSQ, "BOCC" = BOCC_SSQ, "HEUC" = HEUC_SSQ) df <- reshape2::melt(df1, id.var='No_Clusters') df$mysize <- rep(0, nrow(df)) df$mysize[df$variable=="FOCC"] <- 1 plot3 <- ggplot2::ggplot(df, aes(x=No_Clusters, y=value, col=variable, size=mysize)) + geom_line() + geom_point(alpha=3) + labs(y="Within-cluster sum of squared distances", x = "Number of clusters (K)") + labs(colour = "Methods") + scale_size(range = c(2, 4), guide="none") + scale_y_continuous(trans = 'log2') ``` We illustrate the runtime and quality of three circular data clustering algorithms: 1. fast optimal circular clustering (FOCC) [@Debnath21], 2. repeated heuristic $K$-means clustering (HEUC), and 3. brute force optimal clustering (BOCC). ## Runtime as a function of number of points for circular clustering The runtime of both optimal and heuristic algorithms increases with the number of points in the circular data. Empirical runtime of the three algorithms as a function of the number of points $N$ are shown in the figure below. Both the HEUC and the BOCC algorithms increase at a quadratic rate. In contrast, the runtime of FOCC increases at a linear poly-logarithmic rate. Runtime is crucial for processing large datasets. ```{r results='hide', message=FALSE, warning=FALSE, echo=FALSE} plot(plot1) ``` ## Runtime as a function of number of clusters The runtime is also affected by the number of clusters $K$ inside the circular data. The figure below shows this effect on the three algorithms. Empirical runtime of both HEUC and BOCC increases at a similar rate to FOCC over an increasing $K$. ```{r, results='hide', message=FALSE, warning=FALSE, echo=FALSE} plot(plot2) ``` ## Circular clustering optimality as a function of number of clusters The within-cluster sum of squared distances (SSQ) indicates the compactness of a cluster. A low value of SSQ suggests a good compact clustering outcome. The last figure shows the change in SSQ value with number of clusters $K$ in the data. The SSQ values of BOCC and FOCC always remain no more than that of the HEUC algorithm. This indicates the optimal BOCC and FOCC algorithms identify better clusters than the heuristic HEUC algorithm. The advantage of FOCC and BOCC over HEUC in SSQ gradually increases with $K$. ```{r, results='hide', message=FALSE, warning=FALSE, echo=FALSE} plot(plot3) ``` ## Conclusions The examples demonstrate the advantage of FOCC algorithm over the existing BOCC and HEUC algorithms in both runtime and cluster quality. Therefore, we recommend FOCC as the best choice for circular clustering; its performance on input circular data consisting of a large number of points, having large number of clusters far exceeds the two alternatives. ## References