Intro

Dataset Creation


set.seed(1)

df1 <- generate_dataset(n = 1000, d = 2, k = 2, m = 2, imbalance_ratio = 9)
#> Class 0 Probability Matrix - Variable 1 
#>  0.4 0.6 
#> Class 0 Probability Matrix - Variable 2 
#>  0.55 0.45 
#> Class 1 Probability Matrix - Variable 1 
#>  0.6 0.4 
#> Class 1 Probability Matrix - Variable 2 
#>  0.45 0.55

head(df1)
#>   Cat_1 Cat_2    Cont_1    Cont_2 Class
#> 1     2     1 3.3878249 0.7327539     0
#> 2     2     2 1.2620063 1.7249162     0
#> 3     2     2 1.1761498 0.4765681     0
#> 4     2     1 2.8701652 3.4538120     0
#> 5     1     1 0.7027487 1.9035630     0
#> 6     2     2 2.7650393 0.4951356     0

df1 <- df1 %>%
      mutate(across(starts_with("Cat_"), as.factor)) %>%
      mutate(Class = as.factor(Class))

summary(df1)
#>  Cat_1   Cat_2       Cont_1            Cont_2        Class  
#>  1:413   1:516   Min.   :-4.6165   Min.   :-4.5754   0:900  
#>  2:587   2:484   1st Qu.: 0.3354   1st Qu.: 0.3036   1:100  
#>                  Median : 1.3729   Median : 1.2975          
#>                  Mean   : 1.2358   Mean   : 1.2017          
#>                  3rd Qu.: 2.2624   3rd Qu.: 2.2296          
#>                  Max.   : 5.4844   Max.   : 6.1666

ggplot(df1, aes(x = Cont_1, y = Cont_2, color = factor(Class))) +
  geom_point(alpha = 0.6) +
  scale_color_manual(values = c("0" = "blue", "1" = "red")) +
  labs(title = "Separated Means",
       x = "X1", y = "X2") +
  theme_minimal() +
  theme(legend.position = "none")


df2 <- generate_dataset_new(n = 1000, d = 2, k = 2, m = 2, imbalance_ratio = 9)

ggplot(df2, aes(x = Cont_1, y = Cont_2, color = factor(Class))) +
  geom_point(alpha = 0.6) +
  scale_color_manual(values = c("0" = "blue", "1" = "red")) +
  labs(title = "Centered Mean with Exclusion Zone",
       x = "X1", y = "X2") +
  theme_minimal() +
  theme(legend.position = "none")

REF Comparison


d=3; k=3; m=6; n=500;

df <- generate_dataset(n = 500, d = 3, k = 3, m = 6, imbalance_ratio = 9)
#> Class 0 Probability Matrix - Variable 1 
#>  0.4 0.12 0.12 0.12 0.12 0.12 
#> Class 0 Probability Matrix - Variable 2 
#>  0.475 0.105 0.105 0.105 0.105 0.105 
#> Class 0 Probability Matrix - Variable 3 
#>  0.55 0.09 0.09 0.09 0.09 0.09 
#> Class 1 Probability Matrix - Variable 1 
#>  0.12 0.12 0.12 0.12 0.12 0.4 
#> Class 1 Probability Matrix - Variable 2 
#>  0.105 0.105 0.105 0.105 0.105 0.475 
#> Class 1 Probability Matrix - Variable 3 
#>  0.09 0.09 0.09 0.09 0.09 0.55

df <- df %>%
      mutate(across(starts_with("Cat_"), as.factor)) %>%
      mutate(Class = as.factor(Class))

Xcat <- df[,1:k]
Xcont <- df[,(k+1):(d+k)]

numerical_vars <- colnames(df)[grepl("Cont_", colnames(df))]
categorical_vars <- colnames(df)[grepl("Cat_", colnames(df))]
categorical_vars <- c(categorical_vars, "Class")
categorical_vars_no_class <- setdiff(categorical_vars, "Class")

# Binning 1st numerical variable
bins_eq <- generate_binned_cont(Xcont, bins = 20, bin_method = "equalwidth")[, 1]
bins_ww <- generate_binned_cont(Xcont, bins = 20, bin_method = "weighted")[, 1]

# Plot
ggplot(data.frame(bin = bins_eq), aes(x = factor(bin))) +
  geom_bar(fill = "steelblue", alpha = 0.7, color = "white") +
  labs(title = "Equalwidth binning", x = "Bin number (1–20)",
    y = "Count") + theme_minimal()


ggplot(data.frame(bin = bins_ww), aes(x = factor(bin))) +
  geom_bar(fill = "steelblue", alpha = 0.7, color = "white") +
  labs(title = "Weighted binning", x = "Bin number (1–20)",
    y = "Count") + theme_minimal()

Reference element - numerical


# Mean
colMeans(Xcont) 
#>   Cont_1   Cont_2   Cont_3 
#> 1.242398 1.220158 1.216711

# Median
matrixStats::colMedians(as.matrix(Xcont)) 
#>   Cont_1   Cont_2   Cont_3 
#> 1.464475 1.384288 1.362885

# FastMCD
CovMcd(Xcont)@center 
#>   Cont_1   Cont_2   Cont_3 
#> 1.546480 1.543070 1.514196

Reference element - categorical


# Shared Mode
df %>% dplyr::count(dplyr::across(all_of(categorical_vars_no_class)), 
        sort = TRUE) %>% dplyr::slice(1) %>% dplyr::select(-n) 
#>   Cat_1 Cat_2 Cat_3
#> 1     1     1     1

# MCA
Xcat_clean <- Xcat[, sapply(Xcat, function(x) length(unique(x)) > 1), drop = FALSE]
Xcat_clean <- Xcat_clean[, !duplicated(as.list(Xcat_clean)), drop = FALSE]
mca_result <- FactoMineR::MCA(Xcat_clean, graph = FALSE)
mca_coord <- mca_result$ind$coord
centroid <- matrixStats::colMedians(mca_coord)
idx <- which.min(apply(mca_coord, 1, function(row) sum((row - centroid)^2)))
df[idx, categorical_vars_no_class]
#>    Cat_1 Cat_2 Cat_3
#> 58     4     2     1

Reference element using Gower’s distance


reference_cat <- df %>% dplyr::count(dplyr::across(all_of(categorical_vars_no_class)), 
        sort = TRUE) %>% dplyr::slice(1) %>% dplyr::select(-n) 

reference_num <- CovMcd(Xcont)@center 

reference <- tibble(
      !!!setNames(reference_cat[1,], categorical_vars_no_class),
      !!!setNames(reference_num, numerical_vars))

reference
#> # A tibble: 1 × 6
#>   Cat_1 Cat_2 Cat_3 Cont_1 Cont_2 Cont_3
#>   <fct> <fct> <fct>  <dbl>  <dbl>  <dbl>
#> 1 1     1     1       1.56   1.54   1.51
    
for (col in categorical_vars_no_class) {
  reference[[col]] <- factor(reference[[col]], levels = levels(df[[col]]))
  df[[col]] <- factor(df[[col]], levels = levels(df[[col]]))}
    
df_with_ref <- rbind(df[, !names(df) %in% "Class"], reference)  

nrow(df_with_ref)
#> [1] 501

gower_mat <- NULL
gower_mat <- as.matrix(distmix(df_with_ref, method = "gower", idnum = (k+1):(d+k), idcat = 1:k))

gower_mat[(1:5),(1:5)]
#>            [,1]       [,2]      [,3]      [,4]      [,5]
#> [1,] 0.00000000 0.08436171 0.4568067 0.5915004 0.2726134
#> [2,] 0.08436171 0.00000000 0.4027517 0.6023634 0.2055320
#> [3,] 0.45680665 0.40275175 0.0000000 0.4050554 0.5305531
#> [4,] 0.59150038 0.60236340 0.4050554 0.0000000 0.5724279
#> [5,] 0.27261344 0.20553199 0.5305531 0.5724279 0.0000000

dist_to_ref <- as.numeric(gower_mat[nrow(gower_mat), 1:(nrow(gower_mat)-1)])

transf_dist <- as.numeric(Farness(dist_to_ref))

unique(round(dist_to_ref, 3))
#>   [1] 0.224 0.213 0.233 0.389 0.382 0.437 0.522 0.393 0.392 0.039 0.356 0.043
#>  [13] 0.184 0.203 0.381 0.570 0.045 0.191 0.204 0.216 0.237 0.559 0.540 0.369
#>  [25] 0.051 0.202 0.251 0.206 0.207 0.248 0.245 0.370 0.071 0.193 0.205 0.352
#>  [37] 0.229 0.398 0.220 0.074 0.037 0.380 0.058 0.404 0.567 0.364 0.241 0.361
#>  [49] 0.077 0.415 0.542 0.067 0.253 0.189 0.358 0.194 0.528 0.390 0.532 0.228
#>  [61] 0.259 0.236 0.420 0.408 0.199 0.256 0.399 0.384 0.395 0.533 0.234 0.616
#>  [73] 0.371 0.223 0.421 0.593 0.367 0.422 0.553 0.377 0.036 0.198 0.405 0.410
#>  [85] 0.195 0.372 0.379 0.530 0.425 0.186 0.544 0.030 0.232 0.235 0.448 0.378
#>  [97] 0.432 0.536 0.353 0.359 0.078 0.563 0.374 0.383 0.547 0.386 0.252 0.373
#> [109] 0.564 0.250 0.286 0.376 0.394 0.401 0.054 0.230 0.428 0.391 0.385 0.409
#> [121] 0.365 0.212 0.443 0.197 0.066 0.591 0.345 0.178 0.187 0.050 0.217 0.549
#> [133] 0.360 0.219 0.035 0.215 0.025 0.267 0.210 0.357 0.366 0.431 0.041 0.214
#> [145] 0.059 0.242 0.560 0.222 0.586 0.218 0.551 0.192 0.362 0.584 0.200 0.403
#> [157] 0.093 0.363 0.063 0.550 0.231 0.402 0.029 0.531 0.026 0.344 0.368 0.575
#> [169] 0.239 0.243 0.227 0.038 0.585 0.240 0.180 0.085 0.534 0.062 0.024 0.057
#> [181] 0.188 0.350 0.070 0.355 0.434 0.211 0.539 0.416 0.396 0.274 0.049 0.264
#> [193] 0.388 0.554 0.034 0.592 0.244 0.565 0.400 0.342 0.221 0.076 0.181 0.255
#> [205] 0.069 0.174 0.055 0.065 0.419 0.552 0.450 0.407 0.576 0.375 0.173 0.011
#> [217] 0.546 0.075 0.084 0.346 0.027 0.028 0.548 0.427 0.479 0.658 0.719 0.684
#> [229] 0.436 0.538 0.621 0.442 0.606 0.655 0.652 0.635 0.701 0.699 0.651 0.747
#> [241] 0.466 0.706 0.663 0.685 0.667 0.622 0.670 0.626 0.518 0.751 0.512 0.619
#> [253] 0.458 0.681 0.615 0.666 0.704 0.633 0.683 0.647 0.662 0.505 0.452 0.624
#> [265] 0.648 0.677 0.714 0.599

# The contribution from categorical variables jumps in discrete steps, 
# and the numeric contributions are often scaled small (since they’re divided
# by range and then averaged). When everything is summed and averaged, the 
# small numerical differences are "absorbed" by the categorical jumps, and when
# rounded multiple total distances can coincide. This is why we have 3 "jumps"
# in the next plot

plot_data <- data.frame(dist_to_ref  = dist_to_ref, transf_dist = transf_dist)

ggplot(plot_data, aes(x = dist_to_ref, y = transf_dist)) +
  geom_point(color = "steelblue", alpha = 0.7, size = 2) +
  labs(title = "Scatter plot",
       x = "Distance to reference", y = "Transformed distance (Farness)") +
  theme_minimal()


threshold <- 0.9
outlier_labels <- rep(0, n)
outlier_labels[which(df$Class == 1)] <- 1
predicted_outliers <- ifelse(transf_dist > threshold, 1, 0)

which(predicted_outliers == 1)
#>  [1] 453 454 463 464 466 468 470 471 473 476 482 485 487 489 496 497 499

TP <- sum(outlier_labels == 1 & predicted_outliers == 1)
TN <- sum(outlier_labels == 0 & predicted_outliers == 0)
FP <- sum(outlier_labels == 0 & predicted_outliers == 1)
FN <- sum(outlier_labels == 1 & predicted_outliers == 0)

cat("TP:", TP, "; TN:", TN, "; FP:", FP, "; FN:", FN,  "\n")
#> TP: 17 ; TN: 450 ; FP: 0 ; FN: 33

# Evaluation Metrics
epsilon <- 1e-8
recall_1 <- TP / (TP + FN + epsilon)
recall_0 <- TN / (TN + FP + epsilon)
precision_1 <- TP / (TP + FP + epsilon)
precision_0 <- TN / (TN + FN + epsilon)

cat("Re(1):", recall_1, "; Re(0):", recall_0, "; Pr(1):", precision_1, "; Pr(0):", precision_0, "\n")
#> Re(1): 0.34 ; Re(0): 1 ; Pr(1): 1 ; Pr(0): 0.931677

f1_score <- (2 * recall_1 * precision_1) / (recall_1 + precision_1 + epsilon)

# F1 Score in REF + Gower
f1_score 
#> [1] 0.5074627

Two-Stage Quantile Distance using Gower’s distance


gower_mat <- NULL
gower_mat <- as.matrix(distmix(df[,1:(d+k)], method = "gower", idnum = (k+1):(d+k), idcat = 1:k))

gower_mat[(1:5),(1:5)]
#>            [,1]       [,2]      [,3]      [,4]      [,5]
#> [1,] 0.00000000 0.08436171 0.4568067 0.5915004 0.2726134
#> [2,] 0.08436171 0.00000000 0.4027517 0.6023634 0.2055320
#> [3,] 0.45680665 0.40275175 0.0000000 0.4050554 0.5305531
#> [4,] 0.59150038 0.60236340 0.4050554 0.0000000 0.5724279
#> [5,] 0.27261344 0.20553199 0.5305531 0.5724279 0.0000000

vec <- apply(gower_mat, 1, quantile, probs = 0.9)

transf_dist <- as.numeric(Farness(vec))

threshold <- 0.9
outlier_labels <- rep(0, n)
outlier_labels[which(df$Class == 1)] <- 1
predicted_outliers <- ifelse(transf_dist > threshold, 1, 0)

which(predicted_outliers == 1)
#>  [1]   6  29  90  94  96 156 168 184 205 225 336 349 395 451 452 453 454 456 457
#> [20] 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 478
#> [39] 479 480 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 499
#> [58] 500

TP <- sum(outlier_labels == 1 & predicted_outliers == 1)
TN <- sum(outlier_labels == 0 & predicted_outliers == 0)
FP <- sum(outlier_labels == 0 & predicted_outliers == 1)
FN <- sum(outlier_labels == 1 & predicted_outliers == 0)

cat("TP:", TP, "; TN:", TN, "; FP:", FP, "; FN:", FN,  "\n")
#> TP: 45 ; TN: 437 ; FP: 13 ; FN: 5

# Evaluation Metrics
epsilon <- 1e-8
recall_1 <- TP / (TP + FN + epsilon)
recall_0 <- TN / (TN + FP + epsilon)
precision_1 <- TP / (TP + FP + epsilon)
precision_0 <- TN / (TN + FN + epsilon)

cat("Re(1):", recall_1, "; Re(0):", recall_0, "; Pr(1):", precision_1, "; Pr(0):", precision_0, "\n")
#> Re(1): 0.9 ; Re(0): 0.9711111 ; Pr(1): 0.7758621 ; Pr(0): 0.9886878

f1_score <- (2 * recall_1 * precision_1) / (recall_1 + precision_1 + epsilon)

# F1 Score in TSQD + Gower
f1_score 
#> [1] 0.8333333

Ahmad Distance Computation


aux_cond <- Ahmad.aux(x.cont = df[, numerical_vars], x.cat = df[, categorical_vars_no_class],
        type = "Norm", bins = 20, only.categ = FALSE, js = FALSE, bin_method = "weighted")

aux_cond$dist[(1:5),(1:5)]
#>          1         2        3        4         5
#> 1 0.000000 1.3239268 1.662908 1.317760 1.4235122
#> 2 1.323927 0.0000000 1.206093 1.669250 0.8747004
#> 3 1.662908 1.2060925 0.000000 1.336262 0.9317920
#> 4 1.317760 1.6692504 1.336262 0.000000 1.4344361
#> 5 1.423512 0.8747004 0.931792 1.434436 0.0000000

aux_rb <- AhmadMah.aux(x.cont = df[, numerical_vars], x.cat = df[, categorical_vars_no_class],
        type = "Norm", bins = 20, only.categ = FALSE, js = FALSE, bin_method = "weighted", method_mah = "rb")

aux_rb$dist[(1:5),(1:5)]
#>          [,1]     [,2]     [,3]     [,4]     [,5]
#> [1,] 0.000000 2.553087 3.345772 2.601286 2.863939
#> [2,] 2.553087 0.000000 2.125060 3.412934 1.310440
#> [3,] 3.345772 2.125060 0.000000 2.567532 1.290024
#> [4,] 2.601286 3.412934 2.567532 0.000000 2.827411
#> [5,] 2.863939 1.310440 1.290024 2.827411 0.000000