set.seed(1)
df1 <- generate_dataset(n = 1000, d = 2, k = 2, m = 2, imbalance_ratio = 9)
#> Class 0 Probability Matrix - Variable 1
#> 0.4 0.6
#> Class 0 Probability Matrix - Variable 2
#> 0.55 0.45
#> Class 1 Probability Matrix - Variable 1
#> 0.6 0.4
#> Class 1 Probability Matrix - Variable 2
#> 0.45 0.55
head(df1)
#> Cat_1 Cat_2 Cont_1 Cont_2 Class
#> 1 2 1 3.3878249 0.7327539 0
#> 2 2 2 1.2620063 1.7249162 0
#> 3 2 2 1.1761498 0.4765681 0
#> 4 2 1 2.8701652 3.4538120 0
#> 5 1 1 0.7027487 1.9035630 0
#> 6 2 2 2.7650393 0.4951356 0
df1 <- df1 %>%
mutate(across(starts_with("Cat_"), as.factor)) %>%
mutate(Class = as.factor(Class))
summary(df1)
#> Cat_1 Cat_2 Cont_1 Cont_2 Class
#> 1:413 1:516 Min. :-4.6165 Min. :-4.5754 0:900
#> 2:587 2:484 1st Qu.: 0.3354 1st Qu.: 0.3036 1:100
#> Median : 1.3729 Median : 1.2975
#> Mean : 1.2358 Mean : 1.2017
#> 3rd Qu.: 2.2624 3rd Qu.: 2.2296
#> Max. : 5.4844 Max. : 6.1666
ggplot(df1, aes(x = Cont_1, y = Cont_2, color = factor(Class))) +
geom_point(alpha = 0.6) +
scale_color_manual(values = c("0" = "blue", "1" = "red")) +
labs(title = "Separated Means",
x = "X1", y = "X2") +
theme_minimal() +
theme(legend.position = "none")
df2 <- generate_dataset_new(n = 1000, d = 2, k = 2, m = 2, imbalance_ratio = 9)
ggplot(df2, aes(x = Cont_1, y = Cont_2, color = factor(Class))) +
geom_point(alpha = 0.6) +
scale_color_manual(values = c("0" = "blue", "1" = "red")) +
labs(title = "Centered Mean with Exclusion Zone",
x = "X1", y = "X2") +
theme_minimal() +
theme(legend.position = "none")
d=3; k=3; m=6; n=500;
df <- generate_dataset(n = 500, d = 3, k = 3, m = 6, imbalance_ratio = 9)
#> Class 0 Probability Matrix - Variable 1
#> 0.4 0.12 0.12 0.12 0.12 0.12
#> Class 0 Probability Matrix - Variable 2
#> 0.475 0.105 0.105 0.105 0.105 0.105
#> Class 0 Probability Matrix - Variable 3
#> 0.55 0.09 0.09 0.09 0.09 0.09
#> Class 1 Probability Matrix - Variable 1
#> 0.12 0.12 0.12 0.12 0.12 0.4
#> Class 1 Probability Matrix - Variable 2
#> 0.105 0.105 0.105 0.105 0.105 0.475
#> Class 1 Probability Matrix - Variable 3
#> 0.09 0.09 0.09 0.09 0.09 0.55
df <- df %>%
mutate(across(starts_with("Cat_"), as.factor)) %>%
mutate(Class = as.factor(Class))
Xcat <- df[,1:k]
Xcont <- df[,(k+1):(d+k)]
numerical_vars <- colnames(df)[grepl("Cont_", colnames(df))]
categorical_vars <- colnames(df)[grepl("Cat_", colnames(df))]
categorical_vars <- c(categorical_vars, "Class")
categorical_vars_no_class <- setdiff(categorical_vars, "Class")
# Binning 1st numerical variable
bins_eq <- generate_binned_cont(Xcont, bins = 20, bin_method = "equalwidth")[, 1]
bins_ww <- generate_binned_cont(Xcont, bins = 20, bin_method = "weighted")[, 1]
# Plot
ggplot(data.frame(bin = bins_eq), aes(x = factor(bin))) +
geom_bar(fill = "steelblue", alpha = 0.7, color = "white") +
labs(title = "Equalwidth binning", x = "Bin number (1–20)",
y = "Count") + theme_minimal()
ggplot(data.frame(bin = bins_ww), aes(x = factor(bin))) +
geom_bar(fill = "steelblue", alpha = 0.7, color = "white") +
labs(title = "Weighted binning", x = "Bin number (1–20)",
y = "Count") + theme_minimal()
# Shared Mode
df %>% dplyr::count(dplyr::across(all_of(categorical_vars_no_class)),
sort = TRUE) %>% dplyr::slice(1) %>% dplyr::select(-n)
#> Cat_1 Cat_2 Cat_3
#> 1 1 1 1
# MCA
Xcat_clean <- Xcat[, sapply(Xcat, function(x) length(unique(x)) > 1), drop = FALSE]
Xcat_clean <- Xcat_clean[, !duplicated(as.list(Xcat_clean)), drop = FALSE]
mca_result <- FactoMineR::MCA(Xcat_clean, graph = FALSE)
mca_coord <- mca_result$ind$coord
centroid <- matrixStats::colMedians(mca_coord)
idx <- which.min(apply(mca_coord, 1, function(row) sum((row - centroid)^2)))
df[idx, categorical_vars_no_class]
#> Cat_1 Cat_2 Cat_3
#> 58 4 2 1
reference_cat <- df %>% dplyr::count(dplyr::across(all_of(categorical_vars_no_class)),
sort = TRUE) %>% dplyr::slice(1) %>% dplyr::select(-n)
reference_num <- CovMcd(Xcont)@center
reference <- tibble(
!!!setNames(reference_cat[1,], categorical_vars_no_class),
!!!setNames(reference_num, numerical_vars))
reference
#> # A tibble: 1 × 6
#> Cat_1 Cat_2 Cat_3 Cont_1 Cont_2 Cont_3
#> <fct> <fct> <fct> <dbl> <dbl> <dbl>
#> 1 1 1 1 1.56 1.54 1.51
for (col in categorical_vars_no_class) {
reference[[col]] <- factor(reference[[col]], levels = levels(df[[col]]))
df[[col]] <- factor(df[[col]], levels = levels(df[[col]]))}
df_with_ref <- rbind(df[, !names(df) %in% "Class"], reference)
nrow(df_with_ref)
#> [1] 501
gower_mat <- NULL
gower_mat <- as.matrix(distmix(df_with_ref, method = "gower", idnum = (k+1):(d+k), idcat = 1:k))
gower_mat[(1:5),(1:5)]
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.00000000 0.08436171 0.4568067 0.5915004 0.2726134
#> [2,] 0.08436171 0.00000000 0.4027517 0.6023634 0.2055320
#> [3,] 0.45680665 0.40275175 0.0000000 0.4050554 0.5305531
#> [4,] 0.59150038 0.60236340 0.4050554 0.0000000 0.5724279
#> [5,] 0.27261344 0.20553199 0.5305531 0.5724279 0.0000000
dist_to_ref <- as.numeric(gower_mat[nrow(gower_mat), 1:(nrow(gower_mat)-1)])
transf_dist <- as.numeric(Farness(dist_to_ref))
unique(round(dist_to_ref, 3))
#> [1] 0.224 0.213 0.233 0.389 0.382 0.437 0.522 0.393 0.392 0.039 0.356 0.043
#> [13] 0.184 0.203 0.381 0.570 0.045 0.191 0.204 0.216 0.237 0.559 0.540 0.369
#> [25] 0.051 0.202 0.251 0.206 0.207 0.248 0.245 0.370 0.071 0.193 0.205 0.352
#> [37] 0.229 0.398 0.220 0.074 0.037 0.380 0.058 0.404 0.567 0.364 0.241 0.361
#> [49] 0.077 0.415 0.542 0.067 0.253 0.189 0.358 0.194 0.528 0.390 0.532 0.228
#> [61] 0.259 0.236 0.420 0.408 0.199 0.256 0.399 0.384 0.395 0.533 0.234 0.616
#> [73] 0.371 0.223 0.421 0.593 0.367 0.422 0.553 0.377 0.036 0.198 0.405 0.410
#> [85] 0.195 0.372 0.379 0.530 0.425 0.186 0.544 0.030 0.232 0.235 0.448 0.378
#> [97] 0.432 0.536 0.353 0.359 0.078 0.563 0.374 0.383 0.547 0.386 0.252 0.373
#> [109] 0.564 0.250 0.286 0.376 0.394 0.401 0.054 0.230 0.428 0.391 0.385 0.409
#> [121] 0.365 0.212 0.443 0.197 0.066 0.591 0.345 0.178 0.187 0.050 0.217 0.549
#> [133] 0.360 0.219 0.035 0.215 0.025 0.267 0.210 0.357 0.366 0.431 0.041 0.214
#> [145] 0.059 0.242 0.560 0.222 0.586 0.218 0.551 0.192 0.362 0.584 0.200 0.403
#> [157] 0.093 0.363 0.063 0.550 0.231 0.402 0.029 0.531 0.026 0.344 0.368 0.575
#> [169] 0.239 0.243 0.227 0.038 0.585 0.240 0.180 0.085 0.534 0.062 0.024 0.057
#> [181] 0.188 0.350 0.070 0.355 0.434 0.211 0.539 0.416 0.396 0.274 0.049 0.264
#> [193] 0.388 0.554 0.034 0.592 0.244 0.565 0.400 0.342 0.221 0.076 0.181 0.255
#> [205] 0.069 0.174 0.055 0.065 0.419 0.552 0.450 0.407 0.576 0.375 0.173 0.011
#> [217] 0.546 0.075 0.084 0.346 0.027 0.028 0.548 0.427 0.479 0.658 0.719 0.684
#> [229] 0.436 0.538 0.621 0.442 0.606 0.655 0.652 0.635 0.701 0.699 0.651 0.747
#> [241] 0.466 0.706 0.663 0.685 0.667 0.622 0.670 0.626 0.518 0.751 0.512 0.619
#> [253] 0.458 0.681 0.615 0.666 0.704 0.633 0.683 0.647 0.662 0.505 0.452 0.624
#> [265] 0.648 0.677 0.714 0.599
# The contribution from categorical variables jumps in discrete steps,
# and the numeric contributions are often scaled small (since they’re divided
# by range and then averaged). When everything is summed and averaged, the
# small numerical differences are "absorbed" by the categorical jumps, and when
# rounded multiple total distances can coincide. This is why we have 3 "jumps"
# in the next plot
plot_data <- data.frame(dist_to_ref = dist_to_ref, transf_dist = transf_dist)
ggplot(plot_data, aes(x = dist_to_ref, y = transf_dist)) +
geom_point(color = "steelblue", alpha = 0.7, size = 2) +
labs(title = "Scatter plot",
x = "Distance to reference", y = "Transformed distance (Farness)") +
theme_minimal()
threshold <- 0.9
outlier_labels <- rep(0, n)
outlier_labels[which(df$Class == 1)] <- 1
predicted_outliers <- ifelse(transf_dist > threshold, 1, 0)
which(predicted_outliers == 1)
#> [1] 453 454 463 464 466 468 470 471 473 476 482 485 487 489 496 497 499
TP <- sum(outlier_labels == 1 & predicted_outliers == 1)
TN <- sum(outlier_labels == 0 & predicted_outliers == 0)
FP <- sum(outlier_labels == 0 & predicted_outliers == 1)
FN <- sum(outlier_labels == 1 & predicted_outliers == 0)
cat("TP:", TP, "; TN:", TN, "; FP:", FP, "; FN:", FN, "\n")
#> TP: 17 ; TN: 450 ; FP: 0 ; FN: 33
# Evaluation Metrics
epsilon <- 1e-8
recall_1 <- TP / (TP + FN + epsilon)
recall_0 <- TN / (TN + FP + epsilon)
precision_1 <- TP / (TP + FP + epsilon)
precision_0 <- TN / (TN + FN + epsilon)
cat("Re(1):", recall_1, "; Re(0):", recall_0, "; Pr(1):", precision_1, "; Pr(0):", precision_0, "\n")
#> Re(1): 0.34 ; Re(0): 1 ; Pr(1): 1 ; Pr(0): 0.931677
f1_score <- (2 * recall_1 * precision_1) / (recall_1 + precision_1 + epsilon)
# F1 Score in REF + Gower
f1_score
#> [1] 0.5074627
gower_mat <- NULL
gower_mat <- as.matrix(distmix(df[,1:(d+k)], method = "gower", idnum = (k+1):(d+k), idcat = 1:k))
gower_mat[(1:5),(1:5)]
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.00000000 0.08436171 0.4568067 0.5915004 0.2726134
#> [2,] 0.08436171 0.00000000 0.4027517 0.6023634 0.2055320
#> [3,] 0.45680665 0.40275175 0.0000000 0.4050554 0.5305531
#> [4,] 0.59150038 0.60236340 0.4050554 0.0000000 0.5724279
#> [5,] 0.27261344 0.20553199 0.5305531 0.5724279 0.0000000
vec <- apply(gower_mat, 1, quantile, probs = 0.9)
transf_dist <- as.numeric(Farness(vec))
threshold <- 0.9
outlier_labels <- rep(0, n)
outlier_labels[which(df$Class == 1)] <- 1
predicted_outliers <- ifelse(transf_dist > threshold, 1, 0)
which(predicted_outliers == 1)
#> [1] 6 29 90 94 96 156 168 184 205 225 336 349 395 451 452 453 454 456 457
#> [20] 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 478
#> [39] 479 480 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 499
#> [58] 500
TP <- sum(outlier_labels == 1 & predicted_outliers == 1)
TN <- sum(outlier_labels == 0 & predicted_outliers == 0)
FP <- sum(outlier_labels == 0 & predicted_outliers == 1)
FN <- sum(outlier_labels == 1 & predicted_outliers == 0)
cat("TP:", TP, "; TN:", TN, "; FP:", FP, "; FN:", FN, "\n")
#> TP: 45 ; TN: 437 ; FP: 13 ; FN: 5
# Evaluation Metrics
epsilon <- 1e-8
recall_1 <- TP / (TP + FN + epsilon)
recall_0 <- TN / (TN + FP + epsilon)
precision_1 <- TP / (TP + FP + epsilon)
precision_0 <- TN / (TN + FN + epsilon)
cat("Re(1):", recall_1, "; Re(0):", recall_0, "; Pr(1):", precision_1, "; Pr(0):", precision_0, "\n")
#> Re(1): 0.9 ; Re(0): 0.9711111 ; Pr(1): 0.7758621 ; Pr(0): 0.9886878
f1_score <- (2 * recall_1 * precision_1) / (recall_1 + precision_1 + epsilon)
# F1 Score in TSQD + Gower
f1_score
#> [1] 0.8333333
aux_cond <- Ahmad.aux(x.cont = df[, numerical_vars], x.cat = df[, categorical_vars_no_class],
type = "Norm", bins = 20, only.categ = FALSE, js = FALSE, bin_method = "weighted")
aux_cond$dist[(1:5),(1:5)]
#> 1 2 3 4 5
#> 1 0.000000 1.3239268 1.662908 1.317760 1.4235122
#> 2 1.323927 0.0000000 1.206093 1.669250 0.8747004
#> 3 1.662908 1.2060925 0.000000 1.336262 0.9317920
#> 4 1.317760 1.6692504 1.336262 0.000000 1.4344361
#> 5 1.423512 0.8747004 0.931792 1.434436 0.0000000
aux_rb <- AhmadMah.aux(x.cont = df[, numerical_vars], x.cat = df[, categorical_vars_no_class],
type = "Norm", bins = 20, only.categ = FALSE, js = FALSE, bin_method = "weighted", method_mah = "rb")
aux_rb$dist[(1:5),(1:5)]
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.000000 2.553087 3.345772 2.601286 2.863939
#> [2,] 2.553087 0.000000 2.125060 3.412934 1.310440
#> [3,] 3.345772 2.125060 0.000000 2.567532 1.290024
#> [4,] 2.601286 3.412934 2.567532 0.000000 2.827411
#> [5,] 2.863939 1.310440 1.290024 2.827411 0.000000