Chapter 5 Diagnostic cardinality | Supplemental Material for An Exploration of Exploration: Measuring the ability of lexicase selection to find obscure pathways

5.1 Overview

# Relative location of data.
working_directory <-
  "experiments/2021-05-27-cardinality/analysis/"
# working_directory <- "./"

# Settings for visualization
cb_palette <- "Set2"
# Create directory to dump plots
dir.create(paste0(working_directory, "imgs"), showWarnings=FALSE)

5.2 Analysis dependencies

library(ggplot2)
library(tidyverse)
library(knitr)
library(cowplot)
library(viridis)
library(RColorBrewer)
library(rstatix)
library(ggsignif)
library(Hmisc)
library(kableExtra)
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")

These analyses were conducted in the following computing environment:

print(version)

##                _                           
## platform       x86_64-pc-linux-gnu         
## arch           x86_64                      
## os             linux-gnu                   
## system         x86_64, linux-gnu           
## status                                     
## major          4                           
## minor          1.0                         
## year           2021                        
## month          05                          
## day            18                          
## svn rev        80317                       
## language       R                           
## version.string R version 4.1.0 (2021-05-18)
## nickname       Camp Pontanezen

5.3 Setup

data_loc <- paste0(
  working_directory,
  "data/timeseries-res-1000g.csv"
)
data <- read.csv(
  data_loc,
  na.strings="NONE"
)

data$cardinality <- as.factor(
  data$OBJECTIVE_CNT
)
data$selection_name <- as.factor(
  data$selection_name
)

data$elite_trait_avg <-
  data$ele_agg_per / data$OBJECTIVE_CNT

data$unique_start_positions_coverage <-
  data$uni_str_pos / data$OBJECTIVE_CNT

final_data <- filter(data, gen==max(data$gen))

# Labeler for stats annotations
p_label <- function(p_value) {
  threshold = 0.0001
  if (p_value < threshold) {
    return(paste0("p < ", threshold))
  } else {
    return(paste0("p = ", p_value))
  }
}

# Significance threshold
alpha <- 0.05

####### misc #######
# Configure our default graphing theme
theme_set(theme_cowplot())

5.4 Exploration diagnostic performance

First, we look at performance over time. Specifically, we look at the normalized aggregage score of the most performant individuals over time. To control for different cardinalities having different maximum scores, we normalized performances (by dividing by cardinality) to values between 0 and 100.

elite_trait_ave_fit <- ggplot(
    data,
    aes(
      x=gen,
      y=elite_trait_avg,
      color=cardinality,
      fill=cardinality
    )
  ) +
  stat_summary(geom="line", fun=mean) +
  stat_summary(
    geom="ribbon",
    fun.data="mean_cl_boot",
    fun.args=list(conf.int=0.95),
    alpha=0.2,
    linetype=0
  ) +
  scale_y_continuous(
    name="Average trait performance",
    limits=c(0, 100)
  ) +
  scale_x_continuous(
    name="Generation"
  ) +
  scale_fill_brewer(
    name="Cardinaltiy",
    palette=cb_palette
  ) +
  scale_color_brewer(
    name="Cardinaltiy",
    palette=cb_palette
  )
elite_trait_ave_fit

5.4.1 Final performance

Next, we look only at the final performances of each treatment

# Compute manual labels for geom_signif
stat.test <- final_data %>%
  wilcox_test(elite_trait_avg ~ cardinality) %>%
  adjust_pvalue(method = "bonferroni") %>%
  add_significance() %>%
  add_xy_position(x="cardinality",step.increase=1)
stat.test$manual_position <- stat.test$y.position * 1.05
stat.test$label <- mapply(p_label,stat.test$p.adj)

elite_trait_ave_fit_final <- ggplot(
    final_data,
    aes(x=cardinality, y=elite_trait_avg, fill=cardinality)
  ) +
  geom_flat_violin(
    position = position_nudge(x = .2, y = 0),
    alpha = .8,
    scale="width"
  ) +
  geom_point(
    mapping=aes(color=cardinality),
    position = position_jitter(width = .15),
    size = .5,
    alpha = 0.8
  ) +
  geom_boxplot(
    width = .1,
    outlier.shape = NA,
    alpha = 0.5
  ) +
  scale_y_continuous(
    name="Average trait performance",
    limits=c(0, 100)
  ) +
  scale_x_discrete(
    name="Cardinality"
  ) +
  scale_fill_brewer(
    name="Cardinaltiy",
    palette=cb_palette
  ) +
  scale_color_brewer(
    name="Cardinaltiy",
    palette=cb_palette
  ) +
  theme(
    legend.position="none"
  )
elite_trait_ave_fit_final

stat.test %>%
  kbl() %>%
  kable_styling(
    bootstrap_options = c(
      "striped",
      "hover",
      "condensed",
      "responsive"
    )
  ) %>%
  scroll_box(width="600px")

.y.	group1	group2	n1	n2	statistic	p.adj.signif	y.position	groups	xmin	xmax	manual_position	label
elite_trait_avg	10	20	50	50	2399	****	191.9440	10, 20	1	2	201.5412	p < 1e-04
elite_trait_avg	10	50	50	50	2404	****	288.4852	10, 50	1	3	302.9095	p < 1e-04
elite_trait_avg	10	100	50	50	2438	****	385.0264	10 , 100	1	4	404.2777	p < 1e-04
elite_trait_avg	10	200	50	50	2500	****	481.5676	10 , 200	1	5	505.6460	p < 1e-04
elite_trait_avg	10	500	50	50	2500	****	578.1088	10 , 500	1	6	607.0142	p < 1e-04
elite_trait_avg	10	1000	50	50	2500	****	674.6500	10 , 1000	1	7	708.3825	p < 1e-04
elite_trait_avg	20	50	50	50	2500	****	771.1912	20, 50	2	3	809.7508	p < 1e-04
elite_trait_avg	20	100	50	50	2500	****	867.7324	20 , 100	2	4	911.1190	p < 1e-04
elite_trait_avg	20	200	50	50	2500	****	964.2736	20 , 200	2	5	1012.4873	p < 1e-04
elite_trait_avg	20	500	50	50	2500	****	1060.8148	20 , 500	2	6	1113.8555	p < 1e-04
elite_trait_avg	20	1000	50	50	2500	****	1157.3560	20 , 1000	2	7	1215.2238	p < 1e-04
elite_trait_avg	50	100	50	50	2166	****	1253.8972	50 , 100	3	4	1316.5921	p < 1e-04
elite_trait_avg	50	200	50	50	2500	****	1350.4384	50 , 200	3	5	1417.9603	p < 1e-04
elite_trait_avg	50	500	50	50	2500	****	1446.9796	50 , 500	3	6	1519.3286	p < 1e-04
elite_trait_avg	50	1000	50	50	2500	****	1543.5208	50 , 1000	3	7	1620.6968	p < 1e-04
elite_trait_avg	100	200	50	50	2500	****	1640.0620	100, 200	4	5	1722.0651	p < 1e-04
elite_trait_avg	100	500	50	50	2500	****	1736.6032	100, 500	4	6	1823.4334	p < 1e-04
elite_trait_avg	100	1000	50	50	2500	****	1833.1444	100 , 1000	4	7	1924.8016	p < 1e-04
elite_trait_avg	200	500	50	50	2500	****	1929.6856	200, 500	5	6	2026.1699	p < 1e-04
elite_trait_avg	200	1000	50	50	2500	****	2026.2268	200 , 1000	5	7	2127.5381	p < 1e-04
elite_trait_avg	500	1000	50	50	2500	****	2122.7680	500 , 1000	6	7	2228.9064	p < 1e-04

5.5 Activation position coverage

Next, we analyze the number of unique starting position maintained by populations.

ggplot(data, aes(x=gen, y=uni_str_pos, color=cardinality)) +
  stat_summary(geom="line", fun=mean) +
  stat_summary(
    geom="ribbon",
    fun.data="mean_cl_boot",
    fun.args=list(conf.int=0.95),
    alpha=0.2,
    linetype=0
  ) +
  scale_y_continuous(
    name="Unique activation positions (population)",
  ) +
  scale_x_continuous(
    name="Generation"
  )

Different cardinalities have numbers of possible starting positions, so next, we look at the proportion of starting positions (out of all possible) maintained by populations.

unique_start_positions_coverage_fig <- ggplot(
    data,
    aes(
      x=gen,
      y=unique_start_positions_coverage,
      color=cardinality,
      fill=cardinality
    )
  ) +
  stat_summary(geom="line", fun=mean) +
  stat_summary(
    geom="ribbon",
    fun.data="mean_cl_boot",
    fun.args=list(conf.int=0.95),
    alpha=0.2,
    linetype=0
  ) +
  scale_y_continuous(
    name="Activation position coverage",
    limits=c(0.0, 1.05)
  ) +
  scale_x_continuous(
    name="Generation"
  ) +
  scale_fill_brewer(
    name="Cardinaltiy",
    palette=cb_palette
  ) +
  scale_color_brewer(
    name="Cardinaltiy",
    palette=cb_palette
  )
unique_start_positions_coverage_fig

5.5.1 Final activation position coverage

# Compute manual labels for geom_signif
stat.test <- final_data %>%
  wilcox_test(unique_start_positions_coverage ~ cardinality) %>%
  adjust_pvalue(method = "bonferroni") %>%
  add_significance() %>%
  add_xy_position(x="cardinality",step.increase=1)
stat.test$manual_position <- stat.test$y.position * 1.05
stat.test$label <- mapply(p_label,stat.test$p.adj)

final_unique_start_positions_coverage_fig <- ggplot(
    final_data,
    aes(
      x=cardinality,
      y=unique_start_positions_coverage,
      fill=cardinality
    )
  ) +
  geom_flat_violin(
    position = position_nudge(x = .2, y = 0),
    alpha = .8,
    scale="width"
  ) +
  geom_point(
    mapping=aes(color=cardinality),
    position = position_jitter(width = .15),
    size = .5,
    alpha = 0.8
  ) +
  geom_boxplot(
    width = .1,
    outlier.shape = NA,
    alpha = 0.5
  ) +
  scale_y_continuous(
    name="Activation position coverage",
    limits=c(0, 1.05)
  ) +
  scale_x_discrete(
    name="Cardinality"
  ) +
  scale_fill_brewer(
    name="Cardinaltiy",
    palette=cb_palette
  ) +
  scale_color_brewer(
    name="Cardinaltiy",
    palette=cb_palette
  ) +
  theme(
    legend.position="none"
  )
final_unique_start_positions_coverage_fig

stat.test %>%
  kbl() %>%
  kable_styling(
    bootstrap_options = c(
      "striped",
      "hover",
      "condensed",
      "responsive"
    )
  ) %>%
  scroll_box(width="600px")

.y.	group1	group2	n1	n2	statistic	p	p.adj	p.adj.signif	y.position	groups	xmin	xmax	manual_position	label
unique_start_positions_coverage	10	20	50	50	1448	0.126	1	ns	1.94900	10, 20	1	2	2.046450	p = 1
unique_start_positions_coverage	10	50	50	50	2424	0.000	0	****	2.94545	10, 50	1	3	3.092722	p < 1e-04
unique_start_positions_coverage	10	100	50	50	2500	0.000	0	****	3.94190	10 , 100	1	4	4.138995	p < 1e-04
unique_start_positions_coverage	10	200	50	50	2500	0.000	0	****	4.93835	10 , 200	1	5	5.185268	p < 1e-04
unique_start_positions_coverage	10	500	50	50	2500	0.000	0	****	5.93480	10 , 500	1	6	6.231540	p < 1e-04
unique_start_positions_coverage	10	1000	50	50	2500	0.000	0	****	6.93125	10 , 1000	1	7	7.277812	p < 1e-04
unique_start_positions_coverage	20	50	50	50	2492	0.000	0	****	7.92770	20, 50	2	3	8.324085	p < 1e-04
unique_start_positions_coverage	20	100	50	50	2500	0.000	0	****	8.92415	20 , 100	2	4	9.370358	p < 1e-04
unique_start_positions_coverage	20	200	50	50	2500	0.000	0	****	9.92060	20 , 200	2	5	10.416630	p < 1e-04
unique_start_positions_coverage	20	500	50	50	2500	0.000	0	****	10.91705	20 , 500	2	6	11.462903	p < 1e-04
unique_start_positions_coverage	20	1000	50	50	2500	0.000	0	****	11.91350	20 , 1000	2	7	12.509175	p < 1e-04
unique_start_positions_coverage	50	100	50	50	2500	0.000	0	****	12.90995	50 , 100	3	4	13.555447	p < 1e-04
unique_start_positions_coverage	50	200	50	50	2500	0.000	0	****	13.90640	50 , 200	3	5	14.601720	p < 1e-04
unique_start_positions_coverage	50	500	50	50	2500	0.000	0	****	14.90285	50 , 500	3	6	15.647993	p < 1e-04
unique_start_positions_coverage	50	1000	50	50	2500	0.000	0	****	15.89930	50 , 1000	3	7	16.694265	p < 1e-04
unique_start_positions_coverage	100	200	50	50	2500	0.000	0	****	16.89575	100, 200	4	5	17.740537	p < 1e-04
unique_start_positions_coverage	100	500	50	50	2500	0.000	0	****	17.89220	100, 500	4	6	18.786810	p < 1e-04
unique_start_positions_coverage	100	1000	50	50	2500	0.000	0	****	18.88865	100 , 1000	4	7	19.833082	p < 1e-04
unique_start_positions_coverage	200	500	50	50	2500	0.000	0	****	19.88510	200, 500	5	6	20.879355	p < 1e-04
unique_start_positions_coverage	200	1000	50	50	2500	0.000	0	****	20.88155	200 , 1000	5	7	21.925628	p < 1e-04
unique_start_positions_coverage	500	1000	50	50	2500	0.000	0	****	21.87800	500 , 1000	6	7	22.971900	p < 1e-04

5.6 Does activation position coverage predict performance?

ggplot(
    final_data,
    aes(
      x=unique_start_positions_coverage,
      y=elite_trait_avg,
      color=cardinality
    )
  ) +
  geom_point() +
  scale_color_brewer(
    name="Cardinaltiy",
    palette=cb_palette
  )

cor.test(
  x=final_data$unique_start_positions_coverage,
  y=final_data$elite_trait_avg,
  method="spearman",
  exact=FALSE
)

## 
##  Spearman's rank correlation rho
## 
## data:  final_data$unique_start_positions_coverage and final_data$elite_trait_avg
## S = 262488, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9632668

5.7 Manuscript figures

Combine figures for the manuscript.

legend <- cowplot::get_legend(
    elite_trait_ave_fit +
      guides(
        color=guide_legend(nrow=1),
        fill=guide_legend(nrow=1)
      ) +
      theme(
        legend.position = "bottom",
        legend.box="horizontal",
        legend.justification="center"
      )
  )

grid <- plot_grid(
  elite_trait_ave_fit +
    ggtitle("Performance over time") +
    theme(legend.position="none"),
  elite_trait_ave_fit_final +
    ggtitle("Final performance") +
    theme(),
  unique_start_positions_coverage_fig +
    ggtitle("Activation position coverage over time") +
    theme(legend.position="none"),
  final_unique_start_positions_coverage_fig +
    ggtitle("Final activation position coverage") +
    theme(),
  nrow=2,
  ncol=2,
  rel_widths=c(3,2),
  labels="auto"
)

grid <- plot_grid(
  grid,
  legend,
  nrow=2,
  ncol=1,
  rel_heights=c(1, 0.1)
)

save_plot(
  paste(working_directory, "imgs/cardinality-panel.pdf", sep=""),
  grid,
  base_width=12,
  base_height=8
)

grid