Chapter 8 Down-sampled lexicase | Supplemental Material for An Exploration of Exploration: Measuring the ability of lexicase selection to find obscure pathways

8.1 Overview

# Relative location of data.
working_directory <- "experiments/2021-05-28-downsampled/analysis/"
# working_directory <- "./"

# Settings for visualization
cb_palette <- "Set2"

# Create directory to dump plots
dir.create(paste0(working_directory, "imgs"), showWarnings=FALSE)

8.2 Analysis dependencies

library(ggplot2)
library(tidyverse)
library(knitr)
library(cowplot)
library(viridis)
library(RColorBrewer)
library(rstatix)
library(ggsignif)
library(Hmisc)
library(kableExtra)
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")

These analyses were conducted in the following computing environment:

print(version)

##                _                           
## platform       x86_64-pc-linux-gnu         
## arch           x86_64                      
## os             linux-gnu                   
## system         x86_64, linux-gnu           
## status                                     
## major          4                           
## minor          1.0                         
## year           2021                        
## month          05                          
## day            18                          
## svn rev        80317                       
## language       R                           
## version.string R version 4.1.0 (2021-05-18)
## nickname       Camp Pontanezen

8.3 Setup

data_loc <- paste0(working_directory, "data/timeseries.csv")
data <- read.csv(data_loc, na.strings="NONE")

data$cardinality <- as.factor(
  data$OBJECTIVE_CNT
)
data$selection_name <- as.factor(
  data$selection_name
)

data$epsilon <- as.factor(
  data$LEX_EPS
)

data$proportion <- factor(
  data$DSLEX_PROP,
  levels=c(1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01)
)

data$proportion_label <- data$DSLEX_PROP * 100
data$proportion_label <- paste0(data$proportion_label, "%")
data$proportion_label <- factor(
  data$proportion_label,
  levels=c("100%", "50%", "20%", "10%", "5%" , "2%", "1%")
)

data$elite_trait_avg <-
  data$ele_agg_per / data$OBJECTIVE_CNT
data$unique_start_positions_coverage <-
  data$uni_str_pos / data$OBJECTIVE_CNT

final_data <- filter(data, evaluations==max(data$evaluations))

# Labeler for stats annotations
p_label <- function(p_value) {
  threshold = 0.0001
  if (p_value < threshold) {
    return(paste0("p < ", threshold))
  } else {
    return(paste0("p = ", p_value))
  }
}

# Significance threshold
alpha <- 0.05

####### misc #######
# Configure our default graphing theme
theme_set(theme_cowplot())

8.4 Exploration diagnostic performance

elite_ave_performance_fig <- ggplot(
    data,
    aes(
      x=evaluations,
      y=elite_trait_avg,
      color=proportion_label,
      fill=proportion_label
    )
  ) +
  stat_summary(geom="line", fun=mean) +
  stat_summary(
    geom="ribbon",
    fun.data="mean_cl_boot",
    fun.args=list(conf.int=0.95),
    alpha=0.2,
    linetype=0
  ) +
  scale_y_continuous(
    name="Average trait performance",
    limits=c(0, 100)
  ) +
  scale_x_continuous(
    name="Evaluations"
  ) +
  scale_fill_brewer(
    name="Sampling rate",
    palette=cb_palette
  ) +
  scale_color_brewer(
    name="Sampling rate",
    palette=cb_palette
  )
elite_ave_performance_fig

8.4.1 Final performance

# Compute manual labels for geom_signif
stat.test <- final_data %>%
  wilcox_test(elite_trait_avg ~ proportion_label) %>%
  adjust_pvalue(method = "bonferroni") %>%
  add_significance() %>%
  add_xy_position(x="proportion_label",step.increase=1)
stat.test$manual_position <- stat.test$y.position * 1.05
stat.test$label <- mapply(p_label,stat.test$p.adj)

elite_final_performance_fig <- ggplot(
    final_data,
    aes(
      x=proportion_label,
      y=elite_trait_avg,
      fill=proportion_label
    )
  ) +
  geom_flat_violin(
    position = position_nudge(x = .2, y = 0),
    alpha = .8,
    scale="width"
  ) +
  geom_point(
    mapping=aes(color=proportion_label),
    position = position_jitter(width = .15),
    size = .5,
    alpha = 0.8
  ) +
  geom_boxplot(
    width = .1,
    outlier.shape = NA,
    alpha = 0.5
  ) +
  scale_y_continuous(
    name="Average trait performance",
    limits=c(0, 100)
  ) +
  scale_x_discrete(
    name="Sampling rate"
  ) +
  scale_fill_brewer(
    name="Sampling rate",
    palette=cb_palette
  ) +
  scale_color_brewer(
    name="Sampling rate",
    palette=cb_palette
  ) +
  theme(
    legend.position="none"
  )
elite_final_performance_fig

stat.test %>%
  kbl() %>%
  kable_styling(
    bootstrap_options = c(
      "striped",
      "hover",
      "condensed",
      "responsive"
    )
  ) %>%
  scroll_box(width="600px")

.y.	group1	group2	n1	n2	statistic	p	p.adj	p.adj.signif	y.position	groups	xmin	xmax	manual_position	label
elite_trait_avg	100%	50%	50	50	2500	0.00e+00	0.0000000	****	184.488	100%, 50%	1	2	193.7124	p < 1e-04
elite_trait_avg	100%	20%	50	50	2500	0.00e+00	0.0000000	****	277.224	100%, 20%	1	3	291.0852	p < 1e-04
elite_trait_avg	100%	10%	50	50	2500	0.00e+00	0.0000000	****	369.960	100%, 10%	1	4	388.4580	p < 1e-04
elite_trait_avg	100%	5%	50	50	2500	0.00e+00	0.0000000	****	462.696	100%, 5%	1	5	485.8308	p < 1e-04
elite_trait_avg	100%	2%	50	50	2500	0.00e+00	0.0000000	****	555.432	100%, 2%	1	6	583.2036	p < 1e-04
elite_trait_avg	100%	1%	50	50	2500	0.00e+00	0.0000000	****	648.168	100%, 1%	1	7	680.5764	p < 1e-04
elite_trait_avg	50%	20%	50	50	1344	5.19e-01	1.0000000	ns	740.904	50%, 20%	2	3	777.9492	p = 1
elite_trait_avg	50%	10%	50	50	1862	2.49e-05	0.0005229	***	833.640	50%, 10%	2	4	875.3220	p = 0.0005229
elite_trait_avg	50%	5%	50	50	2093	0.00e+00	0.0000001	****	926.376	50%, 5%	2	5	972.6948	p < 1e-04
elite_trait_avg	50%	2%	50	50	2167	0.00e+00	0.0000000	****	1019.112	50%, 2%	2	6	1070.0676	p < 1e-04
elite_trait_avg	50%	1%	50	50	2199	0.00e+00	0.0000000	****	1111.848	50%, 1%	2	7	1167.4404	p < 1e-04
elite_trait_avg	20%	10%	50	50	1623	1.00e-02	0.2100000	ns	1204.584	20%, 10%	3	4	1264.8132	p = 0.21
elite_trait_avg	20%	5%	50	50	1863	2.42e-05	0.0005082	***	1297.320	20%, 5%	3	5	1362.1860	p = 0.0005082
elite_trait_avg	20%	2%	50	50	1974	6.00e-07	0.0000128	****	1390.056	20%, 2%	3	6	1459.5588	p < 1e-04
elite_trait_avg	20%	1%	50	50	2064	0.00e+00	0.0000004	****	1482.792	20%, 1%	3	7	1556.9316	p < 1e-04
elite_trait_avg	10%	5%	50	50	1673	4.00e-03	0.0840000	ns	1575.528	10%, 5%	4	5	1654.3044	p = 0.084
elite_trait_avg	10%	2%	50	50	1979	5.00e-07	0.0000107	****	1668.264	10%, 2%	4	6	1751.6772	p < 1e-04
elite_trait_avg	10%	1%	50	50	2021	1.00e-07	0.0000023	****	1761.000	10%, 1%	4	7	1849.0500	p < 1e-04
elite_trait_avg	5%	2%	50	50	1555	3.60e-02	0.7560000	ns	1853.736	5%, 2%	5	6	1946.4228	p = 0.756
elite_trait_avg	5%	1%	50	50	1674	4.00e-03	0.0840000	ns	1946.472	5%, 1%	5	7	2043.7956	p = 0.084
elite_trait_avg	2%	1%	50	50	1489	1.00e-01	1.0000000	ns	2039.208	2%, 1%	6	7	2141.1684	p = 1

8.5 Activation position coverage

unique_start_position_coverage_fig <- ggplot(
    data,
    aes(
      x=evaluations,
      y=unique_start_positions_coverage,
      color=proportion_label,
      fill=proportion_label
    )
  ) +
  stat_summary(geom="line", fun=mean) +
  stat_summary(
    geom="ribbon",
    fun.data="mean_cl_boot",
    fun.args=list(conf.int=0.95),
    alpha=0.2,
    linetype=0
  ) +
  scale_y_continuous(
    name="Activation position coverage",
    limits=c(0.0, 1.0)
  ) +
  scale_x_continuous(
    name="Evaluations"
  ) +
  scale_fill_brewer(
    name="Sampling rate",
    palette=cb_palette
  ) +
  scale_color_brewer(
    name="Sampling rate",
    palette=cb_palette
  )
unique_start_position_coverage_fig

8.5.1 Final activation position coverage

# Compute manual labels for geom_signif
stat.test <- final_data %>%
  wilcox_test(unique_start_positions_coverage ~ proportion_label) %>%
  adjust_pvalue(method = "bonferroni") %>%
  add_significance() %>%
  add_xy_position(x="proportion_label",step.increase=1)
stat.test$manual_position <- stat.test$y.position * 1.05
stat.test$label <- mapply(p_label,stat.test$p.adj)

unique_start_positions_coverage_final_fig <- ggplot(
    final_data,
    aes(
      x=proportion_label,
      y=unique_start_positions_coverage,
      fill=proportion_label
    )
  ) +
  geom_flat_violin(
    position = position_nudge(x = .2, y = 0),
    alpha = .8,
    scale="width"
  ) +
  geom_point(
    mapping=aes(color=proportion_label),
    position = position_jitter(width = .15),
    size = .5,
    alpha = 0.8
  ) +
  geom_boxplot(
    width = .1,
    outlier.shape = NA,
    alpha = 0.5
  ) +
  scale_y_continuous(
    name="Activation position coverage",
    limits=c(0, 1.0)
  ) +
  scale_x_discrete(
    name="Sampling rate"
  ) +
  scale_fill_brewer(
    name="Sampling rate",
    palette=cb_palette
  ) +
  scale_color_brewer(
    name="Sampling rate",
    palette=cb_palette
  ) +
  theme(
    legend.position="none"
  )
unique_start_positions_coverage_final_fig

stat.test %>%
  kbl() %>%
  kable_styling(
    bootstrap_options = c(
      "striped",
      "hover",
      "condensed",
      "responsive"
    )
  ) %>%
  scroll_box(width="600px")

.y.	group1	group2	n1	n2	statistic	p	p.adj	p.adj.signif	y.position	groups	xmin	xmax	manual_position	label
unique_start_positions_coverage	100%	50%	50	50	2500.0	0.00e+00	0.0000000	****	0.7300	100%, 50%	1	2	0.766500	p < 1e-04
unique_start_positions_coverage	100%	20%	50	50	2500.0	0.00e+00	0.0000000	****	1.0975	100%, 20%	1	3	1.152375	p < 1e-04
unique_start_positions_coverage	100%	10%	50	50	2500.0	0.00e+00	0.0000000	****	1.4650	100%, 10%	1	4	1.538250	p < 1e-04
unique_start_positions_coverage	100%	5%	50	50	2500.0	0.00e+00	0.0000000	****	1.8325	100%, 5%	1	5	1.924125	p < 1e-04
unique_start_positions_coverage	100%	2%	50	50	2500.0	0.00e+00	0.0000000	****	2.2000	100%, 2%	1	6	2.310000	p < 1e-04
unique_start_positions_coverage	100%	1%	50	50	2500.0	0.00e+00	0.0000000	****	2.5675	100%, 1%	1	7	2.695875	p < 1e-04
unique_start_positions_coverage	50%	20%	50	50	607.0	1.00e-07	0.0000012	****	2.9350	50%, 20%	2	3	3.081750	p < 1e-04
unique_start_positions_coverage	50%	10%	50	50	258.0	0.00e+00	0.0000000	****	3.3025	50%, 10%	2	4	3.467625	p < 1e-04
unique_start_positions_coverage	50%	5%	50	50	365.5	0.00e+00	0.0000000	****	3.6700	50%, 5%	2	5	3.853500	p < 1e-04
unique_start_positions_coverage	50%	2%	50	50	306.5	0.00e+00	0.0000000	****	4.0375	50%, 2%	2	6	4.239375	p < 1e-04
unique_start_positions_coverage	50%	1%	50	50	175.5	0.00e+00	0.0000000	****	4.4050	50%, 1%	2	7	4.625250	p < 1e-04
unique_start_positions_coverage	20%	10%	50	50	800.5	1.00e-03	0.0210000		4.7725	20%, 10%	3	4	5.011125	p = 0.021
unique_start_positions_coverage	20%	5%	50	50	944.0	2.90e-02	0.6090000	ns	5.1400	20%, 5%	3	5	5.397000	p = 0.609
unique_start_positions_coverage	20%	2%	50	50	833.0	3.00e-03	0.0630000	ns	5.5075	20%, 2%	3	6	5.782875	p = 0.063
unique_start_positions_coverage	20%	1%	50	50	678.5	5.63e-05	0.0011823	**	5.8750	20%, 1%	3	7	6.168750	p = 0.0011823
unique_start_positions_coverage	10%	5%	50	50	1397.0	3.04e-01	1.0000000	ns	6.2425	10%, 5%	4	5	6.554625	p = 1
unique_start_positions_coverage	10%	2%	50	50	1276.5	8.56e-01	1.0000000	ns	6.6100	10%, 2%	4	6	6.940500	p = 1
unique_start_positions_coverage	10%	1%	50	50	1118.5	3.60e-01	1.0000000	ns	6.9775	10%, 1%	4	7	7.326375	p = 1
unique_start_positions_coverage	5%	2%	50	50	1128.0	3.94e-01	1.0000000	ns	7.3450	5%, 2%	5	6	7.712250	p = 1
unique_start_positions_coverage	5%	1%	50	50	973.0	5.30e-02	1.0000000	ns	7.7125	5%, 1%	5	7	8.098125	p = 1
unique_start_positions_coverage	2%	1%	50	50	1096.0	2.84e-01	1.0000000	ns	8.0800	2%, 1%	6	7	8.484000	p = 1

8.6 Manuscript figures

legend <- cowplot::get_legend(
    elite_ave_performance_fig +
      guides(
        color=guide_legend(nrow=1),
        fill=guide_legend(nrow=1)
      ) +
      theme(
        legend.position = "bottom",
        legend.box="horizontal",
        legend.justification="center"
      )
  )

grid <- plot_grid(
  elite_ave_performance_fig +
    ggtitle("Performance over time") +
    theme(legend.position="none"),
  elite_final_performance_fig +
    ggtitle("Final performance") +
    theme(),
  unique_start_position_coverage_fig +
    ggtitle("Activation position coverage over time") +
    theme(legend.position="none"),
  unique_start_positions_coverage_final_fig +
    ggtitle("Final activation position coverage") +
    theme(),
  nrow=2,
  ncol=2,
  rel_widths=c(3,2),
  labels="auto"
)

grid <- plot_grid(
  grid,
  legend,
  nrow=2,
  ncol=1,
  rel_heights=c(1, 0.1)
)

save_plot(
  paste(working_directory, "imgs/down-sampled-panel.pdf", sep=""),
  grid,
  base_width=12,
  base_height=8
)

grid