Demonstrating SignIT and other methods

We begin by showing how SignIT runs and what its resulting output looks like. We then compare this against existing methods for individual sample mutation signature decomposition. This will serve to show the approach for each method, and begin a discussion of strengths and limitations. We will use the standard COSMIC 30-signature reference mutation set. You can configure this in the code below. We will also use msimr to create simulated mutation catalogs. You can also configure the settings for the catalogs below.

Presence of “novel” signatures

The following simulation shows what happens when novel signatures are present.

normalize <- function(v) {
  return(v / sum(v))
}

n_signatures = 8
n_mutations = 10000
reference_signatures <- get_reference_signatures()

mode = 'demo'

if (mode == 'random') {
  chosen_signatures <- sample(paste('Signature', 1:30), size = n_signatures, replace = F)
  omitted_signature <- sample(chosen_signatures, size = 1)
  remaining_signatures <- names(reference_signatures)[! names(reference_signatures) == omitted_signature]

  test_ref <- reference_signatures[, c('mutation_type', chosen_signatures)]
  known_ref <- reference_signatures[, remaining_signatures]
  
  exposures <- tibble(
    signature = reference_signatures %>% select(-mutation_type) %>% names()
  ) %>%
    mutate(
      exposure = if_else(
        signature %in% chosen_signatures, 
        runif(n()),
        0
      ),
      exposure = normalize(exposure) * n_mutations
    )
  
  simulated_catalog <- msimR::get_simulated_mutation_catalog(
    signatures = reference_signatures, 
    exposures = exposures
  )
} else if (mode == 'demo') {
  data('missing_signature_demo_data')
  attach(missing_signature_demo_data)
}

signit_exposures <- get_exposures(simulated_catalog, known_ref)
## Sampling
## 
## SAMPLING FOR MODEL 'signature_model' NOW (CHAIN 1).
## 
## Gradient evaluation took 0 seconds
## 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Adjust your expectations accordingly!
## 
## 
## Iteration:   1 / 400 [  0%]  (Warmup)
## Iteration:  10 / 400 [  2%]  (Warmup)
## Iteration:  20 / 400 [  5%]  (Warmup)
## Iteration:  30 / 400 [  7%]  (Warmup)
## Iteration:  40 / 400 [ 10%]  (Warmup)
## Iteration:  50 / 400 [ 12%]  (Warmup)
## Iteration:  60 / 400 [ 15%]  (Warmup)
## Iteration:  70 / 400 [ 17%]  (Warmup)
## Iteration:  80 / 400 [ 20%]  (Warmup)
## Iteration:  90 / 400 [ 22%]  (Warmup)
## Iteration: 100 / 400 [ 25%]  (Warmup)
## Iteration: 110 / 400 [ 27%]  (Warmup)
## Iteration: 120 / 400 [ 30%]  (Warmup)
## Iteration: 130 / 400 [ 32%]  (Warmup)
## Iteration: 140 / 400 [ 35%]  (Warmup)
## Iteration: 150 / 400 [ 37%]  (Warmup)
## Iteration: 160 / 400 [ 40%]  (Warmup)
## Iteration: 170 / 400 [ 42%]  (Warmup)
## Iteration: 180 / 400 [ 45%]  (Warmup)
## Iteration: 190 / 400 [ 47%]  (Warmup)
## Iteration: 200 / 400 [ 50%]  (Warmup)
## Iteration: 201 / 400 [ 50%]  (Sampling)
## Iteration: 210 / 400 [ 52%]  (Sampling)
## Iteration: 220 / 400 [ 55%]  (Sampling)
## Iteration: 230 / 400 [ 57%]  (Sampling)
## Iteration: 240 / 400 [ 60%]  (Sampling)
## Iteration: 250 / 400 [ 62%]  (Sampling)
## Iteration: 260 / 400 [ 65%]  (Sampling)
## Iteration: 270 / 400 [ 67%]  (Sampling)
## Iteration: 280 / 400 [ 70%]  (Sampling)
## Iteration: 290 / 400 [ 72%]  (Sampling)
## Iteration: 300 / 400 [ 75%]  (Sampling)
## Iteration: 310 / 400 [ 77%]  (Sampling)
## Iteration: 320 / 400 [ 80%]  (Sampling)
## Iteration: 330 / 400 [ 82%]  (Sampling)
## Iteration: 340 / 400 [ 85%]  (Sampling)
## Iteration: 350 / 400 [ 87%]  (Sampling)
## Iteration: 360 / 400 [ 90%]  (Sampling)
## Iteration: 370 / 400 [ 92%]  (Sampling)
## Iteration: 380 / 400 [ 95%]  (Sampling)
## Iteration: 390 / 400 [ 97%]  (Sampling)
## Iteration: 400 / 400 [100%]  (Sampling)
## 
##  Elapsed Time: 0.952 seconds (Warm-up)
##                0.279 seconds (Sampling)
##                1.231 seconds (Total)
## 
## 
## SAMPLING FOR MODEL 'signature_model' NOW (CHAIN 2).
## 
## Gradient evaluation took 0 seconds
## 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Adjust your expectations accordingly!
## 
## 
## Iteration:   1 / 400 [  0%]  (Warmup)
## Iteration:  10 / 400 [  2%]  (Warmup)
## Iteration:  20 / 400 [  5%]  (Warmup)
## Iteration:  30 / 400 [  7%]  (Warmup)
## Iteration:  40 / 400 [ 10%]  (Warmup)
## Iteration:  50 / 400 [ 12%]  (Warmup)
## Iteration:  60 / 400 [ 15%]  (Warmup)
## Iteration:  70 / 400 [ 17%]  (Warmup)
## Iteration:  80 / 400 [ 20%]  (Warmup)
## Iteration:  90 / 400 [ 22%]  (Warmup)
## Iteration: 100 / 400 [ 25%]  (Warmup)
## Iteration: 110 / 400 [ 27%]  (Warmup)
## Iteration: 120 / 400 [ 30%]  (Warmup)
## Iteration: 130 / 400 [ 32%]  (Warmup)
## Iteration: 140 / 400 [ 35%]  (Warmup)
## Iteration: 150 / 400 [ 37%]  (Warmup)
## Iteration: 160 / 400 [ 40%]  (Warmup)
## Iteration: 170 / 400 [ 42%]  (Warmup)
## Iteration: 180 / 400 [ 45%]  (Warmup)
## Iteration: 190 / 400 [ 47%]  (Warmup)
## Iteration: 200 / 400 [ 50%]  (Warmup)
## Iteration: 201 / 400 [ 50%]  (Sampling)
## Iteration: 210 / 400 [ 52%]  (Sampling)
## Iteration: 220 / 400 [ 55%]  (Sampling)
## Iteration: 230 / 400 [ 57%]  (Sampling)
## Iteration: 240 / 400 [ 60%]  (Sampling)
## Iteration: 250 / 400 [ 62%]  (Sampling)
## Iteration: 260 / 400 [ 65%]  (Sampling)
## Iteration: 270 / 400 [ 67%]  (Sampling)
## Iteration: 280 / 400 [ 70%]  (Sampling)
## Iteration: 290 / 400 [ 72%]  (Sampling)
## Iteration: 300 / 400 [ 75%]  (Sampling)
## Iteration: 310 / 400 [ 77%]  (Sampling)
## Iteration: 320 / 400 [ 80%]  (Sampling)
## Iteration: 330 / 400 [ 82%]  (Sampling)
## Iteration: 340 / 400 [ 85%]  (Sampling)
## Iteration: 350 / 400 [ 87%]  (Sampling)
## Iteration: 360 / 400 [ 90%]  (Sampling)
## Iteration: 370 / 400 [ 92%]  (Sampling)
## Iteration: 380 / 400 [ 95%]  (Sampling)
## Iteration: 390 / 400 [ 97%]  (Sampling)
## Iteration: 400 / 400 [100%]  (Sampling)
## 
##  Elapsed Time: 0.862 seconds (Warm-up)
##                0.335 seconds (Sampling)
##                1.197 seconds (Total)
## 
## 
## SAMPLING FOR MODEL 'signature_model' NOW (CHAIN 3).
## 
## Gradient evaluation took 0 seconds
## 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Adjust your expectations accordingly!
## 
## 
## Iteration:   1 / 400 [  0%]  (Warmup)
## Iteration:  10 / 400 [  2%]  (Warmup)
## Iteration:  20 / 400 [  5%]  (Warmup)
## Iteration:  30 / 400 [  7%]  (Warmup)
## Iteration:  40 / 400 [ 10%]  (Warmup)
## Iteration:  50 / 400 [ 12%]  (Warmup)
## Iteration:  60 / 400 [ 15%]  (Warmup)
## Iteration:  70 / 400 [ 17%]  (Warmup)
## Iteration:  80 / 400 [ 20%]  (Warmup)
## Iteration:  90 / 400 [ 22%]  (Warmup)
## Iteration: 100 / 400 [ 25%]  (Warmup)
## Iteration: 110 / 400 [ 27%]  (Warmup)
## Iteration: 120 / 400 [ 30%]  (Warmup)
## Iteration: 130 / 400 [ 32%]  (Warmup)
## Iteration: 140 / 400 [ 35%]  (Warmup)
## Iteration: 150 / 400 [ 37%]  (Warmup)
## Iteration: 160 / 400 [ 40%]  (Warmup)
## Iteration: 170 / 400 [ 42%]  (Warmup)
## Iteration: 180 / 400 [ 45%]  (Warmup)
## Iteration: 190 / 400 [ 47%]  (Warmup)
## Iteration: 200 / 400 [ 50%]  (Warmup)
## Iteration: 201 / 400 [ 50%]  (Sampling)
## Iteration: 210 / 400 [ 52%]  (Sampling)
## Iteration: 220 / 400 [ 55%]  (Sampling)
## Iteration: 230 / 400 [ 57%]  (Sampling)
## Iteration: 240 / 400 [ 60%]  (Sampling)
## Iteration: 250 / 400 [ 62%]  (Sampling)
## Iteration: 260 / 400 [ 65%]  (Sampling)
## Iteration: 270 / 400 [ 67%]  (Sampling)
## Iteration: 280 / 400 [ 70%]  (Sampling)
## Iteration: 290 / 400 [ 72%]  (Sampling)
## Iteration: 300 / 400 [ 75%]  (Sampling)
## Iteration: 310 / 400 [ 77%]  (Sampling)
## Iteration: 320 / 400 [ 80%]  (Sampling)
## Iteration: 330 / 400 [ 82%]  (Sampling)
## Iteration: 340 / 400 [ 85%]  (Sampling)
## Iteration: 350 / 400 [ 87%]  (Sampling)
## Iteration: 360 / 400 [ 90%]  (Sampling)
## Iteration: 370 / 400 [ 92%]  (Sampling)
## Iteration: 380 / 400 [ 95%]  (Sampling)
## Iteration: 390 / 400 [ 97%]  (Sampling)
## Iteration: 400 / 400 [100%]  (Sampling)
## 
##  Elapsed Time: 1.271 seconds (Warm-up)
##                0.26 seconds (Sampling)
##                1.531 seconds (Total)
## 
## 
## SAMPLING FOR MODEL 'signature_model' NOW (CHAIN 4).
## 
## Gradient evaluation took 0 seconds
## 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Adjust your expectations accordingly!
## 
## 
## Iteration:   1 / 400 [  0%]  (Warmup)
## Iteration:  10 / 400 [  2%]  (Warmup)
## Iteration:  20 / 400 [  5%]  (Warmup)
## Iteration:  30 / 400 [  7%]  (Warmup)
## Iteration:  40 / 400 [ 10%]  (Warmup)
## Iteration:  50 / 400 [ 12%]  (Warmup)
## Iteration:  60 / 400 [ 15%]  (Warmup)
## Iteration:  70 / 400 [ 17%]  (Warmup)
## Iteration:  80 / 400 [ 20%]  (Warmup)
## Iteration:  90 / 400 [ 22%]  (Warmup)
## Iteration: 100 / 400 [ 25%]  (Warmup)
## Iteration: 110 / 400 [ 27%]  (Warmup)
## Iteration: 120 / 400 [ 30%]  (Warmup)
## Iteration: 130 / 400 [ 32%]  (Warmup)
## Iteration: 140 / 400 [ 35%]  (Warmup)
## Iteration: 150 / 400 [ 37%]  (Warmup)
## Iteration: 160 / 400 [ 40%]  (Warmup)
## Iteration: 170 / 400 [ 42%]  (Warmup)
## Iteration: 180 / 400 [ 45%]  (Warmup)
## Iteration: 190 / 400 [ 47%]  (Warmup)
## Iteration: 200 / 400 [ 50%]  (Warmup)
## Iteration: 201 / 400 [ 50%]  (Sampling)
## Iteration: 210 / 400 [ 52%]  (Sampling)
## Iteration: 220 / 400 [ 55%]  (Sampling)
## Iteration: 230 / 400 [ 57%]  (Sampling)
## Iteration: 240 / 400 [ 60%]  (Sampling)
## Iteration: 250 / 400 [ 62%]  (Sampling)
## Iteration: 260 / 400 [ 65%]  (Sampling)
## Iteration: 270 / 400 [ 67%]  (Sampling)
## Iteration: 280 / 400 [ 70%]  (Sampling)
## Iteration: 290 / 400 [ 72%]  (Sampling)
## Iteration: 300 / 400 [ 75%]  (Sampling)
## Iteration: 310 / 400 [ 77%]  (Sampling)
## Iteration: 320 / 400 [ 80%]  (Sampling)
## Iteration: 330 / 400 [ 82%]  (Sampling)
## Iteration: 340 / 400 [ 85%]  (Sampling)
## Iteration: 350 / 400 [ 87%]  (Sampling)
## Iteration: 360 / 400 [ 90%]  (Sampling)
## Iteration: 370 / 400 [ 92%]  (Sampling)
## Iteration: 380 / 400 [ 95%]  (Sampling)
## Iteration: 390 / 400 [ 97%]  (Sampling)
## Iteration: 400 / 400 [100%]  (Sampling)
## 
##  Elapsed Time: 0.924 seconds (Warm-up)
##                0.4 seconds (Sampling)
##                1.324 seconds (Total)
plot_exposure_posteriors_bleed(signit_exposures)

plot_nnls_solution(signit_exposures) %>% print

deconstructsigs_data <- as.data.frame(simulated_catalog) %>% 
  mutate(sample = 'x', count = count/sum(count)) %>%
  spread(mutation_type, count) %>%
  column_to_rownames('sample')
dsigs_signatures <- known_ref %>%
  as.data.frame %>%
  gather(signature, probability, -mutation_type) %>%
  spread(mutation_type, probability) %>%
  mutate(
    signature = gsub(' ', '.', signature),
    signature = factor(signature, levels = paste('Signature', 1:30, sep = '.'))
  ) %>%
  arrange(signature) %>%
  column_to_rownames('signature')
dsigs_exposures <- whichSignatures(deconstructsigs_data, signatures.ref = dsigs_signatures)
deconstructSigs::makePie(dsigs_exposures)

sigest_exposures <- suboptimalSigExposures(
  m = simulated_catalog$count %>% as.matrix(),
  P = reference_signatures_as_matrix(known_ref, simulated_catalog),
  R = 1000,
) %>%
  .$exposures %>%
  as.data.frame() %>%
  rownames_to_column('signature') %>%
  gather(replicate, exposure, -signature) %>%
  as_tibble()
  
exposures %>% filter(exposure != 0)
## # A tibble: 8 x 2
##   signature    exposure
##   <chr>           <dbl>
## 1 Signature 2      394.
## 2 Signature 3     1292.
## 3 Signature 8      734.
## 4 Signature 10     873.
## 5 Signature 18     663.
## 6 Signature 21    1607.
## 7 Signature 27    2241.
## 8 Signature 28    2195.
true_exposure_plot <- exposures %>%
  mutate(
    signature = gsub('Signature ', '', signature) %>% factor(levels = 1:30)
  ) %>%
  ggplot(aes(
    x = signature,
    y = exposure
  )) +
  geom_point() +
  labs(
    y = 'Exposure\n(Mutations)'
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

sigest_exposure_plot <- sigest_exposures %>%
  mutate(
    signature = gsub('Signature ', '', signature) %>% factor(levels = 1:30),
    exposure = exposure * n_mutations
  ) %>%
  ggplot(aes(
    x = signature,
    y = exposure
  )) +
  geom_violin() +
  labs(
    y = 'Exposure\n(Mutations)'
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

dsigs_exposure_plot <- dsigs_exposures$weights %>%
  as.data.frame %>%
  gather('signature', 'exposure') %>%
  bind_rows(tribble(
    ~signature, ~exposure,
    'Unknown', 1 - sum(dsigs_exposures$weights)
  )) %>%
  mutate(
    signature = gsub('Signature.', '', signature) %>% factor(levels = c(1:30, 'Unknown')),
    exposure = exposure * n_mutations
  ) %>%
  ggplot(aes(
    x = signature,
    y = exposure
  )) +
  geom_point() +
  labs(
    y = 'Exposure\n(Mutations)'
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

signit_exposures$exposure_chain %>%
  mutate(signature = gsub('Signature ', '', signature) %>% factor(levels = 1:30)) %>%
  ggplot(aes(
    x = signature,
    y = exposure
  )) +
  geom_violin() +
  scale_x_discrete(drop = F) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

signit_exposure_plot <- plot_exposure_posteriors(signit_exposures) +
  scale_x_discrete(drop = F) +
  labs(
    y = 'Exposure\n(Mutations)'
  )

plot_grid(
  true_exposure_plot,
  dsigs_exposure_plot,
  sigest_exposure_plot,
  signit_exposure_plot,
  ncol = 1,
  rel_heights = c(1, 1.2, 1, 1)
)

plot_signature <- function(catalog) {
  catalog %>%
    mutate(
      context = gsub('(.)\\[(.)>(.)\\](.)', '\\1\\4', mutation_type),
      base_change = gsub('(.)\\[(.)>(.)\\](.)', '\\2>\\3', mutation_type)
    ) %>%
    ggplot(aes(
      x = context,
      y = count
    )) +
    facet_grid(. ~ base_change) +
    geom_bar(stat = 'identity')
}

collapsed_exposures = collapse_signatures_by_bleed(signit_exposures)
## Max bleed was 0.777. Merging Signature 22 and Signature 25.
## Sampling
## 28 signature solution obtained.
## Max bleed was 0.471. Merging Signature 21 and Signature 26.
## Sampling
## 27 signature solution obtained.
## Max bleed was 0.456. Merging Signature 18 and Signature 29.
## Sampling
## 26 signature solution obtained.
## Max bleed was 0.461. Merging Signature 24 and Signature 18/Signature 29.
## Sampling
## 25 signature solution obtained.
## Max bleed was 0.401. Merging Signature 3 and Signature 8.
## Sampling
## 24 signature solution obtained.
## Choosing signature set with maximum WAIC
## 25 signature solution chosen
collapsed_exposures_signit_plot <- plot_exposure_posteriors_bleed(collapsed_exposures)

plot_grid(
  true_exposure_plot,
  collapsed_exposures_signit_plot,
  ncol = 1,
  rel_heights = c(1,2)
)

exposures %>%
  filter(exposure > 0)
## # A tibble: 8 x 2
##   signature    exposure
##   <chr>           <dbl>
## 1 Signature 2      394.
## 2 Signature 3     1292.
## 3 Signature 8      734.
## 4 Signature 10     873.
## 5 Signature 18     663.
## 6 Signature 21    1607.
## 7 Signature 27    2241.
## 8 Signature 28    2195.
merged_signature_names = with(
  collapsed_exposures, 
  signature_names[! signature_names %in% signit_exposures$signature_names]
)

merged_signature_table <- collapsed_exposures$reference_signatures %>%
  gather(signature, probability, -mutation_type) %>%
  filter(signature %in% merged_signature_names)

omitted_signature_spectrum <- reference_signatures[, c('mutation_type', omitted_signature)] %>%
  gather(signature, probability, -mutation_type) %>%
  mutate(signature = paste('Hidden: ', signature))

merged_signature_table %>%
  bind_rows(omitted_signature_spectrum) %>%
  mutate(
    context = gsub('(.)\\[.>.\\](.)', '\\1\\2', mutation_type),
    base_change = gsub('.\\[(.>.)\\].', '\\1', mutation_type),
    signature = gsub('Signature ', '', signature)
  ) %>%
  ggplot(aes(
    x = context,
    y = probability
  )) +
  facet_grid(signature ~ base_change) +
  geom_bar(stat = 'identity') +
  theme(
    axis.text.x = element_text(angle = 90, size = 6, hjust = 1, vjust = 0.5)
  )