script <- system.file("examples/atac_dnase_chip_example.R",package = "EpiCompare")
source(script)
htmltools::includeHTML("https://github.com/neurogenomics/EpiCompare/releases/download/1.3.3/atac_dnase_chip_example.html")
EpiCompareReport
knitr::opts_chunk$set(echo = TRUE, 
                      message = params$debug,
                      warning = params$debug,
                      cache = FALSE, 
                      error = params$error,
                      results = if(isTRUE(params$interact)) 'asis' else 'markup')
### Create Output Directory ###
# if save_output = TRUE
if(params$save_output){
  outfile_dir <- file.path(params$output_dir,"EpiCompare_file")
  if(!dir.exists(outfile_dir)){
    dir.create(outfile_dir, showWarnings = FALSE, recursive = TRUE)
  }
}

#### ------ Prepare genome builds ------ ####
# e.g. genome_build <- list(reference="hg19",peakfiles="hg38",blacklist="hg19")
# or... genome_build <- "hg19"
builds <- prepare_genome_builds(genome_build = params$genome_build,
                                blacklist = params$blacklist)
## Standardise all data to hg19 build 
output_build <- prepare_output_build(params$genome_build_output)

#### ------ Prepare peaklist(s) ------ ####
# and check that the list is named, if not default filenames are used 
peaklist <- prepare_peaklist(peaklist = params$peakfiles) 
peaklist <- liftover_grlist(grlist = peaklist, 
                            input_build = builds$peaklist,
                            output_build = output_build)

#### ------ Prepare reference(s) ------ ####
reference <- prepare_reference(reference = params$reference)
reference <- liftover_grlist(grlist = reference, 
                            input_build = builds$reference,
                            output_build = output_build)

#### ------ Prepare blacklist ------ ####
blacklist <- prepare_blacklist(blacklist = params$blacklist,
                               output_build = output_build,
                               blacklist_build = builds$blacklist) 

### Standardise peaklist(s) ###
peaklist <- tidy_peakfile(peaklist = peaklist,
                          blacklist = blacklist)
peaklist_tidy <- peaklist

### Standardise reference(s) ###
# and include in peaklist
reference_tidy <- reference
if (!is.null(reference)){
  reference_tidy <- tidy_peakfile(peaklist = reference,
                                  blacklist = blacklist)
  peaklist_tidy <- c(peaklist_tidy, reference_tidy)
}

### Obtain Genome Annotation ### 
txdb <- check_genome_build(genome_build = output_build)

### Dynamic Figure Height ###  
fig_height <- fig_length(default_size = 7,
                         number_of_items = length(peaklist_tidy),
                         max_items = 10)

needs_ref <- function(arg,
                      reference=NULL){ 
  if(isTRUE(arg)){
    if(is.null(reference)){
      cat("NOTE: This plot is not generated when reference=NULL.")
      return(FALSE)
    } else{
      return(TRUE)
    }
  } else {
    return(FALSE)
  }
}

EpiCompare compares epigenomic datasets for quality control and benchmarking purposes.

Summary

Table of Contents

The report consists of three sections:

  1. General Metrics: Metrics on peaks for each sample: % blacklisted and non-standard peaks, peak widths, fragments, duplication rates.

  2. Peak Overlap: Frequency, percentage, statistical significance of overlapping and non-overlapping peaks. This also includes upset, precision-recall, and correlation plots.

  3. Functional Annotation: Functional annotation (ChromHMM analysis, peak annotation, and enrichment analysis) of peaks. This also includes peak distributions around transcription start sites (TSS).

Input Datasets

  • Reference peakfile: Dnase_ENCFF185XRG_reference
  • Total of 4 peak files:
# print peak file names and numerate 
cat(paste0(" - File ",seq_len(length(names(peaklist_tidy))),": ",
        names(peaklist_tidy),
        collapse = "\n\n"))
  • File 1: ATAC_ENCFF558BLC

  • File 2: Dnase_ENCFF274YGF

  • File 3: ChIP_ENCFF038DDS

  • File 4: Dnase_ENCFF185XRG_reference

Processed, filtered and lifted files for: peaklist, reference, blacklist

download_button(object = list(peaklist = peaklist,
                              reference = reference_tidy,
                              blacklist = blacklist),
                save_output = params$save_output, 
                filename = paste0("processed_peakfiles_",output_build), 
                outfile_dir = outfile_dir)

Command

The EpiCompare function call used to generate the report:

cmd <- report_command(params = params, 
                      peaklist_tidy = peaklist_tidy,
                      reference_tidy = reference_tidy)
cat(cmd)
EpiCompare(peakfiles = list('ATAC_ENCFF558BLC','Dnase_ENCFF274YGF','ChIP_ENCFF038DDS','Dnase_ENCFF185XRG_reference'),
           genome_build =  list('hg38'),
           genome_build_output = 'hg38',
           blacklist = NULL,
           picard_files = list(),
           reference = 'Dnase_ENCFF185XRG_reference',
           upset_plot = TRUE,
           stat_plot = TRUE,
           chromHMM_plot = TRUE,
           chromHMM_annotation = 'K562',
           chipseeker_plot = TRUE,
           enrichment_plot = TRUE,
           tss_plot = TRUE,
           tss_distance = c(-3000,3000),
           precision_recall_plot = TRUE,
           n_threshold = 20,
           corr_plot = TRUE,
           interact = TRUE,
           save_output = TRUE,
           output_dir = '/var/folders/zq/h7mtybc533b1qzkys_ttgpth0000gn/T//RtmphV63Tj',
           workers = 1,
           error = FALSE)

1 General Metrics

1.1 Peak Information

1.1.1 Column descriptions

  • PeakN before tidy: Total number of peaks including those blacklisted and those in non-standard chromosomes.
  • Blacklisted peaks removed (%): Percentage of blacklisted peaks in samples. e.g. ENCODE hg19 blacklist includes regions in the hg19 genome that have anomalous and/or unstructured signals independent of the cell-line or experiment.
  • Non-standard peaks removed (%): Percentage of peaks in non-standard and/or mitochondrial chromosomes. Identified using BRGenomics::tidyChromosomes().
  • PeakN after tidy: Total number of peaks after removing those in blacklisted regions and non-standard chromosomes.

:information: Note: All EpiCompare analyses conducted on the peak files after they have been filtered (i.e. blacklisted regions and non-standard chromosomes removed) and lifted (as needed).


peak_info_df <- peak_info(peaklist = peaklist,
                          blacklist = blacklist)
download_button(object = peak_info_df,
                filename = "peak_info_df", 
                output_extension = ".csv")
# Print table 
knitr::kable(peak_info_df, format = "markdown") 
PeakN Before Tidy Blacklisted Peaks Removed (%) Non-standard Peaks Removed (%) PeakN After Tidy
ATAC_ENCFF558BLC 12459 0 0 12459
Dnase_ENCFF274YGF 12452 0 0 12452
ChIP_ENCFF038DDS 5221 0 0 5221
save_output(save_output = params$save_output,
            file = peak_info_df,
            file_type = "data.frame",
            filename = "peak_info",
            outpath = outfile_dir) 
remove(peak_info_df)

1.2 Fragment Information

Metrics on fragments is shown only if Picard summary is provided. See manual for help.

1.2.1 Column descriptions

  • Mapped_Fragments: Number of mapped read pairs in the file.
  • Duplication_Rate: Percentage of mapped sequence that is marked as duplicate.
  • Unique_Fragments: Number of mapped sequence that is not marked as duplicate.


if (!is.null(params$picard_files)){
  fragment_info_df <- fragment_info(picard_list = params$picard_files)
  download_button(object = fragment_info_df,
                  filename = "fragment_info",
                  output_extension = ".csv")
  # Print data frame
  knitr::kable(fragment_info_df, format = "markdown") 
  save_output(save_output = params$save_output,
              file = fragment_info_df,
              file_type = "data.frame",
              filename = "fragment_info",
              outpath = outfile_dir) 
  remove(fragment_info_df)
}   

1.3 Peak widths

Distribution of peak widths in samples


width_plot <- width_boxplot(peaklist = peaklist_tidy, 
                            interact = params$interact)
download_button(object = width_plot,
                filename = "width_boxplot",
                self_contained = params$interact,
                add_download_button = params$add_download_button)

NULL

width_plot$plot
# Save boxplot
save_output(save_output = params$save_output,
            file = width_plot$plot,
            file_type = "ggplot",
            filename = "width_plot",
            outpath = outfile_dir, 
            interactive = params$interact)
# Remove variable
remove(width_plot)

2 Peak Overlap

2.1 Percentage Overlap

Percentage of overlapping peaks between samples. Hover over heatmap for percentage values.

The heatmap can be interpreted as follows:

  • Recall: y-axis: Percent of peaks in y-axis samples that overlap with peaks in x-axis samples. in the x-axis samples.
  • Precision: x-axis: Percent of peaks in x-axis samples that overlap with peaks in y-axis samples.


overlap_heatmap <- overlap_heatmap(peaklist = peaklist_tidy,
                                   interact = params$interact)
download_button(object = overlap_heatmap,
                filename = "overlap_heatmap", 
                self_contained = params$interact,add_download_button = params$add_download_button)

NULL

overlap_heatmap$plot
# Save output 
save_output(save_output = params$save_output,
            file = overlap_heatmap$plot,
            file_type = "ggplot",
            filename = "samples_percent_overlap",
            outpath = outfile_dir,
            interactive = params$interact)
# Delete variable
remove(overlap_heatmap)

2.2 Upset Plot

Upset plot of overlapping peaks between samples. See here on how to interpret the plot.

upset_plot <- NULL
if(isTRUE(params$upset_plot)){
  upset_plot <- overlap_upset_plot(peaklist = peaklist_tidy)
  download_button(object = upset_plot,
                  filename = "upset_plot") 
  save_output(save_output = params$save_output,
              file = upset_plot,
              file_type = "image",
              filename = "upset_plot",
              outpath = outfile_dir)
}

Error : C stack usage 14613382 is too close to the limit quartz_off_screen 2

upset_plot

$plot $data peak ATAC_ENCFF558BLC ChIP_ENCFF038DDS Dnase_ENCFF185XRG_reference Dnase_ENCFF274YGF 1 1 1 0 1 1 2 2 1 1 1 1 3 3 1 0 0 0 4 4 1 0 1 0 5 5 1 0 1 0 6 6 1 0 0 0 7 7 1 0 0 0 8 8 1 1 1 1 9 9 1 0 1 0 10 10 1 1 1 1 11 11 1 0 1 1 12 12 1 0 0 1 13 13 1 1 1 1 14 14 1 1 1 1 15 15 1 1 1 1 16 16 1 0 0 0 17 17 1 0 0 0 18 18 1 1 1 0 19 19 1 1 1 1 20 20 1 1 1 1 21 21 1 0 0 0 22 22 1 0 0 0 23 23 1 0 0 0 24 24 1 1 0 0 25 25 1 1 1 1 26 26 1 1 1 1 27 27 1 0 1 0 28 28 1 0 0 0 29 29 1 0 1 0 30 30 1 0 1 1 31 31 1 0 0 0 32 32 1 0 1 1 33 33 1 0 0 0 34 34 1 1 1 1 35 35 1 0 1 1 36 36 1 0 1 1 37 37 1 0 0 0 38 38 1 1 1 1 39 39 1 0 0 0 40 40 1 0 0 0 41 41 1 0 1 1 42 42 1 0 0 0 43 43 1 1 1 1 44 44 1 1 1 1 45 45 1 0 0 0 46 46 1 0 0 0 47 47 1 0 1 1 48 48 1 0 1 1 49 49 1 0 0 0 50 50 1 0 1 1 51 51 1 1 1 1 52 52 1 1 1 1 53 53 1 1 1 0 54 54 1 0 1 0 55 55 1 0 1 1 56 56 1 1 0 0 57 57 1 0 1 1 58 58 1 1 1 1 59 59 1 0 1 1 60 60 1 0 0 0 61 61 1 0 1 0 62 62 1 0 1 0 63 63 1 1 1 1 64 64 1 0 0 0 65 65 1 0 1 1 66 66 1 1 1 1 67 67 1 0 0 0 68 68 1 0 1 0 69 69 1 0 1 1 70 70 1 0 0 0 71 71 1 1 1 1 72 72 1 0 0 0 73 73 1 0 1 1 74 74 1 1 1 1 75 75 1 0 0 0 76 76 1 0 0 0 77 77 1 0 0 0 78 78 1 0 1 0 79 79 1 0 1 1 80 80 1 0 1 0 81 81 1 0 0 0 82 82 1 0 0 0 83 83 1 0 1 1 84 84 1 0 0 0 85 85 1 0 1 0 86 86 1 1 1 1 87 87 1 0 1 1 88 88 1 0 1 1 89 89 1 0 0 0 90 90 1 1 1 1 91 91 1 0 1 1 92 92 1 0 1 1 93 93 1 0 0 0 94 94 1 0 1 1 95 95 1 1 1 1 96 96 1 0 0 0 97 97 1 0 0 0 98 98 1 1 1 1 99 99 1 1 1 1 100 100 1 1 1 1 101 101 1 1 1 1 102 102 1 0 1 1 103 103 1 0 1 1 104 104 1 0 0 0 105 105 1 0 0 0 106 106 1 0 1 1 107 107 1 0 1 1 108 108 1 0 1 1 109 109 1 0 1 0 110 110 1 1 1 1 111 111 1 0 0 0 112 112 1 0 1 1 113 113 1 0 1 1 114 114 1 1 1 1 115 115 1 1 1 1 116 116 1 0 1 1 117 117 1 1 1 1 118 118 1 0 1 0 119 119 1 0 1 1 120 120 1 1 1 1 121 121 1 0 0 0 122 122 1 1 1 1 123 123 1 1 1 1 124 124 1 1 1 1 125 125 1 0 0 0 126 126 1 0 0 0 127 127 1 0 1 1 128 128 1 0 1 1 129 129 1 0 0 0 130 130 1 1 1 1 131 131 1 0 0 0 132 132 1 0 0 1 133 133 1 1 1 1 134 134 1 0 1 0 135 135 1 0 0 0 136 136 1 0 1 1 137 137 1 1 1 1 138 138 1 0 0 0 139 139 1 1 1 1 140 140 1 0 1 1 141 141 1 0 1 1 142 142 1 0 1 1 143 143 1 0 1 1 144 144 1 0 0 0 145 145 1 0 0 0 146 146 1 0 0 0 147 147 1 0 0 0 148 148 1 0 1 1 149 149 1 0 1 1 150 150 1 0 1 1 151 151 1 0 1 1 152 152 1 0 1 1 153 153 1 0 1 0 154 154 1 0 1 0 155 155 1 0 1 0 156 156 1 1 1 1 157 157 1 0 0 0 158 158 1 0 1 1 159 159 1 0 1 1 160 160 1 0 1 0 161 161 1 0 1 1 162 162 1 0 1 1 163 163 1 0 0 0 164 164 1 0 0 0 165 165 1 0 1 1 166 166 1 0 1 1 167 167 1 0 1 0 168 168 1 1 1 1 169 169 1 0 1 1 170 170 1 0 1 1 171 171 1 0 0 0 172 172 1 0 1 0 173 173 1 0 0 0 174 174 1 0 1 1 175 175 1 0 0 0 176 176 1 0 1 0 177 177 1 0 1 1 178 178 1 0 1 1 179 179 1 0 1 1 180 180 1 0 0 0 181 181 1 0 0 0 182 182 1 0 1 1 183 183 1 1 1 1 184 184 1 0 1 1 185 185 1 0 1 0 186 186 1 0 1 0 187 187 1 0 0 0 188 188 1 0 0 0 189 189 1 0 0 0 190 190 1 0 0 0 191 191 1 0 0 0 192 192 1 0 0 0 193 193 1 0 0 0 194 194 1 0 0 0 195 195 1 0 0 0 196 196 1 0 1 1 197 197 1 0 0 0 198 198 1 0 1 1 199 199 1 0 1 1 200 200 1 0 0 0 [ reached ‘max’ / getOption(“max.print”) – omitted 46411 rows ]

remove(upset_plot)

2.3 Statistical Significance

Depending on the format of the reference file, EpiCompare produces different plots:

  • Reference dataset has BED6+4 format (peak calling performed with MACS2): EpiCompare generates paired boxplot per sample showing the distribution of -log10(q-value) of reference peaks that are overlapping and non-overlapping with the sample dataset.
  • Reference dataset does not have BED6+4 format: EpiCompare generates a barplot of percentage of overlapping sample peaks with the reference. Statistical significance (adjusted p-value) is written above each bar.

Reference peakfile: Dnase_ENCFF185XRG_reference

2.3.0.1 Keys

  • Overlap: Sample peaks in Reference peaks
  • Unique: Sample peaks not in Reference peaks


if (needs_ref(params$stat_plot, reference)){
  stat_plot <- overlap_stat_plot(reference = reference_tidy,
                                 peaklist = peaklist,
                                 txdb = txdb, 
                                 interact = params$interact)
  download_button(object = stat_plot,
                  filename = "overlap_stat_plot", 
                  button_label = "Download overlap stat plot",
                  self_contained = params$interact,
                  add_download_button = params$add_download_button)
  stat_plot$plot
  # Save output 
  save_output(save_output = params$save_output,
              file = stat_plot$plot,
              file_type = "ggplot",
              filename = "stat_plot",
              outpath = outfile_dir, 
              interactive = params$interact)
  # Remove variables
  remove(stat_plot)
}

2.4 Precision-Recall Curves

The first plot shows the balance between precision and recall across multiple peak calling stringency thresholds.

  • Precision: Number of sample peaks in reference.
  • Recall: Number of reference peaks in sample.

The second plot shows F1 score (a score that combines precision and recall) across the different peak calling stringency thresholds.

  • F1: 2*(precision*recall) / (precision+recall)
pr_out <- NULL
if(needs_ref(params$precision_recall_plot, reference)){
    #### Create save path ####
    save_path <- if(isFALSE(params$save_output)){NULL}else{
        file.path(outfile_dir,"precision_recall.csv")
    }
    pr_out <- plot_precision_recall(peakfiles = peaklist, 
                                    reference = reference_tidy,
                                    n_threshold = params$n_threshold,
                                    workers = params$workers,
                                    show_plot = FALSE,
                                    verbose = FALSE,
                                    save_path = save_path, 
                                    interact = params$interact)
    download_button(object = pr_out,
                    filename = "precision_recall", 
                    self_contained = params$interact,
                    add_download_button = params$add_download_button)  
}

NULL

pr_out$precision_recall_plot
cat("\n\n")
pr_out$f1_plot
remove(pr_out)

2.5 Correlation Plot

The correlation plot shows the correlation between the quantiles when the genome is binned at a set size. These quantiles are based on the intensity of the peak, dependent on the peak caller used (q-value for MACS2):

cp_out <- NULL
if(isTRUE(params$corr_plot)){
    #### Create save path ####
    save_path <- if(isFALSE(params$save_output)){NULL}else{
        file.path(outfile_dir,"corr.csv.gz")
    }
    cp_out <- plot_corr(peakfiles = peaklist_tidy, 
                        # reference can be NULL
                        reference = reference_tidy,
                        genome_build = output_build,
                        bin_size = params$bin_size,
                        workers = params$workers,
                        show_plot = FALSE,
                        save_path = save_path,
                        interact = params$interact)  
    download_button(object = cp_out,
                    filename = "correlation_plot", 
                    self_contained = params$interact,add_download_button = params$add_download_button)
}

NULL

cp_out$corr_plot
remove(cp_out)

3 Functional Annotation

3.1 ChromHMM

ChromHMM annotates and characterises peaks into different chromatin states.

ChromHMM annotations used in EpiCompare were obtained from here.

  • Cell-type annotation file used in this analysis: K562

ChromHMM annotation definitions:

For more information on ChromHMM states, please see here

  1. Active promoter: being transcribed
  2. Weak promoter: less transcriptional activity
  3. Poised promoter: ready for transcriptional activity
  4. Strong enhancer: activate transcription at high level
  5. Strong enhancer: activate transcription at high level
  6. Weak enhancer: activate transcription at low level
  7. Weak enhancer: activate transcription at low level
  8. Insulator: block transcription
  9. Txn elongation: transcription elongation
  10. Txn elongation: transcription elongation
  11. Weak Txn: weak transcription
  12. Repressed: decreased transcription
  13. Heterochrom.Io: heterochromatin and/or low signal
  14. Repetitive CNV: repetitive or copy number variation
  15. Repetitive CNV: repetitive or copy number variation


3.1.1 All samples

ChromHMM annotation of individual samples.

samples_chromHMM <- NULL
if(isTRUE(params$chromHMM_plot)){
  # Get ChromHMM annotation file 
  chromHMM_list <- get_chromHMM_annotation(cell_line = params$chromHMM_annotation)
  # Plot chromHMM
  samples_chromHMM <- plot_chromHMM(peaklist = peaklist_tidy, 
                                    chromHMM_annotation = chromHMM_list, 
                                    genome_build = output_build,
                                    interact = params$interact)
  download_button(object = samples_chromHMM,
                  filename = "samples_ChromHMM",
                  self_contained = params$interact,add_download_button = params$add_download_button) 
  save_output(save_output = params$save_output,
              file = samples_chromHMM,
              file_type = "ggplot",
              filename = "samples_ChromHMM",
              outpath = outfile_dir,
              interactive = params$interact)  
}
samples_chromHMM
remove(samples_chromHMM)

3.1.2 Overlap: Sample peaks in Reference peaks

Percentage of Sample peaks found in Reference peaks (Reference peakfile: Dnase_ENCFF185XRG_reference)

if(needs_ref(params$chromHMM_plot, reference)){
  # generate data frame of percentage overlap 
  sample_in_ref_df <- overlap_percent(peaklist1 = peaklist_tidy,
                                      peaklist2 = reference_tidy,
                                      invert = FALSE)
  download_button(object = sample_in_ref_df,
                  filename = "sample_in_ref_df",
                  output_extension = ".csv")
  knitr::kable(sample_in_ref_df, format = "markdown") 
}
query subject overlap total Percentage type peaklist1 peaklist2
ATAC_ENCFF558BLC Dnase_ENCFF185XRG_reference 8604 12459 69.1 precision ATAC_ENCFF558BLC Dnase_ENCFF185XRG_reference
Dnase_ENCFF274YGF Dnase_ENCFF185XRG_reference 10226 12452 82.1 precision Dnase_ENCFF274YGF Dnase_ENCFF185XRG_reference
ChIP_ENCFF038DDS Dnase_ENCFF185XRG_reference 3760 5221 72.0 precision ChIP_ENCFF038DDS Dnase_ENCFF185XRG_reference
Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference 16479 16479 100.0 precision Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference
Dnase_ENCFF185XRG_reference ATAC_ENCFF558BLC 10891 16479 66.1 recall ATAC_ENCFF558BLC Dnase_ENCFF185XRG_reference
Dnase_ENCFF185XRG_reference Dnase_ENCFF274YGF 10190 16479 61.8 recall Dnase_ENCFF274YGF Dnase_ENCFF185XRG_reference
Dnase_ENCFF185XRG_reference ChIP_ENCFF038DDS 6038 16479 36.6 recall ChIP_ENCFF038DDS Dnase_ENCFF185XRG_reference
Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference 16479 16479 100.0 recall Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference

ChromHMM annotation of sample peaks found in reference peaks.

sample_in_ref_chromHMM <- NULL
if(needs_ref(params$chromHMM_plot, reference)){ 
  # Obtain overlapping peaks 
  sample_in_ref_list <- mapply(peaklist_tidy, FUN=function(file){
      IRanges::subsetByOverlaps(x = file, 
                                ranges = reference_tidy[[1]])
  }) 
  # Run ChromHMM
  sample_in_ref_chromHMM <- plot_chromHMM(peaklist = sample_in_ref_list,
                                          chromHMM_annotation = chromHMM_list,
                                          genome_build = output_build,
                                          interact = params$interact)
  download_button(object = sample_in_ref_chromHMM,
                filename = "sample_in_ref_chromHMM", 
                self_contained = params$interact,add_download_button = params$add_download_button) 
  save_output(save_output = params$save_output,
              file = sample_in_ref_chromHMM,
              file_type = "ggplot",
              filename = "sample_in_ref_ChromHMM",
              outpath = outfile_dir,
              interactive = params$interact)  
}
sample_in_ref_chromHMM
remove(sample_in_ref_chromHMM)

3.1.3 Overlap: Reference peaks in Sample peaks

Percentage of Reference peaks found in Sample peaks (Reference peakfile: Dnase_ENCFF185XRG_reference)

if (needs_ref(params$chromHMM_plot, reference)){
    # Data frame of overlapping peaks 
    ref_in_sample_df <- overlap_percent(peaklist1 = reference_tidy,
                                        peaklist2 = peaklist_tidy,
                                        invert = FALSE)
   download_button(object = ref_in_sample_df,
                  filename = "ref_in_sample_df",
                  output_extension = ".csv")
   knitr::kable(ref_in_sample_df, format = "markdown")
}
query subject overlap total Percentage type peaklist1 peaklist2
Dnase_ENCFF185XRG_reference ATAC_ENCFF558BLC 10891 16479 66.1 precision Dnase_ENCFF185XRG_reference ATAC_ENCFF558BLC
Dnase_ENCFF185XRG_reference Dnase_ENCFF274YGF 10190 16479 61.8 precision Dnase_ENCFF185XRG_reference Dnase_ENCFF274YGF
Dnase_ENCFF185XRG_reference ChIP_ENCFF038DDS 6038 16479 36.6 precision Dnase_ENCFF185XRG_reference ChIP_ENCFF038DDS
Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference 16479 16479 100.0 precision Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference
ATAC_ENCFF558BLC Dnase_ENCFF185XRG_reference 8604 12459 69.1 recall Dnase_ENCFF185XRG_reference ATAC_ENCFF558BLC
Dnase_ENCFF274YGF Dnase_ENCFF185XRG_reference 10226 12452 82.1 recall Dnase_ENCFF185XRG_reference Dnase_ENCFF274YGF
ChIP_ENCFF038DDS Dnase_ENCFF185XRG_reference 3760 5221 72.0 recall Dnase_ENCFF185XRG_reference ChIP_ENCFF038DDS
Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference 16479 16479 100.0 recall Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference

ChromHMM annotation of reference peaks found in sample peaks.

ref_in_sample_chromHMM <- NULL
if (needs_ref(params$chromHMM_plot, reference)){
  # Subset overlapping peaks
  ref_in_sample_list <- mapply(peaklist_tidy, FUN = function(file){
    IRanges::subsetByOverlaps(x = reference_tidy[[1]], 
                              ranges = file)
    }) 
  # Plot ChromHMM
  ref_in_sample_chromHMM <- plot_chromHMM(peaklist = ref_in_sample_list,
                                          chromHMM_annotation = chromHMM_list,
                                          genome_build = output_build,
                                          interact = params$interact)
  download_button(object = ref_in_sample_chromHMM,
                  filename = "ref_in_sample_chromHMM", 
                  self_contained = params$interact,add_download_button = params$add_download_button)  
  save_output(save_output = params$save_output,
              file = ref_in_sample_chromHMM,
              file_type = "ggplot",
              filename = "ref_in_sample_ChromHMM",
              outpath = outfile_dir,
              interactive = params$interact) 
}
ref_in_sample_chromHMM
remove(ref_in_sample_chromHMM)

3.1.4 Unique: Sample peaks not in Reference peaks

Percentage of sample peaks not found in reference peaks (Reference peakfile: Dnase_ENCFF185XRG_reference)

if (needs_ref(params$chromHMM_plot, reference)){
  # Data frame of non-overlapping peaks 
  sample_not_in_ref_df <- overlap_percent(peaklist1 = peaklist_tidy,
                                          peaklist2 = reference_tidy,
                                          invert = TRUE)
  download_button(object = sample_not_in_ref_df,
                  filename = "sample_not_in_ref_df", 
                  output_extension = ".csv")
  knitr::kable(sample_not_in_ref_df, format = "markdown")
}
query subject overlap total Percentage type peaklist1 peaklist2
ATAC_ENCFF558BLC Dnase_ENCFF185XRG_reference 3855 12459 30.9 precision ATAC_ENCFF558BLC Dnase_ENCFF185XRG_reference
Dnase_ENCFF274YGF Dnase_ENCFF185XRG_reference 2226 12452 17.9 precision Dnase_ENCFF274YGF Dnase_ENCFF185XRG_reference
ChIP_ENCFF038DDS Dnase_ENCFF185XRG_reference 1461 5221 28.0 precision ChIP_ENCFF038DDS Dnase_ENCFF185XRG_reference
Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference 0 16479 0.0 precision Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference
Dnase_ENCFF185XRG_reference ATAC_ENCFF558BLC 5588 16479 33.9 recall ATAC_ENCFF558BLC Dnase_ENCFF185XRG_reference
Dnase_ENCFF185XRG_reference Dnase_ENCFF274YGF 6289 16479 38.2 recall Dnase_ENCFF274YGF Dnase_ENCFF185XRG_reference
Dnase_ENCFF185XRG_reference ChIP_ENCFF038DDS 10441 16479 63.4 recall ChIP_ENCFF038DDS Dnase_ENCFF185XRG_reference
Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference 0 16479 0.0 recall Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference

ChromHMM annotation of sample peaks not found in reference peaks.

sample_not_in_ref_chromHMM <- NULL
if (needs_ref(params$chromHMM_plot, reference)){
  sample_not_in_ref_list <- mapply(peaklist_tidy, FUN = function(file){
    IRanges::subsetByOverlaps(x = file, 
                              ranges = reference_tidy[[1]], 
                              invert = TRUE)
    }) 
  # Run ChromHMM
  sample_not_in_ref_chromHMM<-plot_chromHMM(peaklist = sample_not_in_ref_list,
                                            chromHMM_annotation = chromHMM_list,
                                            genome_build = output_build,
                                            interact = params$interact)
  download_button(object = sample_not_in_ref_chromHMM,
                 filename = "sample_not_in_ref_chromHMM", 
                 self_contained = params$interact,add_download_button = params$add_download_button)  
  save_output(save_output = params$save_output,
              file = sample_not_in_ref_chromHMM,
              file_type = "ggplot",
              filename = "sample_not_in_ref_ChromHMM",
              outpath = outfile_dir,
              interactive = params$interact)  
} 
sample_not_in_ref_chromHMM
remove(sample_not_in_ref_chromHMM)

3.1.5 Unique: Reference peaks not in Sample peaks

Percentage of reference peaks not found in sample peaks (Reference peakfile: Dnase_ENCFF185XRG_reference)

if (needs_ref(params$chromHMM_plot, reference)){
  # Data frame of non-overlapping peaks 
  ref_not_in_sample_df <- overlap_percent(peaklist1 = reference_tidy,
                                          peaklist2 = peaklist_tidy,
                                          invert = TRUE)
  download_button(object = ref_not_in_sample_df,
                  filename = "ref_not_in_sample_df", 
                  output_extension = ".csv")
  knitr::kable(ref_not_in_sample_df, format = "markdown")
}
query subject overlap total Percentage type peaklist1 peaklist2
Dnase_ENCFF185XRG_reference ATAC_ENCFF558BLC 5588 16479 33.9 precision Dnase_ENCFF185XRG_reference ATAC_ENCFF558BLC
Dnase_ENCFF185XRG_reference Dnase_ENCFF274YGF 6289 16479 38.2 precision Dnase_ENCFF185XRG_reference Dnase_ENCFF274YGF
Dnase_ENCFF185XRG_reference ChIP_ENCFF038DDS 10441 16479 63.4 precision Dnase_ENCFF185XRG_reference ChIP_ENCFF038DDS
Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference 0 16479 0.0 precision Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference
ATAC_ENCFF558BLC Dnase_ENCFF185XRG_reference 3855 12459 30.9 recall Dnase_ENCFF185XRG_reference ATAC_ENCFF558BLC
Dnase_ENCFF274YGF Dnase_ENCFF185XRG_reference 2226 12452 17.9 recall Dnase_ENCFF185XRG_reference Dnase_ENCFF274YGF
ChIP_ENCFF038DDS Dnase_ENCFF185XRG_reference 1461 5221 28.0 recall Dnase_ENCFF185XRG_reference ChIP_ENCFF038DDS
Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference 0 16479 0.0 recall Dnase_ENCFF185XRG_reference Dnase_ENCFF185XRG_reference

ChromHMM annotation of reference peaks not found in sample peaks.

ref_not_in_sample_chromHMM <- NULL
if (needs_ref(params$chromHMM_plot, reference)){
  # Subset unique peaks 
  ref_not_in_sample_list <- mapply(peaklist_tidy, FUN = function(file){
    IRanges::subsetByOverlaps(x = reference_tidy[[1]], 
                              ranges = file, 
                              invert = TRUE)
    })
  # Run ChromHMM
  ref_not_in_sample_chromHMM<-plot_chromHMM(peaklist = ref_not_in_sample_list,
                                            chromHMM_annotation = chromHMM_list,
                                            genome_build = output_build,
                                            interact = params$interact)
  download_button(object = ref_not_in_sample_chromHMM,
                  filename = "ref_not_in_sample_chromHMM", 
                  self_contained = params$interact,add_download_button = params$add_download_button)  
  save_output(save_output = params$save_output,
              file = ref_not_in_sample_chromHMM,
              file_type = "ggplot",
              filename = "ref_not_in_sample_ChromHMM",
              outpath = outfile_dir,
              interactive = params$interact) 
}
ref_not_in_sample_chromHMM
remove(ref_not_in_sample_chromHMM)

3.2 Annotate Peaks

EpiCompare uses ChIPseeker::annotatePeak() to annotate peaks with the nearest gene and genomic regions where the peak is located. The peaks are annotated with genes taken from human genome annotations (hg19 or hg38) distributed by Bioconductor.

chipseeker_plot <- NULL
if(isTRUE(params$chipseeker_plot)){
  chipseeker_plot <- plot_ChIPseeker_annotation(
    peaklist = peaklist_tidy,
    txdb = txdb, 
    tss_distance = params$tss_distance,
    interact = params$interact)
  download_button(object = chipseeker_plot,
                  filename = "chipseeker_plot", 
                  self_contained = params$interact,add_download_button = params$add_download_button) 
  save_output(save_output = params$save_output,
              file = chipseeker_plot,
              file_type = "ggplot",
              filename = "chipseeker_annotation",
              outpath = outfile_dir, 
              interactive = params$interact) 
}
chipseeker_plot
remove(chipseeker_plot)

3.3 Functional Enrichment Analyses

EpiCompare performs KEGG pathway and GO enrichment analysis using clusterProfiler. ChIPseeker::annotatePeak() is first used to assign peaks to nearest genes. Biological themes amongst the genes are identified using ontologies (KEGG and GO). The peaks are annotated with genes taken from annotations of human genome (hg19 or hg38) provided by Bioconductor.

enrichment_plots <- NULL
if (isTRUE(params$enrichment_plot)){
  enrichment_plots <- plot_enrichment(peaklist = peaklist_tidy,
                                      txdb = txdb, 
                                      tss_distance = params$tss_distance,
                                      interact = params$interact) 
  # Figure height
  max_terms <- max(
    length(unique(enrichment_plots$kegg_plot$data$Description)),
    length(unique(enrichment_plots$go_plot$data$Description))
  )
  fig_height <- fig_length(default_size = 10,
                           number_of_items = max_terms,
                           max_items = 20)
}

3.3.1 KEGG

Kyoto Encyclopedia of Genes and Genomes (KEGG) enrichment results.

if (isTRUE(params$enrichment_plot)){
  download_button(object = enrichment_plots$kegg_plot,
                  filename = "KEGG_plot", 
                  self_contained = params$interact,add_download_button = params$add_download_button)  
  save_output(save_output = params$save_output,
              file = enrichment_plots$kegg_plot,
              file_type = "ggplot",
              filename = "KEGG_analysis",
              outpath = outfile_dir, 
              interactive = params$interact)
}
enrichment_plots$kegg_plot

3.3.2 GO

Gene Ontology (GO) enrichment results.

GeneRatio definition:

GeneRatio is the number of observed genes divided by the number of expected genes from each GO category. GO terms can be useful when assessing different biological themes present in each epigenomic dataset.
if (isTRUE(params$enrichment_plot)){
  download_button(object = enrichment_plots$go_plot,
                  filename = "GO_plot", 
                  self_contained = params$interact,add_download_button = params$add_download_button) 
  save_output(save_output = params$save_output,
              file = enrichment_plots$go_plot,
              file_type = "ggplot",
              filename = "GO_analysis",
              outpath = outfile_dir,
              interactive = params$interact)  
} 
enrichment_plots$go_plot
remove(enrichment_plots)

3.4 Peak Frequency around TSS

This plots peaks that are mapping to transcriptional start sites (TSS). TSS regions are defined as the flanking sequence of the TSS sites.

  • By default, this function plots the frequency of peaks upstream (-3000bp) and downstream (default: +3000bp) of TSS. These ranges can be adjusted with the argument EpCompare(tss_distance=).

  • The grey area around the main frequency line represents the 95% confidence interval estimated by bootstrapping.

tssplt <- NULL
if (isTRUE(params$tss_plot)){
  tssplt <- tss_plot(peaklist = peaklist_tidy,
                     txdb = txdb,
                     tss_distance = params$tss_distance,
                     workers = params$workers,
                     interact = params$interact)
  download_button(object = tssplt,
                  filename = "tss_plots", 
                  self_contained = params$interact,add_download_button = params$add_download_button)
}

Running bootstrapping for tag matrix… 2023-02-24 19:58:54 Running bootstrapping for tag matrix… 2023-02-24 19:59:25 Running bootstrapping for tag matrix… 2023-02-24 19:59:46 Running bootstrapping for tag matrix… 2023-02-24 20:00:20 NULL

tssplt
$ATAC_ENCFF558BLC
$Dnase_ENCFF274YGF
$ChIP_ENCFF038DDS
$Dnase_ENCFF185XRG_reference
remove(tssplt,p)

Citation

If you use EpiCompare, please cite:

cat(utils::citation("EpiCompare")$textVersion)

EpiCompare: R package for the comparison and quality control of epigenomic peak files (2022) Sera Choi, Brian M. Schilder, Leyla Abbasova, Alan E. Murphy, Nathan G. Skene, bioRxiv, 2022.07.22.501149; doi: https://doi.org/10.1101/2022.07.22.501149

Session Info

utils::sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Ventura 13.2.1
## 
## Matrix products: default
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] EpiCompare_1.3.4     rtracklayer_1.58.0   GenomicRanges_1.50.2 GenomeInfoDb_1.34.9  IRanges_2.32.0      
## [6] S4Vectors_0.36.1     BiocGenerics_0.44.0  PeakyFinders_0.99.3 
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3                           R.methodsS3_1.8.2                       
##   [3] tidyr_1.3.0                              ggplot2_3.4.1                           
##   [5] bit64_4.0.5                              knitr_1.42                              
##   [7] DelayedArray_0.24.0                      R.utils_2.12.2                          
##   [9] data.table_1.14.8                        KEGGREST_1.38.0                         
##  [11] RCurl_1.98-1.10                          GEOquery_2.66.0                         
##  [13] generics_0.1.3                           GenomicFeatures_1.50.4                  
##  [15] cowplot_1.1.1                            RSQLite_2.3.0                           
##  [17] shadowtext_0.1.2                         bit_4.0.5                               
##  [19] tzdb_0.3.0                               enrichplot_1.18.3                       
##  [21] webshot_0.5.4                            xml2_1.3.3                              
##  [23] lubridate_1.9.2                          httpuv_1.6.9                            
##  [25] SummarizedExperiment_1.28.0              assertthat_0.2.1                        
##  [27] viridis_0.6.2                            xfun_0.37                               
##  [29] hms_1.1.2                                jquerylib_0.1.4                         
##  [31] evaluate_0.20                            promises_1.2.0.1                        
##  [33] TSP_1.2-2                                fansi_1.0.4                             
##  [35] restfulr_0.0.15                          progress_1.2.2                          
##  [37] caTools_1.18.2                           dendextend_1.16.0                       
##  [39] dbplyr_2.3.0                             igraph_1.4.0                            
##  [41] DBI_1.1.3                                geneplotter_1.76.0                      
##  [43] htmlwidgets_1.6.1                        purrr_1.0.1                             
##  [45] ellipsis_0.3.2                           crosstalk_1.2.0                         
##  [47] dplyr_1.1.0                              annotate_1.76.0                         
##  [49] gridBase_0.4-7                           biomaRt_2.54.0                          
##  [51] MatrixGenerics_1.10.0                    vctrs_0.5.2                             
##  [53] Biobase_2.58.0                           cachem_1.0.6                            
##  [55] withr_2.5.0                              ggforce_0.4.1                           
##  [57] BSgenome.Hsapiens.UCSC.hg38_1.4.5        HDO.db_0.99.1                           
##  [59] BSgenome_1.66.3                          genomation_1.30.0                       
##  [61] vroom_1.6.1                              GenomicAlignments_1.34.0                
##  [63] treeio_1.22.0                            prettyunits_1.1.1                       
##  [65] DOSE_3.24.2                              ExperimentHub_2.6.0                     
##  [67] ape_5.7                                  dir.expiry_1.6.0                        
##  [69] lazyeval_0.2.2                           crayon_1.5.2                            
##  [71] basilisk.utils_1.10.0                    labeling_0.4.2                          
##  [73] pkgconfig_2.0.3                          tweenr_2.0.2                            
##  [75] nlme_3.1-162                             pkgload_1.3.2                           
##  [77] seriation_1.4.1                          rlang_1.0.6                             
##  [79] lifecycle_1.0.3                          downloader_0.4                          
##  [81] registry_0.5-1                           filelock_1.0.2                          
##  [83] BiocFileCache_2.6.1                      seqPattern_1.30.0                       
##  [85] AnnotationHub_3.6.0                      polyclip_1.10-4                         
##  [87] matrixStats_0.63.0                       Matrix_1.5-3                            
##  [89] aplot_0.1.9                              boot_1.3-28.1                           
##  [91] base64enc_0.1-3                          png_0.1-8                               
##  [93] viridisLite_0.4.1                        rjson_0.2.21                            
##  [95] ca_0.71.1                                bitops_1.0-7                            
##  [97] gson_0.0.9                               R.oo_1.25.0                             
##  [99] KernSmooth_2.23-20                       Biostrings_2.66.0                       
## [101] blob_1.2.3                               MACSr_1.6.0                             
## [103] stringr_1.5.0                            qvalue_2.30.0                           
## [105] regioneR_1.30.0                          readr_2.1.4                             
## [107] gridGraphics_0.5-1                       echodata_0.99.16                        
## [109] scales_1.2.1                             memoise_2.0.1                           
## [111] magrittr_2.0.3                           plyr_1.8.8                              
## [113] gplots_3.1.3                             zlibbioc_1.44.0                         
## [115] compiler_4.2.1                           scatterpie_0.1.8                        
## [117] echoconda_0.99.9                         TxDb.Hsapiens.UCSC.hg38.knownGene_3.16.0
## [119] BiocIO_1.8.0                             RColorBrewer_1.1-3                      
## [121] plotrix_3.8-2                            DESeq2_1.38.3                           
## [123] Rsamtools_2.14.0                         cli_3.6.0                               
## [125] XVector_0.38.0                           patchwork_1.1.2                         
## [127] MASS_7.3-58.2                            tidyselect_1.2.0                        
## [129] stringi_1.7.12                           highr_0.10                              
## [131] yaml_2.3.7                               GOSemSim_2.24.0                         
## [133] locfit_1.5-9.7                           ggrepel_0.9.3                           
## [135] grid_4.2.1                               sass_0.4.5                              
## [137] fastmatch_1.1-3                          tools_4.2.1                             
## [139] timechange_0.2.0                         parallel_4.2.1                          
## [141] rstudioapi_0.14                          foreach_1.5.2                           
## [143] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2  piggyback_0.1.4                         
## [145] gridExtra_2.3                            BRGenomics_1.10.0                       
## [147] farver_2.1.1                             ComplexUpset_1.3.3                      
## [149] ggraph_2.1.0                             digest_0.6.31                           
## [151] BiocManager_1.30.19                      shiny_1.7.4                             
## [153] Rcpp_1.0.10                              BiocVersion_3.16.0                      
## [155] later_1.3.0                              org.Hs.eg.db_3.16.0                     
## [157] httr_1.4.4                               AnnotationDbi_1.60.0                    
## [159] colorspace_2.1-0                         brio_1.1.3                              
## [161] XML_3.99-0.13                            fs_1.6.1                                
## [163] reticulate_1.28                          splines_4.2.1                           
## [165] yulab.utils_0.0.6                        tidytree_0.4.2                          
## [167] graphlayouts_0.8.4                       basilisk_1.10.2                         
## [169] ggplotify_0.1.0                          plotly_4.10.1                           
## [171] xtable_1.8-4                             jsonlite_1.8.4                          
## [173] ggtree_3.6.2                             downloadthis_0.3.2                      
## [175] heatmaply_1.4.2                          tidygraph_1.2.3                         
## [177] ggfun_0.0.9                              testthat_3.1.6                          
## [179] R6_2.5.1                                 pillar_1.8.1                            
## [181] htmltools_0.5.4                          mime_0.12                               
## [183] clusterProfiler_4.6.0                    glue_1.6.2                              
## [185] fastmap_1.1.0                            DT_0.27                                 
## [187] BiocParallel_1.32.5                      interactiveDisplayBase_1.36.0           
## [189] codetools_0.2-19                         ChIPseeker_1.34.1                       
## [191] fgsea_1.24.0                             utf8_1.2.3                              
## [193] lattice_0.20-45                          bslib_0.4.2                             
## [195] tibble_3.1.8                             curl_5.0.0                              
## [197] bsplus_0.1.4                             gtools_3.9.4                            
## [199] zip_2.2.2                                GO.db_3.16.0                            
## [201] openxlsx_4.2.5.2                         limma_3.54.1                            
## [203] rmarkdown_2.20.1                         munsell_0.5.0                           
## [205] GenomeInfoDbData_1.2.9                   iterators_1.0.14                        
## [207] impute_1.72.3                            reshape2_1.4.4                          
## [209] gtable_0.3.1