In brief, segmentation()
(i) identifies segments with homogeneous log2-FC
along the locus of interest and (ii) counts reads overlapping them.
In details, segmentation()
(i) starts by calling fpopw::Fpop_w()
on the
log2-FC profile of both strands to retreive the segments boundaries
(changepoints). The hyperparameter alpha
specified by the user controls
the number of returned segments. Corresponding genomic regions are stored as
a GenomicRanges object. (ii) Finally, segmentation()
calls
Rsubread::featureCounts()
which assigns to the regions the mapped reads
from each replicate of each biological condition. By default a read is
allowed to be assigned to more than one region if it is found to overlap
with more than one region. Rsubread::featureCounts()
ends by building a
count matrix. segmentation()
ends by returning the regions and associated
count matrix as SummarizedExperiment
object.
Usage
segmentation(
data,
weightType = "unweighted",
modelSelectionType = "yao",
featureCountsType = "fromBam",
compressed = TRUE,
alpha = 2,
segmentNeighborhood = FALSE,
Kmax = NULL,
verbose = FALSE,
nbThreadsGridSearch = 1,
alphas = NULL,
gridSearch = FALSE,
outputDirectory = ".",
nbThreadsFeatureCounts = 1,
strandSpecific = 1,
read2pos = NULL,
isPairedEnd = FALSE,
featureCountsOtherParams = list()
)
Arguments
- data
The
List
object returned byloadData()
.- weightType
A
String
. Select the type of weights associated to the log2-FC per-base:unweighted : all observations have the same weight (=1) ;
zeroInflated : low counts have less weight.
- modelSelectionType
A
String
. Select the penalty used by FPOP:yao : Yao's penalty
alpha*sigma^2*log(n)
.
- featureCountsType
A
String
. Select how to summarize counts:fromBam : from bam files using Rsubread::featureCounts ;
fromCoverage : from coverage profiles.
- compressed
A
Boolean
. Indicate if the observations have to be compressed (does not change the segmentation results and decreases the running time).- alpha
A
Double
. Segmentation hyperparameter used in Yao's penalty:alpha*sigma^2*log(n)
. The number of changepoints returned byfpopw::Fpop_w()
is a decreasing function ofalpha
.- segmentNeighborhood
A
Boolean
. Indicate the weighted pDPA algorithm (Rigaill 2010 and 2015) has to be used to explore the space of segmentations.- Kmax
An
Integer
. Segmentations with 1 to Kmax segments are recovered using segment neighborhood.- verbose
A
Boolean
. Should all the operations performed be displayed ?- nbThreadsGridSearch
An
Integer
. Number of threads used by the grid search procedure.- alphas
A vector of
Double
. A series of alphas used by the grid search procedure.- gridSearch
A
Boolean
. Indicate if a grid search has to be used to explore the space of segmentations.- outputDirectory
A
String
. Absolute path to the output directory.- nbThreadsFeatureCounts
An
Integer
. Passed on toRsubread::featureCounts()
.- strandSpecific
An
Integer
. Passed on toRsubread::featureCounts()
.- read2pos
An
Integer
. Passed on toRsubread::featureCounts()
.- isPairedEnd
Passed on to
Rsubread::featureCounts()
.- featureCountsOtherParams
A
List
. Other paramters passed on toRsubread::featureCounts()
.
Examples
# Create a working directory for running the example.
working_directory <- "./DIFFSEGR_TEST"
dir.create(working_directory)
# Save sample information in a text file.
sample_info <- data.frame(
sample = c("pnp1_1_1", "pnp1_1_2", "wt_1", "wt_2"),
condition = rep(c("pnp1_1", "wt"), each = 2),
replicate = rep(1:2,2),
bam = sapply(
c("pnp1_1_1_ChrC_71950_78500.bam",
"pnp1_1_2_ChrC_71950_78500.bam",
"wt_1_ChrC_71950_78500.bam",
"wt_2_ChrC_71950_78500.bam"
),
function(bam) system.file("extdata", bam, package = "DiffSegR")
),
coverage = file.path(
working_directory,
paste0(c("pnp1_1_1", "pnp1_1_2", "wt_1", "wt_2"), ".rds")
)
)
write.table(
sample_info,
file.path(working_directory, "sample_info.txt")
)
# Build coverages and log2-FC.
data <- loadData(
sampleInfo = file.path(working_directory,"sample_info.txt"),
locus = list(
seqid = "ChrC",
chromStart = 71950,
chromEnd = 78500
),
referenceCondition = "wt",
stranded = TRUE,
fromBam = TRUE,
nbThreads = 1
)
#> building coverage profiles ...
#>
#> ========== _____ _ _ ____ _____ ______ _____
#> ===== / ____| | | | _ \| __ \| ____| /\ | __ \
#> ===== | (___ | | | | |_) | |__) | |__ / \ | | | |
#> ==== \___ \| | | | _ <| _ /| __| / /\ \ | | | |
#> ==== ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
#> ========== |_____/ \____/|____/|_| \_\______/_/ \_\_____/
#> Rsubread 2.10.4
#>
#> //========================== featureCounts setting ===========================\\
#> || ||
#> || Input files : 4 BAM files ||
#> || ||
#> || pnp1_1_1_ChrC_71950_78500.bam ||
#> || pnp1_1_2_ChrC_71950_78500.bam ||
#> || wt_1_ChrC_71950_78500.bam ||
#> || wt_2_ChrC_71950_78500.bam ||
#> || ||
#> || Paired-end : no ||
#> || Count read pairs : no ||
#> || Annotation : R data.frame ||
#> || Dir for temp files : . ||
#> || Threads : 1 ||
#> || Level : meta-feature level ||
#> || Multimapping reads : counted ||
#> || Multi-overlapping reads : not counted ||
#> || Min overlapping bases : 1 ||
#> || Read reduction : to 3' end ||
#> || ||
#> \\============================================================================//
#>
#> //================================= Running ==================================\\
#> || ||
#> || Load annotation file .Rsubread_UserProvidedAnnotation_pid49270 ... ||
#> || Features : 13102 ||
#> || Meta-features : 13102 ||
#> || Chromosomes/contigs : 1 ||
#> || ||
#> || Process BAM file pnp1_1_1_ChrC_71950_78500.bam... ||
#> || Strand specific : stranded ||
#> || Single-end reads are included. ||
#> || Total alignments : 24260 ||
#> || Successfully assigned alignments : 24249 (100.0%) ||
#> || Running time : 0.00 minutes ||
#> || ||
#> || Process BAM file pnp1_1_2_ChrC_71950_78500.bam... ||
#> || Strand specific : stranded ||
#> || Single-end reads are included. ||
#> || Total alignments : 21100 ||
#> || Successfully assigned alignments : 21098 (100.0%) ||
#> || Running time : 0.00 minutes ||
#> || ||
#> || Process BAM file wt_1_ChrC_71950_78500.bam... ||
#> || Strand specific : stranded ||
#> || Single-end reads are included. ||
#> || Total alignments : 17533 ||
#> || Successfully assigned alignments : 17531 (100.0%) ||
#> || Running time : 0.00 minutes ||
#> || ||
#> || Process BAM file wt_2_ChrC_71950_78500.bam... ||
#> || Strand specific : stranded ||
#> || Single-end reads are included. ||
#> || Total alignments : 14070 ||
#> || Successfully assigned alignments : 14066 (100.0%) ||
#> || Running time : 0.00 minutes ||
#> || ||
#> || Write the final count table. ||
#> || Write the read assignment summary. ||
#> || ||
#> \\============================================================================//
#>
#>
#> ========== _____ _ _ ____ _____ ______ _____
#> ===== / ____| | | | _ \| __ \| ____| /\ | __ \
#> ===== | (___ | | | | |_) | |__) | |__ / \ | | | |
#> ==== \___ \| | | | _ <| _ /| __| / /\ \ | | | |
#> ==== ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
#> ========== |_____/ \____/|____/|_| \_\______/_/ \_\_____/
#> Rsubread 2.10.4
#>
#> //========================== featureCounts setting ===========================\\
#> || ||
#> || Input files : 4 BAM files ||
#> || ||
#> || pnp1_1_1_ChrC_71950_78500.bam ||
#> || pnp1_1_2_ChrC_71950_78500.bam ||
#> || wt_1_ChrC_71950_78500.bam ||
#> || wt_2_ChrC_71950_78500.bam ||
#> || ||
#> || Paired-end : no ||
#> || Count read pairs : no ||
#> || Annotation : R data.frame ||
#> || Dir for temp files : . ||
#> || Threads : 1 ||
#> || Level : meta-feature level ||
#> || Multimapping reads : counted ||
#> || Multi-overlapping reads : not counted ||
#> || Min overlapping bases : 1 ||
#> || Read reduction : to 5' end ||
#> || ||
#> \\============================================================================//
#>
#> //================================= Running ==================================\\
#> || ||
#> || Load annotation file .Rsubread_UserProvidedAnnotation_pid49270 ... ||
#> || Features : 13102 ||
#> || Meta-features : 13102 ||
#> || Chromosomes/contigs : 1 ||
#> || ||
#> || Process BAM file pnp1_1_1_ChrC_71950_78500.bam... ||
#> || Strand specific : stranded ||
#> || Single-end reads are included. ||
#> || Total alignments : 24260 ||
#> || Successfully assigned alignments : 24251 (100.0%) ||
#> || Running time : 0.00 minutes ||
#> || ||
#> || Process BAM file pnp1_1_2_ChrC_71950_78500.bam... ||
#> || Strand specific : stranded ||
#> || Single-end reads are included. ||
#> || Total alignments : 21100 ||
#> || Successfully assigned alignments : 21088 (99.9%) ||
#> || Running time : 0.00 minutes ||
#> || ||
#> || Process BAM file wt_1_ChrC_71950_78500.bam... ||
#> || Strand specific : stranded ||
#> || Single-end reads are included. ||
#> || Total alignments : 17533 ||
#> || Successfully assigned alignments : 17520 (99.9%) ||
#> || Running time : 0.00 minutes ||
#> || ||
#> || Process BAM file wt_2_ChrC_71950_78500.bam... ||
#> || Strand specific : stranded ||
#> || Single-end reads are included. ||
#> || Total alignments : 14070 ||
#> || Successfully assigned alignments : 14060 (99.9%) ||
#> || Running time : 0.00 minutes ||
#> || ||
#> || Write the final count table. ||
#> || Write the read assignment summary. ||
#> || ||
#> \\============================================================================//
#>
#> building log2-FC per-base ...
#> finished in 0.355238199234009 secs
# Summarize the differential landscape.
SExp <- segmentation(
data = data,
nbThreadsFeatureCounts = 1,
outputDirectory = working_directory
)
#> changepoint detection on ChrC 71950 78500 plus ...
#> > model selection ...
#> > report available in ./DIFFSEGR_TEST/all_models_plus.pdf
#> changepoint detection on ChrC 71950 78500 minus ...
#> > model selection ...
#> > report available in ./DIFFSEGR_TEST/all_models_minus.pdf
#> finished in 0.153049468994141 secs
#> counting ...
#>
#> ========== _____ _ _ ____ _____ ______ _____
#> ===== / ____| | | | _ \| __ \| ____| /\ | __ \
#> ===== | (___ | | | | |_) | |__) | |__ / \ | | | |
#> ==== \___ \| | | | _ <| _ /| __| / /\ \ | | | |
#> ==== ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
#> ========== |_____/ \____/|____/|_| \_\______/_/ \_\_____/
#> Rsubread 2.10.4
#>
#> //========================== featureCounts setting ===========================\\
#> || ||
#> || Input files : 4 BAM files ||
#> || ||
#> || pnp1_1_1_ChrC_71950_78500.bam ||
#> || pnp1_1_2_ChrC_71950_78500.bam ||
#> || wt_1_ChrC_71950_78500.bam ||
#> || wt_2_ChrC_71950_78500.bam ||
#> || ||
#> || Paired-end : no ||
#> || Count read pairs : no ||
#> || Annotation : R data.frame ||
#> || Dir for temp files : . ||
#> || Threads : 1 ||
#> || Level : meta-feature level ||
#> || Multimapping reads : counted ||
#> || Multi-overlapping reads : counted ||
#> || Min overlapping bases : 1 ||
#> || ||
#> \\============================================================================//
#>
#> //================================= Running ==================================\\
#> || ||
#> || Load annotation file .Rsubread_UserProvidedAnnotation_pid49270 ... ||
#> || Features : 69 ||
#> || Meta-features : 69 ||
#> || Chromosomes/contigs : 1 ||
#> || ||
#> || Process BAM file pnp1_1_1_ChrC_71950_78500.bam... ||
#> || Strand specific : stranded ||
#> || Single-end reads are included. ||
#> || Total alignments : 24260 ||
#> || Successfully assigned alignments : 24260 (100.0%) ||
#> || Running time : 0.00 minutes ||
#> || ||
#> || Process BAM file pnp1_1_2_ChrC_71950_78500.bam... ||
#> || Strand specific : stranded ||
#> || Single-end reads are included. ||
#> || Total alignments : 21100 ||
#> || Successfully assigned alignments : 21100 (100.0%) ||
#> || Running time : 0.00 minutes ||
#> || ||
#> || Process BAM file wt_1_ChrC_71950_78500.bam... ||
#> || Strand specific : stranded ||
#> || Single-end reads are included. ||
#> || Total alignments : 17533 ||
#> || Successfully assigned alignments : 17533 (100.0%) ||
#> || Running time : 0.00 minutes ||
#> || ||
#> || Process BAM file wt_2_ChrC_71950_78500.bam... ||
#> || Strand specific : stranded ||
#> || Single-end reads are included. ||
#> || Total alignments : 14070 ||
#> || Successfully assigned alignments : 14070 (100.0%) ||
#> || Running time : 0.00 minutes ||
#> || ||
#> || Write the final count table. ||
#> || Write the read assignment summary. ||
#> || ||
#> \\============================================================================//
#>
#> finished in 0.125571012496948 secs
# In genomic order, first to fifth regions and ...
print(SummarizedExperiment::mcols(SExp)[1:5,])
#> DataFrame with 5 rows and 4 columns
#> modelStart modelEnd modelMean featureId
#> <numeric> <numeric> <numeric> <character>
#> ChrC_71950_72255_plus 1 306 0.0257894 ChrC_71950_72255_plus
#> ChrC_72256_72402_plus 307 453 0.6352520 ChrC_72256_72402_plus
#> ChrC_72403_74038_plus 454 2089 0.2826332 ChrC_72403_74038_plus
#> ChrC_74039_74305_plus 2090 2356 0.0910074 ChrC_74039_74305_plus
#> ChrC_74306_74841_plus 2357 2892 0.3207993 ChrC_74306_74841_plus
# ... associated counts.
print(SummarizedExperiment::assay(SExp)[1:5,])
#> pnp1_1_1 pnp1_1_2 wt_1 wt_2
#> ChrC_71950_72255_plus 143 139 84 84
#> ChrC_72256_72402_plus 649 665 326 293
#> ChrC_72403_74038_plus 13203 11731 10849 8366
#> ChrC_74039_74305_plus 155 132 98 104
#> ChrC_74306_74841_plus 2337 1996 1589 1218
# delete working directory
unlink(working_directory, recursive = TRUE)