The Spectra class to manage and access MS data
Source:R/Spectra-functions.R
, R/Spectra.R
Spectra.Rd
The Spectra
class encapsules spectral mass spectrometry data and
related metadata.
It supports multiple data backends, e.g. in-memory (MsBackendMemory,
MsBackendDataFrame()
), on-disk as mzML (MsBackendMzR()
) or HDF5
(MsBackendHdf5Peaks()
).
Usage
applyProcessing(
object,
f = processingChunkFactor(object),
BPPARAM = bpparam(),
...
)
concatenateSpectra(x, ...)
combineSpectra(
x,
f = x$dataStorage,
p = x$dataStorage,
FUN = combinePeaksData,
...,
BPPARAM = bpparam()
)
joinSpectraData(x, y, by.x = "spectrumId", by.y, suffix.y = ".y")
processingLog(x)
deisotopeSpectra(
x,
substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL"),
tolerance = 0,
ppm = 20,
charge = 1
)
reduceSpectra(x, tolerance = 0, ppm = 20)
filterPrecursorMaxIntensity(x, tolerance = 0, ppm = 20)
filterPrecursorIsotopes(
x,
tolerance = 0,
ppm = 20,
substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL")
)
scalePeaks(x, by = sum, msLevel. = uniqueMsLevels(x))
filterPrecursorPeaks(
object,
tolerance = 0,
ppm = 20,
mz = c("==", ">="),
msLevel. = uniqueMsLevels(object)
)
# S4 method for class 'missing'
Spectra(
object,
processingQueue = list(),
metadata = list(),
...,
backend = MsBackendMemory(),
BPPARAM = bpparam()
)
# S4 method for class 'MsBackend'
Spectra(
object,
processingQueue = list(),
metadata = list(),
...,
BPPARAM = bpparam()
)
# S4 method for class 'character'
Spectra(
object,
processingQueue = list(),
metadata = list(),
source = MsBackendMzR(),
backend = source,
...,
BPPARAM = bpparam()
)
# S4 method for class 'ANY'
Spectra(
object,
processingQueue = list(),
metadata = list(),
source = MsBackendMemory(),
backend = source,
...,
BPPARAM = bpparam()
)
# S4 method for class 'Spectra,MsBackend'
setBackend(
object,
backend,
f = processingChunkFactor(object),
...,
BPPARAM = bpparam()
)
# S4 method for class 'Spectra'
c(x, ...)
# S4 method for class 'Spectra,ANY'
split(x, f, drop = FALSE, ...)
# S4 method for class 'Spectra'
export(object, backend, ...)
# S4 method for class 'Spectra'
acquisitionNum(object)
# S4 method for class 'Spectra'
peaksData(
object,
columns = c("mz", "intensity"),
f = processingChunkFactor(object),
...,
BPPARAM = bpparam()
)
# S4 method for class 'Spectra'
peaksVariables(object)
# S4 method for class 'Spectra'
centroided(object)
# S4 method for class 'Spectra'
centroided(object) <- value
# S4 method for class 'Spectra'
collisionEnergy(object)
# S4 method for class 'Spectra'
collisionEnergy(object) <- value
# S4 method for class 'Spectra'
dataOrigin(object)
# S4 method for class 'Spectra'
dataOrigin(object) <- value
# S4 method for class 'Spectra'
dataStorage(object)
# S4 method for class 'Spectra'
dropNaSpectraVariables(object)
# S4 method for class 'Spectra'
intensity(object, f = processingChunkFactor(object), ...)
# S4 method for class 'Spectra'
ionCount(object)
# S4 method for class 'Spectra'
isCentroided(object, ...)
# S4 method for class 'Spectra'
isEmpty(x)
# S4 method for class 'Spectra'
isolationWindowLowerMz(object)
# S4 method for class 'Spectra'
isolationWindowLowerMz(object) <- value
# S4 method for class 'Spectra'
isolationWindowTargetMz(object)
# S4 method for class 'Spectra'
isolationWindowTargetMz(object) <- value
# S4 method for class 'Spectra'
isolationWindowUpperMz(object)
# S4 method for class 'Spectra'
isolationWindowUpperMz(object) <- value
# S4 method for class 'Spectra'
containsMz(
object,
mz = numeric(),
tolerance = 0,
ppm = 20,
which = c("any", "all"),
BPPARAM = bpparam()
)
# S4 method for class 'Spectra'
containsNeutralLoss(
object,
neutralLoss = 0,
tolerance = 0,
ppm = 20,
BPPARAM = bpparam()
)
# S4 method for class 'Spectra'
spectrapply(
object,
FUN,
...,
chunkSize = integer(),
f = factor(),
BPPARAM = SerialParam()
)
# S4 method for class 'Spectra'
length(x)
# S4 method for class 'Spectra'
msLevel(object)
# S4 method for class 'Spectra'
mz(object, f = processingChunkFactor(object), ...)
# S4 method for class 'Spectra'
lengths(x, use.names = FALSE)
# S4 method for class 'Spectra'
polarity(object)
# S4 method for class 'Spectra'
polarity(object) <- value
# S4 method for class 'Spectra'
precScanNum(object)
# S4 method for class 'Spectra'
precursorCharge(object)
# S4 method for class 'Spectra'
precursorIntensity(object)
# S4 method for class 'Spectra'
precursorMz(object)
# S4 method for class 'Spectra'
rtime(object)
# S4 method for class 'Spectra'
rtime(object) <- value
# S4 method for class 'Spectra'
scanIndex(object)
# S4 method for class 'Spectra'
selectSpectraVariables(
object,
spectraVariables = union(spectraVariables(object), peaksVariables(object))
)
# S4 method for class 'Spectra'
smoothed(object)
# S4 method for class 'Spectra'
smoothed(object) <- value
# S4 method for class 'Spectra'
spectraData(object, columns = spectraVariables(object))
# S4 method for class 'Spectra'
spectraData(object) <- value
# S4 method for class 'Spectra'
spectraNames(object)
# S4 method for class 'Spectra'
spectraNames(object) <- value
# S4 method for class 'Spectra'
spectraVariables(object)
# S4 method for class 'Spectra'
tic(object, initial = TRUE)
# S4 method for class 'Spectra'
x$name
# S4 method for class 'Spectra'
x$name <- value
# S4 method for class 'Spectra'
x[[i, j, ...]]
# S4 method for class 'Spectra'
x[[i, j, ...]] <- value
# S4 method for class 'Spectra'
x[i, j, ..., drop = FALSE]
# S4 method for class 'Spectra'
filterAcquisitionNum(
object,
n = integer(),
dataStorage = character(),
dataOrigin = character()
)
# S4 method for class 'Spectra'
filterEmptySpectra(object)
# S4 method for class 'Spectra'
filterDataOrigin(object, dataOrigin = character())
# S4 method for class 'Spectra'
filterDataStorage(object, dataStorage = character())
# S4 method for class 'Spectra'
filterFourierTransformArtefacts(
object,
halfWindowSize = 0.05,
threshold = 0.2,
keepIsotopes = TRUE,
maxCharge = 5,
isotopeTolerance = 0.005
)
# S4 method for class 'Spectra'
filterIntensity(
object,
intensity = c(0, Inf),
msLevel. = uniqueMsLevels(object),
...
)
# S4 method for class 'Spectra'
filterIsolationWindow(object, mz = numeric())
# S4 method for class 'Spectra'
filterMsLevel(object, msLevel. = integer())
# S4 method for class 'Spectra'
filterMzRange(
object,
mz = numeric(),
msLevel. = uniqueMsLevels(object),
keep = TRUE
)
# S4 method for class 'Spectra'
filterMzValues(
object,
mz = numeric(),
tolerance = 0,
ppm = 20,
msLevel. = uniqueMsLevels(object),
keep = TRUE
)
# S4 method for class 'Spectra'
filterPolarity(object, polarity = integer())
# S4 method for class 'Spectra'
filterPrecursorMz(object, mz = numeric())
# S4 method for class 'Spectra'
filterPrecursorMzRange(object, mz = numeric())
# S4 method for class 'Spectra'
filterPrecursorMzValues(object, mz = numeric(), ppm = 20, tolerance = 0)
# S4 method for class 'Spectra'
filterPrecursorCharge(object, z = integer())
# S4 method for class 'Spectra'
filterPrecursorScan(object, acquisitionNum = integer(), f = dataOrigin(object))
# S4 method for class 'Spectra'
filterRt(object, rt = numeric(), msLevel. = uniqueMsLevels(object))
# S4 method for class 'Spectra'
reset(object, ...)
# S4 method for class 'Spectra'
filterRanges(
object,
spectraVariables = character(),
ranges = numeric(),
match = c("all", "any")
)
# S4 method for class 'Spectra'
filterValues(
object,
spectraVariables = character(),
values = numeric(),
ppm = 0,
tolerance = 0,
match = c("all", "any")
)
# S4 method for class 'Spectra'
bin(
x,
binSize = 1L,
breaks = NULL,
msLevel. = uniqueMsLevels(x),
FUN = sum,
zero.rm = TRUE
)
# S4 method for class 'Spectra,Spectra'
compareSpectra(
x,
y,
MAPFUN = joinPeaks,
tolerance = 0,
ppm = 20,
FUN = ndotproduct,
...,
SIMPLIFY = TRUE
)
# S4 method for class 'Spectra,missing'
compareSpectra(
x,
y = NULL,
MAPFUN = joinPeaks,
tolerance = 0,
ppm = 20,
FUN = ndotproduct,
...,
SIMPLIFY = TRUE
)
# S4 method for class 'Spectra'
pickPeaks(
object,
halfWindowSize = 2L,
method = c("MAD", "SuperSmoother"),
snr = 0,
k = 0L,
descending = FALSE,
threshold = 0,
msLevel. = uniqueMsLevels(object),
...
)
# S4 method for class 'Spectra'
replaceIntensitiesBelow(
object,
threshold = min,
value = 0,
msLevel. = uniqueMsLevels(object)
)
# S4 method for class 'Spectra'
smooth(
x,
halfWindowSize = 2L,
method = c("MovingAverage", "WeightedMovingAverage", "SavitzkyGolay"),
msLevel. = uniqueMsLevels(x),
...
)
# S4 method for class 'Spectra'
addProcessing(object, FUN, ..., spectraVariables = character())
coreSpectraVariables()
# S4 method for class 'Spectra'
uniqueMsLevels(object, ...)
# S4 method for class 'Spectra'
backendBpparam(object, BPPARAM = bpparam())
# S4 method for class 'Spectra'
combinePeaks(
object,
tolerance = 0,
ppm = 20,
intensityFun = base::mean,
mzFun = base::mean,
weighted = TRUE,
msLevel. = uniqueMsLevels(object),
...
)
# S4 method for class 'Spectra'
entropy(object, normalized = TRUE)
# S4 method for class 'ANY'
entropy(object, ...)
# S4 method for class 'Spectra'
dataStorageBasePath(object)
# S4 method for class 'Spectra'
dataStorageBasePath(object) <- value
asDataFrame(
object,
i = seq_along(object),
spectraVars = spectraVariables(object)
)
Arguments
- object
For
Spectra()
: either aDataFrame
ormissing
. See section on creation ofSpectra
objects for details. For all other methods aSpectra
object.- f
For
split()
: factor defining how to splitx
. Seebase::split()
for details. ForsetBackend()
: factor defining how to split the data for parallelized copying of the spectra data to the new backend. For some backends changing this parameter can lead to errors. ForcombineSpectra()
:factor
defining the grouping of the spectra that should be combined. Forspectrapply()
:factor
howobject
should be splitted. ForfilterPrecursorScan()
: defining which spectra belong to the same original data file (sample): Defaults tof = dataOrigin(x)
. Forintensity()
,mz()
andpeaksData()
: factor defining how data should be chunk-wise loaded an processed. Defaults toprocessingChunkFactor()
.- BPPARAM
Parallel setup configuration. See
bpparam()
for more information. This is passed directly to thebackendInitialize()
method of the MsBackend.- ...
Additional arguments.
- x
A
Spectra
object.- p
For
combineSpectra()
:factor
defining how to split the inputSpectra
for parallel processing. Defaults tox$dataStorage
, i.e., depending on the used backend, per-file parallel processing will be performed.- FUN
For
addProcessing()
: function to be applied to the peak matrix of each spectrum inobject
. ForcompareSpectra()
: function to compare intensities of peaks between two spectra with each other. ForcombineSpectra()
: function to combine the (peak matrices) of the spectra. See section Data manipulations and examples below for more details. Forbin()
: function to aggregate intensity values of peaks falling into the same bin. Defaults toFUN = sum
thus summing up intensities. Forspectrapply()
andchunkapply()
: function to be applied toSpectra
.- y
A
Spectra
object. ADataFrame
forjoinSpectraData()
.- by.x
A
character(1)
specifying the spectra variable used for merging. Default is"spectrumId"
.- by.y
A
character(1)
specifying the column used for merging. Set toby.x
if missing.- suffix.y
A
character(1)
specifying the suffix to be used for making the names of columns in the merged spectra variables unique. This suffix will be used to amendnames(y)
, whilespectraVariables(x)
will remain unchanged.- substDefinition
For
deisotopeSpectra()
andfilterPrecursorIsotopes()
:matrix
ordata.frame
with definitions of isotopic substitutions. Uses by default isotopic substitutions defined from all compounds in the Human Metabolome Database (HMDB). Seeisotopologues()
orisotopicSubstitutionMatrix()
for details.- tolerance
For
compareSpectra()
,containsMz()
,deisotopeSpectra()
,filterMzValues()
andreduceSpectra()
:numeric(1)
allowing to define a constant maximal accepted difference between m/z values for peaks to be matched (or grouped). ForcontainsMz()
it can also be of length equalmz
to specify a different tolerance for each m/z value. ForfilterPrecursorMaxIntensity()
:numeric(1)
defining the (constant) maximal accepted difference of precursor m/z values of spectra for grouping them into precursor groups. ForfilterPrecursorIsotopes()
: passed directly to theisotopologues()
function. ForfilterValues()
:numeric
of any length allowing to define a maximal accepted difference between user inputvalues
and thespectraVariables
values. If it is not equal to the length of the value provided with parameterspectraVariables
,tolerance[1]
will be recycled. Default istolerance = 0
- ppm
For
compareSpectra()
,containsMz()
,deisotopeSpectra()
,filterMzValues()
andreduceSpectra()
:numeric(1)
defining a relative, m/z-dependent, maximal accepted difference between m/z values for peaks to be matched (or grouped). ForfilterPrecursorMaxIntensity()
:numeric(1)
defining the relative maximal accepted difference of precursor m/z values of spectra for grouping them into precursor groups. ForfilterPrecursorIsotopes()
: passed directly to theisotopologues()
function. ForfilterValues()
:numeric
of any length allowing to define a maximal accepted difference between user inputvalues
and thespectraVariables
values. If it is not equal to the length of the value provided with parameterspectraVariables
,ppm[1]
will be recycled.- charge
For
deisotopeSpectra()
: expected charge of the ionized compounds. Seeisotopologues()
for details.- by
For
scalePeaks()
: function to calculate a singlenumeric
from intensity values of a spectrum by which all intensities (of that spectrum) should be divided by. The defaultby = sum
will divide intensities of each spectrum by the sum of intensities of that spectrum.- msLevel.
integer
defining the MS level(s) of the spectra to which the function should be applied (defaults to all MS levels ofobject
. ForfilterMsLevel()
: the MS level to whichobject
should be subsetted.- mz
For
filterIsolationWindow()
:numeric(1)
with the m/z value to filter the object. ForfilterPrecursorMz()
andfilterMzRange()
:numeric(2)
defining the lower and upper m/z boundary. ForfilterMzValues()
andfilterPrecursorMzValues()
:numeric
with the m/z values to match peaks or precursor m/z against.- processingQueue
For
Spectra()
: optionallist
of ProcessingStep objects.- metadata
For
Spectra()
: optionallist
with metadata information.- backend
For
Spectra()
: MsBackend to be used as backend. See section on creation ofSpectra
objects for details. ForsetBackend()
: instance of MsBackend that supportssetBackend()
(i.e. for whichsupportsSetBackend()
returnsTRUE
). Such backends have a parameterdata
in theirbackendInitialize()
function that support passing the full spectra data to the initialize method. See section on creation ofSpectra
objects for details. Forexport()
: MsBackend to be used to export the data.- source
For
Spectra()
: instance of MsBackend that can be used to import spectrum data from the provided files. See section Creation of objects, conversion and changing the backend for more details.- drop
For
[
,split()
: not considered.- columns
For
spectraData()
accessor: optionalcharacter
with column names (spectra variables) that should be included in the returnedDataFrame
. By default, all columns are returned. ForpeaksData()
accessor: optionalcharacter
with requested columns in the individualmatrix
of the returnedlist
. Defaults toc("mz", "value")
but any values returned bypeaksVariables(object)
withobject
being theSpectra
object are supported.- value
replacement value for
<-
methods. See individual method description or expected data type.- which
for
containsMz()
: either"any"
or"all"
defining whether any (the default) or all providedmz
have to be present in the spectrum.- neutralLoss
for
containsNeutralLoss()
:numeric(1)
defining the value which should be subtracted from the spectrum's precursor m/z.- chunkSize
For
spectrapply()
: size of the chunks into whichSpectra
should be split. This parameter overrides parametersf
andBPPARAM
.- use.names
For
lengths()
: ignored.- spectraVariables
For
selectSpectraVariables()
:character
with the names of the spectra variables to which the backend should be subsetted.For
addProcessing()
:character
with additional spectra variables that should be passed along to the function defined withFUN
. See function description for details.For
filterRanges()
andfilterValues()
:character
vector specifying the column(s) fromspectraData(object)
on which to filter the data and that correspond to the the names of the spectra variables that should be used for the filtering.
- initial
For
tic()
:logical(1)
whether the initially reported total ion current should be reported, or whether the total ion current should be (re)calculated on the actual data (initial = FALSE
, same asionCount()
).- name
For
$
and$<-
: the name of the spectra variable to return or set.- i
For
[
:integer
,logical
orcharacter
to subset the object. ForasDataFrame()
annumeric
indicating which scans to coerce to aDataFrame
(default isseq_along(object)
).- j
For
[
: not supported.- n
for
filterAcquisitionNum()
:integer
with the acquisition numbers to filter for.- dataStorage
For
filterDataStorage()
:character
to define which spectra to keep. ForfilterAcquisitionNum()
: optionally specify if filtering should occur only for spectra of selecteddataStorage
.- dataOrigin
For
filterDataOrigin()
:character
to define which spectra to keep. ForfilterAcquisitionNum()
: optionally specify if filtering should occurr only for spectra of selecteddataOrigin
.- halfWindowSize
For
pickPeaks()
:integer(1)
, used in the identification of the mass peaks: a local maximum has to be the maximum in the window from(i - halfWindowSize):(i + halfWindowSize)
.For
smooth()
:integer(1)
, used in the smoothing algorithm, the window reaches from(i - halfWindowSize):(i + halfWindowSize)
.For
filterFourierTransformArtefacts()
:numeric(1)
defining the m/z window left and right of a peak where to remove fourier transform artefacts.
- threshold
For
pickPeaks()
: adouble(1)
defining the proportion of the maximal peak intensity. Just values above are used for the weighted mean calculation.For
replaceIntensitiesBelow()
: anumeric(1)
defining the threshold or afunction
to calculate the threshold for each spectrum on its intensity values. Defaults tothreshold = min
.For
filterFourierTransformArtefacts()
: the relative intensity (to a peak) below which peaks are considered fourier artefacts. Defaults tothreshold = 0.2
hence removing peaks that have an intensity below 0.2 times the intensity of the tested peak (within the selectedhalfWindowSize
).
- keepIsotopes
For
filterFourierTransformArtefacts()
: whether isotope peaks should not be removed as fourier artefacts.- maxCharge
For
filterFourierTransformArtefacts()
: the maximum charge to be considered for isotopes.- isotopeTolerance
For
filterFourierTransformArtefacts()
: the m/ztolerance
to be used to define whether peaks might be isotopes of the current tested peak.- intensity
For
filterIntensity()
:numeric
of length 1 or 2 defining either the lower or the lower and upper intensity limit for the filtering, or afunction
that takes the intensities as input and returns alogical
(same length then peaks in the spectrum) whether the peak should be retained or not. Defaults tointensity = c(0, Inf)
thus only peaks withNA
intensity are removed.- keep
For
filterMzValues()
andfilterMzRange()
:logical(1)
whether the matching peaks should be retained (keep = TRUE
, the default) or dropped (keep = FALSE
).- polarity
for
filterPolarity()
:integer
specifying the polarity to to subsetobject
.- z
For
filterPrecursorCharge()
:integer()
with the precursor charges to be used as filter.- acquisitionNum
for
filterPrecursorScan()
:integer
with the acquisition number of the spectra to which the object should be subsetted.- rt
for
filterRt()
:numeric(2)
defining the retention time range to be used to subset/filterobject
.- ranges
for
filterRanges()
: Anumeric
vector of paired values (upper and lower boundary) that define the ranges to filter theobject
. These paired values need to be in the same order as thespectraVariables
parameter (see below).- match
For
filterRanges()
andfilterValues()
:character(1)
defining whether the condition has to match for all providedranges
/values
(match = "all"
; the default), or for any of them (match = "any"
) for spectra to be retained.- values
for
filterValues()
: Anumeric
vector that define the values to filter the Spectra data. These values need to be in the same order as thespectraVariables
parameter.- binSize
For
bin()
:numeric(1)
defining the size for the m/z bins. Defaults tobinSize = 1
.- breaks
For
bin()
:numeric
defining the m/z breakpoints between bins.- zero.rm
logical
. Forbin()
: indicating whether to remove bins with zero intensity. Defaults toTRUE
, meaning the function will discard bins created with an intensity of 0 to enhance memory efficiency.- MAPFUN
For
compareSpectra()
: function to map/match peaks between the two compared spectra. SeejoinPeaks()
for more information and possible functions.- SIMPLIFY
For
compareSpectra()
whether the result matrix should be simplified to anumeric
if possible (i.e. if eitherx
ory
is of length 1).- method
For
pickPeaks()
:character(1)
, the noise estimators that should be used, currently the the Median Absolute Deviation (method = "MAD"
) and Friedman's Super Smoother (method = "SuperSmoother"
) are supported.For
smooth()
:character(1)
, the smoothing function that should be used, currently, the Moving-Average- (method = "MovingAverage"
), Weighted-Moving-Average- (method = "WeightedMovingAverage")
, Savitzky-Golay-Smoothing (method = "SavitzkyGolay"
) are supported.
- snr
For
pickPeaks()
:double(1)
defining the Signal-to-Noise-Ratio. The intensity of a local maximum has to be higher thansnr * noise
to be considered as peak.- k
For
pickPeaks()
:integer(1)
, number of values left and right of the peak that should be considered in the weighted mean calculation.- descending
For
pickPeaks()
:logical
, ifTRUE
just values between the nearest valleys around the peak centroids are used.- intensityFun
For
combinePeaks()
: function to be used to aggregate intensities for all peaks in each peak group into a single intensity value.- mzFun
For
combinePeaks()
: function to aggregate m/z values for all peaks within each peak group into a single m/z value. This parameter is ignored ifweighted = TRUE
(the default).- weighted
For
combinePeaks()
:logical(1)
whether m/z values of peaks within each peak group should be aggregated into a single m/z value using an intensity-weighted mean. Defaults toweighted = TRUE
.- normalized
for
entropy()
:logical(1)
whether the normalized entropy should be calculated (default). See alsonentropy()
for details.- spectraVars
character()
indicating what spectra variables to add to theDataFrame
. Default isspectraVariables(object)
, i.e. all available variables.
Details
The Spectra
class uses by default a lazy data manipulation strategy,
i.e. data manipulations such as performed with replaceIntensitiesBelow()
are not applied immediately to the data, but applied on-the-fly to the
spectrum data once it is retrieved. For some backends that allow to write
data back to the data storage (such as the MsBackendMemory()
,
MsBackendDataFrame()
and MsBackendHdf5Peaks()
) it is possible to apply
to queue with the applyProcessing
function. See the *Data manipulation and
analysis methods section below for more details.
For more information on parallel or chunk-wise processing (especially
helpful for very large data sets) see processingChunkSize()
.
To apply arbitrary functions to a Spectra
use the spectrapply()
function
(or directly chunkapply()
for chunk-wise processing). See description of
the spectrapply()
function below for details.
For details on plotting spectra, see plotSpectra()
.
Clarifications regarding scan/acquisition numbers and indices:
A
spectrumId
(orspectrumID
) is a vendor specific field in the mzML file that contains some information about the run/spectrum, e.g.:controllerType=0 controllerNumber=1 scan=5281 file=2
acquisitionNum
is a more a less sanitize spectrum id generated from thespectrumId
field bymzR
(see here).scanIndex
is themzR
generated sequence number of the spectrum in the raw file (which doesn't have to be the same as theacquisitionNum
)
See also this issue.
Creation of objects, conversion, changing the backend and export
Spectra
classes can be created with the Spectra()
constructor function
which supports the following formats:
parameter
object
is adata.frame
orDataFrame
containing the spectrum data. The providedbackend
(by default a MsBackendMemory) will be initialized with that data.parameter
object
is a MsBackend (assumed to be already initialized).parameter
object
is missing, in which case it is supposed that the data is provided by the MsBackend class passed along with thebackend
argument.parameter
object
is of typecharacter
and is expected to be the file names(s) from which spectra should be imported. Parametersource
allows to define a MsBackend that is able to import the data from the provided source files. The default value forsource
isMsBackendMzR()
which allows to import spectra data from mzML, mzXML or CDF files.
With ...
additional arguments can be passed to the backend's
backendInitialize()
method. Parameter backend
allows to specify which
MsBackend should be used for data storage.
The backend of a Spectra
object can be changed with the setBackend()
method that takes an instance of the new backend as second parameter
backend
. A call to setBackend(sps, backend = MsBackendDataFrame())
would for example change the backend of sps
to the in-memory
MsBackendDataFrame
. Changing to a backend is only supported if that
backend has a data
parameter in its backendInitialize()
method and if
supportsSetBackend()
returns TRUE
for that backend. setBackend()
will
transfer the full spectra data from the originating backend as a
DataFrame
to the new backend.
Most read-only backends do not support setBackend()
. It is for example
not possible to change the backend to a read-only backend (such as
the MsBackendMzR()
backend).
The definition of the function is:
setBackend(object, backend, ..., f = dataStorage(object), BPPARAM = bpparam())
and its parameters are:
parameter
object
: theSpectra
object.parameter
backend
: an instance of the new backend, e.g.[MsBackendMemory()]
.parameter
f
: factor allowing to parallelize the change of the backends. By default the process of copying the spectra data from the original to the new backend is performed separately (and in parallel) for each file. Users are advised to use the default setting.parameter
...
: optional additional arguments passed to thebackendInitialize()
method of the newbackend
.parameter
BPPARAM
: setup for the parallel processing. Seebpparam()
for details.
Data from a Spectra
object can be exported to a file with the
export()
function. The actual export of the data has to be performed by
the export
method of the MsBackend class defined with the mandatory
parameter backend
. Note however that not all backend classes support
export of data. From the MsBackend
classes in the Spectra
package
currently only the MsBackendMzR
backend supports data export (to
mzML/mzXML file(s)); see the help page of the MsBackend for
information on its arguments or the examples below or the vignette
for examples.
The definition of the function is
export(object, backend, ...)
and its
parameters are:
object
: theSpectra
object to be exported.backend
: instance of a class extending MsBackend which supports export of the data (i.e. which has a definedexport
method)....
: additional parameters specific for theMsBackend
passed with parameterbackend
.
The dataStorageBasePath()
and dataStorageBasePath<-
functions allow, for
backend classes that support this operation, to get or change the base
path to the directory where the backend stores the data. In-memory backends
such as MsBackendMemory or MsBackendDataFrame keeping all MS data in
memory don't support, and need, this function, but for MsBackendMzR this
function can be used to update/adapt the path to the directory containing
the original data files. Thus, for Spectra
objects (using this backend)
that were moved to another file system or computer, these functions allow to
adjust/adapt the base file path.
Accessing spectra data
$
,$<-
: gets (or sets) a spectra variable for all spectra inobject
. See examples for details. Note that replacing values of a peaks variable is not supported with a non-empty processing queue, i.e. if any filtering or data manipulations on the peaks data was performed. In these casesapplyProcessing()
needs to be called first to apply all cached data operations.[[
,[[<-
: access or set/add a single spectrum variable (column) in the backend.acquisitionNum()
: returns the acquisition number of each spectrum. Returns aninteger
of length equal to the number of spectra (withNA_integer_
if not available).centroided()
,centroided<-
: gets or sets the centroiding information of the spectra.centroided()
returns alogical
vector of length equal to the number of spectra withTRUE
if a spectrum is centroided,FALSE
if it is in profile mode andNA
if it is undefined. See alsoisCentroided()
for estimating from the spectrum data whether the spectrum is centroided.value
forcentroided<-
is either a singlelogical
or alogical
of length equal to the number of spectra inobject
.collisionEnergy()
,collisionEnergy<-
: gets or sets the collision energy for all spectra inobject
.collisionEnergy()
returns anumeric
with length equal to the number of spectra (NA_real_
if not present/defined),collisionEnergy<-
takes anumeric
of length equal to the number of spectra inobject
.coreSpectraVariables()
: returns the core spectra variables along with their expected data type.dataOrigin()
,dataOrigin<-
: gets or sets the data origin for each spectrum.dataOrigin()
returns acharacter
vector (same length thanobject
) with the origin of the spectra.dataOrigin<-
expects acharacter
vector (same length thanobject
) with the replacement values for the data origin of each spectrum.dataStorage()
: returns acharacter
vector (same length thanobject
) with the data storage location of each spectrum.intensity()
: gets the intensity values from the spectra. Returns aNumericList()
ofnumeric
vectors (intensity values for each spectrum). The length of the list is equal to the number ofspectra
inobject
.ionCount()
: returns anumeric
with the sum of intensities for each spectrum. If the spectrum is empty (seeisEmpty()
),NA_real_
is returned.isCentroided()
: a heuristic approach assessing if the spectra inobject
are in profile or centroided mode. The function takes theqtl
th quantile top peaks, then calculates the difference between adjacent m/z value and returnsTRUE
if the first quartile is greater thank
. (SeeSpectra:::.isCentroided()
for the code.)isEmpty()
: checks whether a spectrum inobject
is empty (i.e. does not contain any peaks). Returns alogical
vector of length equal number of spectra.isolationWindowLowerMz()
,isolationWindowLowerMz<-
: gets or sets the lower m/z boundary of the isolation window.isolationWindowTargetMz()
,isolationWindowTargetMz<-
: gets or sets the target m/z of the isolation window.isolationWindowUpperMz()
,isolationWindowUpperMz<-
: gets or sets the upper m/z boundary of the isolation window.containsMz()
: checks for each of the spectra whether they contain mass peaks with an m/z equal tomz
(given acceptable difference as defined by parameterstolerance
andppm
- seecommon()
for details). Parameterwhich
allows to define whether any (which = "any"
, the default) or all (which = "all"
) of themz
have to match. The function returnsNA
ifmz
is of length 0 or isNA
.containsNeutralLoss()
: checks for each spectrum inobject
if it has a peak with an m/z value equal to its precursor m/z -neutralLoss
(given acceptable difference as defined by parameterstolerance
andppm
). ReturnsNA
for MS1 spectra (or spectra without a precursor m/z).length()
: gets the number of spectra in the object.lengths()
: gets the number of peaks (m/z-intensity values) per spectrum. Returns aninteger
vector (length equal to the number of spectra). For empty spectra,0
is returned.msLevel()
: gets the spectra's MS level. Returns an integer vector (names being spectrum names, length equal to the number of spectra) with the MS level for each spectrum.mz()
: gets the mass-to-charge ratios (m/z) from the spectra. Returns aNumericList()
or length equal to the number of spectra, each element anumeric
vector with the m/z values of one spectrum.peaksData()
: gets the peaks data for all spectra inobject
. Peaks data consist of the m/z and intensity values as well as possible additional annotations (variables) of all peaks of each spectrum. The function returns aSimpleList()
of two dimensional arrays (eithermatrix
ordata.frame
), with each array providing the values for the requested peak variables (by default"mz"
and"intensity"
). Optional parametercolumns
is passed to the backend'speaksData()
function to allow the selection of specific (or additional) peaks variables (columns) that should be extracted (if available). Importantly, it is not guaranteed that each backend supports this parameter (while each backend must support extraction of"mz"
and"intensity"
columns). Parametercolumns
defaults toc("mz", "intensity")
but any value returned bypeaksVariables(object)
is supported. Note also that it is possible to extract the peak data withas(x, "list")
andas(x, "SimpleList")
as alist
andSimpleList
, respectively. Note however that, in contrast topeaksData()
,as()
does not support the parametercolumns
.peaksVariables()
: lists the available variables for mass peaks provided by the backend. Default peak variables are"mz"
and"intensity"
(which all backends need to support and provide), but some backends might provide additional variables. These variables correspond to the column names of the peak data array returned bypeaksData()
.polarity()
,polarity<-
: gets or sets the polarity for each spectrum.polarity()
returns aninteger
vector (length equal to the number of spectra), with0
and1
representing negative and positive polarities, respectively.polarity<-
expects aninteger
vector of length 1 or equal to the number of spectra.precursorCharge()
,precursorIntensity()
,precursorMz()
,precScanNum()
,precAcquisitionNum()
: gets the charge (integer
), intensity (numeric
), m/z (numeric
), scan index (integer
) and acquisition number (interger
) of the precursor for MS level > 2 spectra from the object. Returns a vector of length equal to the number of spectra inobject
.NA
are reported for MS1 spectra of if no precursor information is available.rtime()
,rtime<-
: gets or sets the retention times (in seconds) for each spectrum.rtime()
returns anumeric
vector (length equal to the number of spectra) with the retention time for each spectrum.rtime<-
expects a numeric vector with length equal to the number of spectra.scanIndex()
: returns aninteger
vector with the scan index for each spectrum. This represents the relative index of the spectrum within each file. Note that this can be different to theacquisitionNum
of the spectrum which represents the index of the spectrum during acquisition/measurement (as reported in the mzML file).smoothed()
,smoothed<-
: gets or sets whether a spectrum is smoothed.smoothed()
returns alogical
vector of length equal to the number of spectra.smoothed<-
takes alogical
vector of length 1 or equal to the number of spectra inobject
.spectraData()
: gets general spectrum metadata (annotation, also called header).spectraData()
returns aDataFrame
. Note that this method does by default not return m/z or intensity values.spectraData<-
: replaces the full spectra data of theSpectra
object with the one provided withvalue
. ThespectraData<-
function expects aDataFrame
to be passed as value with the same number of rows as there a spectra inobject
. Note that replacing values of peaks variables is not supported with a non-empty processing queue, i.e. if any filtering or data manipulations on the peaks data was performed. In these casesapplyProcessing()
needs to be called first to apply all cached data operations and empty the processing queue.spectraNames()
,spectraNames<-
: gets or sets the spectra names.spectraVariables()
: returns acharacter
vector with the available spectra variables (columns, fields or attributes of each spectrum) available inobject
. Note thatspectraVariables()
does not list the peak variables ("mz"
,"intensity"
and eventual additional annotations for each MS peak). Peak variables are returned bypeaksVariables()
.tic()
: gets the total ion current/count (sum of signal of a spectrum) for all spectra inobject
. By default, the value reported in the original raw data file is returned. For an empty spectrum,0
is returned.uniqueMsLevels()
: get the unique MS levels available inobject
. This function is supposed to be more efficient thanunique(msLevel(object))
.
Filter spectra data
Filter a Spectra
object based on the spectra data. This includes subset
operations that immediately reduce the number of spectra in the object as
well as filters that reduce the content of the Spectra
object.
See section Filter peaks data below for functions that filter the peaks
data of a Spectra
.
[
: subsets the spectra keeping only selected elements (i
). The method always returns aSpectra
object.dropNaSpectraVariables()
: removes spectra variables (i.e. columns in the object'sspectraData
that contain only missing values (NA
). Note that while columns with onlyNA
s are removed, aspectraData()
call afterdropNaSpectraVariables()
might still show columns containingNA
values for core spectra variables. The total number of spectra is not changed by this function.filterAcquisitionNum()
: filters the object keeping only spectra matching the provided acquisition numbers (argumentn
). IfdataOrigin
ordataStorage
is also provided,object
is subsetted to the spectra with an acquisition number equal ton
in spectra with matching dataOrigin or dataStorage values retaining all other spectra. Returns the filteredSpectra
.filterDataOrigin()
: filters the object retaining spectra matching the provideddataOrigin
. ParameterdataOrigin
has to be of typecharacter
and needs to match exactly the data origin value of the spectra to subset. Returns the filteredSpectra
object (with spectra ordered according to the provideddataOrigin
parameter).filterDataStorage()
: filters the object retaining spectra stored in the specifieddataStorage
. ParameterdataStorage
has to be of typecharacter
and needs to match exactly the data storage value of the spectra to subset. Returns the filteredSpectra
object (with spectra ordered according to the provideddataStorage
parameter).filterEmptySpectra()
: removes empty spectra (i.e. spectra without peaks). Returns the filteredSpectra
object (with spectra in their original order).filterIsolationWindow()
: retains spectra that containmz
in their isolation window m/z range (i.e. with anisolationWindowLowerMz
<=mz
andisolationWindowUpperMz
>=mz
. Returns the filteredSpectra
object (with spectra in their original order).filterMsLevel()
: filters object by MS level keeping only spectra matching the MS level specified with argumentmsLevel
. Returns the filteredSpectra
(with spectra in their original order).filterPolarity()
: filters the object keeping only spectra matching the provided polarity. Returns the filteredSpectra
(with spectra in their original order).filterPrecursorCharge()
: retains spectra with the defined precursor charge(s).filterPrecursorIsotopes()
: groups MS2 spectra based on their precursor m/z and precursor intensity into predicted isotope groups and keep for each only the spectrum representing the monoisotopic precursor. MS1 spectra are returned as is. See documentation fordeisotopeSpectra()
below for details on isotope prediction and parameter description.filterPrecursorMaxIntensity()
: filters theSpectra
keeping for groups of (MS2) spectra with similar precursor m/z values (given parametersppm
andtolerance
) the one with the highest precursor intensity. The function filters only MS2 spectra and returns all MS1 spectra. If precursor intensities areNA
for all spectra within a spectra group, the first spectrum of that groups is returned. Note: some manufacturers don't provide precursor intensities. These can however also be estimated withestimatePrecursorIntensity()
.filterPrecursorMzRange()
(previouslyfilterPrecursorMz()
which is now deprecated): retains spectra with a precursor m/z within the provided m/z range. See examples for details on selecting spectra with a precursor m/z for a target m/z accepting a small difference in ppm.filterPrecursorMzValues()
: retains spectra with precursor m/z matching any of the provided m/z values (givenppm
andtolerance
). Spectra with missing precursor m/z value (e.g. MS1 spectra) are dropped.filterPrecursorScan()
: retains parent (e.g. MS1) and children scans (e.g. MS2) of acquisition numberacquisitionNum
. Returns the filteredSpectra
(with spectra in their original order). Parameterf
allows to define which spectra belong to the same sample or original data file ( defaults tof = dataOrigin(object)
).filterRanges()
: allows filtering of theSpectra
object based on user defined numeric ranges (parameterranges
) for one or more available spectra variables in object (spectra variable names can be specified with parameterspectraVariables
). Spectra for which the value of a spectra variable is within it's defined range are retained. If multiple ranges/spectra variables are defined, thematch
parameter can be used to specify whether all conditions (match = "all"
; the default) or if any of the conditions must match (match = "any"
; all spectra for which values are within any of the provided ranges are retained).filterRt()
: retains spectra of MS levelmsLevel
with retention times (in seconds) within (>=
)rt[1]
and (<=
)rt[2]
. Returns the filteredSpectra
(with spectra in their original order).filterValues()
: allows filtering of theSpectra
object based on similarities of numeric values of one or morespectraVariables(object)
(parameterspectraVariables
) to provided values (parametervalues
) given acceptable differences (parameters tolerance and ppm). If multiple values/spectra variables are defined, thematch
parameter can be used to specify whether all conditions (match = "all"
; the default) or if any of the conditions must match (match = "any"
; all spectra for which values are within any of the provided ranges are retained).selectSpectraVariables()
: reduces the information within the object to the selected spectra variables: all data for variables not specified will be dropped. For mandatory columns (i.e., those listed bycoreSpectraVariables()
, such as msLevel, rtime ...) only the values will be dropped but not the variable itself. Additional (or user defined) spectra variables will be completely removed. Returns the filteredSpectra
.
Filter or aggregate mass peak data
Operations that filter or aggregate the mass peak data from each spectrum
without changing the number of spectra in a Spectra
object. Also, the
actual subsetting/aggregation operation is only executed once peaks data is
accessed (through peaksData()
, mz()
or intensity()
) or
applyProcessing()
is called.
combinePeaks()
: combines mass peaks within each spectrum with a difference in their m/z values that is smaller than the maximal acceptable difference defined byppm
andtolerance
. ParametersintensityFun
andmzFun
allow to define functions to aggregate the intensity and m/z values for each such group of peaks. Withweighted = TRUE
(the default), the m/z value of the combined peak is calculated using an intensity-weighted mean and parametermzFun
is ignored. TheMsCoreUtils::group()
function is used for the grouping of mass peaks. ParametermsLevel.
allows to define selected MS levels for which peaks should be combined. This function returns aSpectra
with the same number of spectra than the input object, but with possibly combined peaks within each spectrum. dropped (i.e. their values are replaced withNA
) for combined peaks unless they are constant across the combined peaks. See alsoreduceSpectra()
for a function to select a single representative mass peak for each peak group.deisotopeSpectra()
: deisotopes each spectrum keeping only the monoisotopic peak for groups of isotopologues. Isotopologues are estimated using theisotopologues()
function from the MetaboCoreUtils package. Note that the default parameters for isotope prediction/detection have been determined using data from the Human Metabolome Database (HMDB) and isotopes for elements other than CHNOPS might not be detected. See parametersubstDefinition
in the documentation ofisotopologues()
for more information. The approach and code to define the parameters for isotope prediction is described here.filterFourierTransformArtefacts()
: removes (Orbitrap) fast fourier artefact peaks from spectra (see examples below). The function iterates through all intensity ordered peaks in a spectrum and removes all peaks with an m/z within +/-halfWindowSize
of the current peak if their intensity is lower thanthreshold
times the current peak's intensity. Additional parameterskeepIsotopes
,maxCharge
andisotopeTolerance
allow to avoid removing of potential[13]C
isotope peaks (maxCharge
being the maximum charge that should be considered andisotopeTolerance
the absolute acceptable tolerance for matching their m/z). SeefilterFourierTransformArtefacts()
for details and background anddeisitopeSpectra()
for an alternative.filterIntensity()
: filters mass peaks in each spectrum keeping only those with intensities that are within the provided range or match the criteria of the provided function. For the former, parameterintensity
has to be anumeric
defining the intensity range, for the latter afunction
that takes the intensity values of the spectrum and returns alogical
whether the peak should be retained or not (see examples below for details) - additional parameters to the function can be passed with...
. To remove only peaks with intensities below a certain threshold, say 100, useintensity = c(100, Inf)
. Note: also a single value can be passed with theintensity
parameter in which case an upper limit ofInf
is used. Note that this function removes also peaks with missing intensities (i.e. an intensity ofNA
). ParametermsLevel.
allows to restrict the filtering to spectra of the specified MS level(s).filterMzRange()
: filters mass peaks in the object keeping or removing those in each spectrum that are within the provided m/z range. Whether peaks are retained or removed can be configured with parameterkeep
(defaultkeep = TRUE
).filterMzValues()
: filters mass peaks in the object keeping all peaks in each spectrum that match the provided m/z value(s) (forkeep = TRUE
, the default) or removing all of them (forkeep = FALSE
). The m/z matching considers also the absolutetolerance
and m/z-relativeppm
values.tolerance
andppm
have to be of length 1.filterPeaksRanges()
: filters mass peaks of aSpectra
object using any set of range-based filters on numeric spectra or peaks variables. SeefilterPeaksRanges()
for more information.filterPrecursorPeaks()
: removes peaks from each spectrum inobject
with an m/z equal or larger than the m/z of the precursor, depending on the value of parametermz
: formz = ==" (the default) peaks with matching m/z (considering an absolute and relative acceptable difference depending on
toleranceand
ppm, respectively) are removed. For
mz = ">="all peaks with an m/z larger or equal to the precursor m/z (minus
toleranceand the
ppmof the precursor m/z) are removed. Parameter
msLevel.allows to restrict the filter to certain MS levels (by default the filter is applied to all MS levels). Note that no peaks are removed if the precursor m/z is
NA` (e.g. typically for MS1 spectra).reduceSpectra()
: keeps for groups of peaks with similar m/z values in (givenppm
andtolerance
) in each spectrum only the peak with the highest intensity removing all other peaks hence reducing each spectrum to the highest intensity peaks per peak group. Peak groups are defined using thegroup()
function from the MsCoreUtils package. See also thecombinePeaks()
function for an alternative function to combine peaks within each spectrum.
Merging, aggregating and splitting
Several Spectra
objects can be concatenated into a single object with the
c()
or the concatenateSpectra()
function. Concatenation will fail if the
processing queue of any of the Spectra
objects is not empty or if
different backends are used in the Spectra
objects. Thus, in these cases,
prior to merging Spectra
object it is suggested to change the backend to
a MsBackendMemory
using the setBackend()
function, and to apply all
data processing steps using applyProcessing()
. The spectra variables
of the resulting Spectra
object is the union of the spectra variables of
the individual Spectra
objects.
combineSpectra()
: combines MS data (i.e. mass peaks) from sets of spectra into a single spectrum per set (in contrast tocombinePeaks()
orreduceSpectra()
that combine mass peaks within each spectrum). For each spectrum group (set), spectra variables from the first spectrum are used and the peak matrices are combined using the function specified withFUN
, which defaults tocombinePeaksData()
. Please refer to thecombinePeaksData()
help page for details and options of the actual combination of peaks across the sets of spectra and to the package vignette for examples and alternative ways to aggregate spectra. The sets of spectra can be specified with parameterf
. In addition it is possible to define, with parameterp
if and how to split the input data for parallel processing. This defaults top = x$dataStorage
and hence a per-file parallel processing is applied forSpectra
with file-based backends (such as theMsBackendMzR()
). Prior combination of the spectra all processings queued in the lazy evaluation queue are applied. Be aware that callingcombineSpectra()
on aSpectra
object with certain backends that allow modifications might overwrite the original data. This does not happen with aMsBackendMemory
orMsBackendDataFrame
backend, but with aMsBackendHdf5Peaks
backend the m/z and intensity values in the original hdf5 file(s) will be overwritten. The function returns aSpectra
of length equal to the unique levels off
.joinSpectraData()
: Individual spectra variables can be directly added with the$<-
or[[<-
syntax. ThejoinSpectraData()
function allows to merge aDataFrame
to the existing spectra data. This function diverges from themerge()
method in two main ways:The
by.x
andby.y
column names must be of length 1.If variable names are shared in
x
andy
, the spectra variables ofx
are not modified. It's only they
variables that are appended the suffix defined insuffix.y
. This is to avoid modifying any core spectra variables that would lead to an invalid object.Duplicated Spectra keys (i.e.
x[[by.x]]
) are not allowed. Duplicated keys in theDataFrame
(i.ey[[by.y]]
) throw a warning and only the last occurrence is kept. These should be explored and ideally be removed using forQFeatures::reduceDataFrame()
,PMS::reducePSMs()
or similar functions.
split()
: splits theSpectra
object based on parameterf
into alist
ofSpectra
objects.
Data manipulation and analysis methods
Many data manipulation operations, such as those listed in this section, are
not applied immediately to the spectra, but added to a
lazy processing/manipulation queue. Operations stored in this queue are
applied on-the-fly to spectra data each time it is accessed. This lazy
execution guarantees the same functionality for Spectra
objects with
any backend, i.e. backends supporting to save changes to spectrum data
(MsBackendMemory()
, MsBackendDataFrame()
or MsBackendHdf5Peaks()
) as
well as read-only backends (such as the MsBackendMzR()
).
Note that for the former it is possible to apply the processing queue and
write the modified peak data back to the data storage with the
applyProcessing()
function.
addProcessing()
: adds an arbitrary function that should be applied to the peaks matrix of every spectrum inobject
. The function (can be passed with parameterFUN
) is expected to take a peaks matrix as input and to return a peaks matrix. A peaks matrix is a numeric matrix with two columns, the first containing the m/z values of the peaks and the second the corresponding intensities. The function has to have...
in its definition. Additional arguments can be passed with...
. With parameterspectraVariables
it is possible to define additional spectra variables fromobject
that should be passed to the functionFUN
. These will be passed by their name (e.g. specifyingspectraVariables = "precursorMz"
will pass the spectra's precursor m/z as a parameter namedprecursorMz
to the function. The only exception is the spectra's MS level, these will be passed to the function as a parameter calledspectrumMsLevel
(i.e. withspectraVariables = "msLevel"
the MS levels of each spectrum will be submitted to the function as a parameter calledspectrumMsLevel
). Examples are provided in the package vignette.applyProcessing()
: forSpectra
objects that use a writeable backend only: apply all steps from the lazy processing queue to the peak data and write it back to the data storage. Parameterf
allows to specify howobject
should be split for parallel processing. This should either be equal to thedataStorage
, orf = rep(1, length(object))
to disable parallel processing alltogether. Other partitionings might result in errors (especially if aMsBackendHdf5Peaks
backend is used).bin()
: aggregates individual spectra into discrete (m/z) bins. Binning is performed only on spectra of the specified MS level(s) (parametermsLevel
, by default all MS levels ofx
). The bins can be defined with parameterbreaks
which by default are equally sized bins, with size being defined by parameterbinSize
, from the minimal to the maximal m/z of all spectra (of MS levelmsLevel
) withinx
. The same bins are used for all spectra inx
. All intensity values for peaks falling into the same bin are aggregated using the function provided with parameterFUN
(defaults toFUN = sum
, i.e. all intensities are summed up). Note that the binning operation is applied to the peak data on-the-fly upon data access and it is possible to revert the operation with thereset()
function (see description ofreset()
above).compareSpectra()
: compares each spectrum inx
with each spectrum iny
using the function provided withFUN
(defaults tondotproduct()
). Ify
is missing, each spectrum inx
is compared with each other spectrum inx
. The matching/mapping of peaks between the compared spectra is done with theMAPFUN
function. The defaultjoinPeaks()
matches peaks of both spectra and allows to keep all peaks from the first spectrum (type = "left"
), from the second (type = "right"
), from both (type = "outer"
) and to keep only matching peaks (type = "inner"
); seejoinPeaks()
for more information and examples). TheMAPFUN
function should have parametersx
,y
,xPrecursorMz
andyPrecursorMz
as these values are passed to the function. In addition tojoinPeaks()
alsojoinPeaksGnps()
is supported for GNPS-like similarity score calculations. Note thatjoinPeaksGnps()
should only be used in combination withFUN = MsCoreUtils::gnps
(seejoinPeaksGnps()
for more information and details). UseMAPFUN = joinPeaksNone
to disable internal peak matching/mapping if a similarity scoring function is used that performs the matching internally.FUN
is supposed to be a function to compare intensities of (matched) peaks of the two spectra that are compared. The function needs to take two matrices with columns"mz"
and"intensity"
as input and is supposed to return a single numeric as result. In addition to the two peak matrices the spectra's precursor m/z values are passed to the function as parametersxPrecursorMz
(precursor m/z of thex
peak matrix) andyPrecursorMz
(precursor m/z of they
peak matrix). Additional parameters to functionsFUN
andMAPFUN
can be passed with...
. Parametersppm
andtolerance
are passed to bothMAPFUN
andFUN
. The function returns amatrix
with the results ofFUN
for each comparison, number of rows equal tolength(x)
and number of columns equallength(y)
(i.e. element in row 2 and column 3 is the result from the comparison ofx[2]
withy[3]
). IfSIMPLIFY = TRUE
thematrix
is simplified to anumeric
if length ofx
ory
is one. See also the vignette for additional examples, such as using spectral entropy similarity in the scoring.entropy()
: calculates the entropy of each spectra based on the metrics suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z). See alsonentropy()
in the MsCoreUtils package for details.estimatePrecursorIntensity()
: defines the precursor intensities for MS2 spectra using the intensity of the matching MS1 peak from the closest MS1 spectrum (i.e. the last MS1 spectrum measured before the respective MS2 spectrum). Withmethod = "interpolation"
it is also possible to calculate the precursor intensity based on an interpolation of intensity values (and retention times) of the matching MS1 peaks from the previous and next MS1 spectrum. SeeestimatePrecursorIntensity()
for examples and more details.estimatePrecursorMz()
: for DDA data: allows to estimate a fragment spectra's precursor m/z based on the reported precursor m/z and the data from the previous MS1 spectrum. SeeestimatePrecursorMz()
for details.neutralLoss()
: calculates neutral loss spectra for fragment spectra. SeeneutralLoss()
for detailed documentation.processingLog()
: returns acharacter
vector with the processing log messages.reset()
: restores the data to its original state (as much as possible): removes any processing steps from the lazy processing queue and callsreset()
on the backend which, depending on the backend, can also undo e.g. data filtering operations. Note that areset*(
call afterapplyProcessing()
will not have any effect. See examples below for more information.scalePeaks()
: scales intensities of peaks within each spectrum depending on parameterby
. Withby = sum
(the default) peak intensities are divided by the sum of peak intensities within each spectrum. The sum of intensities is thus 1 for each spectrum after scaling. ParametermsLevel.
allows to apply the scaling of spectra of a certain MS level. By default (msLevel. = uniqueMsLevels(x)
) intensities for all spectra will be scaled.spectrapply()
: applies a given function to each individual spectrum or sets of aSpectra
object. By default, theSpectra
is split into individual spectra (i.e.Spectra
of length 1) and the functionFUN
is applied to each of them. An alternative splitting can be defined with parameterf
. Parameters forFUN
can be passed using...
. The returned result and its order depend on the functionFUN
and howobject
is split (hence onf
, if provided). Parallel processing is supported and can be configured with parameterBPPARAM
, is however only suggested for computational intenseFUN
. As an alternative to the (eventual parallel) processing of the fullSpectra
,spectrapply()
supports also a chunk-wise processing. For this, parameterchunkSize
needs to be specified.object
is then split into chunks of sizechunkSize
which are then (stepwise) processed byFUN
. This guarantees a lower memory demand (especially for on-disk backends) since only the data for one chunk needs to be loaded into memory in each iteration. Note that by specifyingchunkSize
, parametersf
andBPPARAM
will be ignored. See alsochunkapply()
or examples below for details on chunk-wise processing.smooth()
: smooths individual spectra using a moving window-based approach (window size =2 * halfWindowSize
). Currently, the Moving-Average- (method = "MovingAverage"
), Weighted-Moving-Average- (method = "WeightedMovingAverage")
, weights depending on the distance of the center and calculated1/2^(-halfWindowSize:halfWindowSize)
) and Savitzky-Golay-Smoothing (method = "SavitzkyGolay"
) are supported. For details how to choose the correcthalfWindowSize
please seeMsCoreUtils::smooth()
.pickPeaks()
: picks peaks on individual spectra using a moving window-based approach (window size =2 * halfWindowSize
). For noisy spectra there are currently two different noise estimators available, the Median Absolute Deviation (method = "MAD"
) and Friedman's Super Smoother (method = "SuperSmoother"
), as implemented in theMsCoreUtils::noise()
. The method supports also to optionally refine the m/z value of the identified centroids by considering data points that belong (most likely) to the same mass peak. Therefore the m/z value is calculated as an intensity weighted average of the m/z values within the peak region. The peak region is defined as the m/z values (and their respective intensities) of the2 * k
closest signals to the centroid or the closest valleys (descending = TRUE
) in the2 * k
region. For the latter thek
has to be chosen general larger. SeeMsCoreUtils::refineCentroids()
for details. If the ratio of the signal to the highest intensity of the peak is belowthreshold
it will be ignored for the weighted average.replaceIntensitiesBelow()
: replaces intensities below a specified threshold with the providedvalue
. Parameterthreshold
can be either a single numeric value or a function which is applied to all non-NA
intensities of each spectrum to determine a threshold value for each spectrum. The default isthreshold = min
which replaces all values which are <= the minimum intensity in a spectrum withvalue
(the default forvalue
is0
). Note that the function specified withthreshold
is expected to have a parameterna.rm
sincena.rm = TRUE
will be passed to the function. If the spectrum is in profile mode, ranges of successive non-0 peaks <=threshold
are set to 0. ParametermsLevel.
allows to apply this to only spectra of certain MS level(s).
Author
Nir Shahaf, Johannes Rainer
Nir Shahaf
Johannes Rainer
Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail
Examples
## Create a Spectra providing a `DataFrame` containing the spectrum data.
spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2))
spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2))
spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8))
data <- Spectra(spd)
data
#> MSn data (Spectra) with 2 spectra in a MsBackendMemory backend:
#> msLevel rtime scanIndex
#> <integer> <numeric> <integer>
#> 1 1 1.1 NA
#> 2 2 1.2 NA
#> ... 16 more variables/columns.
## Get the number of spectra
length(data)
#> [1] 2
## Get the number of peaks per spectrum
lengths(data)
#> [1] 4 3
## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk
## backend.
sciex_file <- dir(system.file("sciex", package = "msdata"),
full.names = TRUE)
sciex <- Spectra(sciex_file, backend = MsBackendMzR())
sciex
#> MSn data (Spectra) with 1862 spectra in a MsBackendMzR backend:
#> msLevel rtime scanIndex
#> <integer> <numeric> <integer>
#> 1 1 0.280 1
#> 2 1 0.559 2
#> 3 1 0.838 3
#> 4 1 1.117 4
#> 5 1 1.396 5
#> ... ... ... ...
#> 1858 1 258.636 927
#> 1859 1 258.915 928
#> 1860 1 259.194 929
#> 1861 1 259.473 930
#> 1862 1 259.752 931
#> ... 33 more variables/columns.
#>
#> file(s):
#> 20171016_POOL_POS_1_105-134.mzML
#> 20171016_POOL_POS_3_105-134.mzML
## The MS data is on disk and will be read into memory on-demand. We can
## however change the backend to a MsBackendMemory backend which will
## keep all of the data in memory.
sciex_im <- setBackend(sciex, MsBackendMemory())
sciex_im
#> MSn data (Spectra) with 1862 spectra in a MsBackendMemory backend:
#> msLevel rtime scanIndex
#> <integer> <numeric> <integer>
#> 1 1 0.280 1
#> 2 1 0.559 2
#> 3 1 0.838 3
#> 4 1 1.117 4
#> 5 1 1.396 5
#> ... ... ... ...
#> 1858 1 258.636 927
#> 1859 1 258.915 928
#> 1860 1 259.194 929
#> 1861 1 259.473 930
#> 1862 1 259.752 931
#> ... 33 more variables/columns.
#> Processing:
#> Switch backend from MsBackendMzR to MsBackendMemory [Fri Sep 13 07:09:44 2024]
## The `MsBackendMemory()` supports the `setBackend()` method:
supportsSetBackend(MsBackendMemory())
#> [1] TRUE
## Thus, it is possible to change to that backend with `setBackend()`. Most
## read-only backends however don't support that, such as the
## `MsBackendMzR` and `setBackend()` would fail to change to that backend.
supportsSetBackend(MsBackendMzR())
#> [1] FALSE
## The on-disk object `sciex` is light-weight, because it does not keep the
## MS peak data in memory. The `sciex_im` object in contrast keeps all the
## data in memory and its size is thus much larger.
object.size(sciex)
#> 395976 bytes
object.size(sciex_im)
#> 55802328 bytes
## The spectra variable `dataStorage` returns for each spectrum the location
## where the data is stored. For in-memory objects:
head(dataStorage(sciex_im))
#> [1] "<memory>" "<memory>" "<memory>" "<memory>" "<memory>" "<memory>"
## While objects that use an on-disk backend will list the files where the
## data is stored.
head(dataStorage(sciex))
#> [1] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [2] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [3] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [4] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [5] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [6] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
## The spectra variable `dataOrigin` returns for each spectrum the *origin*
## of the data. If the data is read from e.g. mzML files, this will be the
## original mzML file name:
head(dataOrigin(sciex))
#> [1] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [2] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [3] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [4] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [5] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [6] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
head(dataOrigin(sciex_im))
#> [1] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [2] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [3] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [4] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [5] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [6] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
## ---- ACCESSING AND ADDING DATA ----
## Get the MS level for each spectrum.
msLevel(data)
#> [1] 1 2
## Alternatively, we could also use $ to access a specific spectra variable.
## This could also be used to add additional spectra variables to the
## object (see further below).
data$msLevel
#> [1] 1 2
## Get the intensity and m/z values.
intensity(data)
#> NumericList of length 2
#> [[1]] 200 400 34.2 17
#> [[2]] 12.3 15.2 6.8
mz(data)
#> NumericList of length 2
#> [[1]] 100 103.2 104.3 106.5
#> [[2]] 45.6 120.4 190.2
## Determine whether one of the spectra has a specific m/z value
containsMz(data, mz = 120.4)
#> [1] FALSE TRUE
## Accessing spectra variables works for all backends:
intensity(sciex)
#> NumericList of length 1862
#> [[1]] 0 412 0 0 412 0 0 412 0 0 412 0 0 ... 0 412 0 0 412 0 0 412 0 0 412 412 0
#> [[2]] 0 140 0 0 140 0 0 419 0 0 140 0 0 ... 0 140 0 0 140 0 0 140 0 0 279 140 0
#> [[3]] 0 132 263 263 132 132 0 0 132 132 0 0 ... 0 0 132 0 0 132 0 0 132 0 132 0
#> [[4]] 0 139 139 0 0 139 0 0 139 139 0 139 0 ... 0 0 139 0 0 277 0 0 139 0 139 0
#> [[5]] 0 164 0 0 328 0 164 0 0 164 0 0 164 ... 164 0 0 164 0 0 164 0 164 0 328 0
#> [[6]] 0 146 146 146 0 0 146 0 0 146 0 0 ... 146 0 0 146 146 0 0 146 0 0 146 0
#> [[7]] 0 296 0 296 0 0 148 0 0 148 0 0 148 ... 0 0 148 0 0 148 0 0 148 0 0 148 0
#> [[8]] 0 170 0 170 170 170 0 170 0 0 170 0 ... 170 0 0 170 0 0 170 0 0 170 170 0
#> [[9]] 0 157 0 314 0 0 157 0 0 157 0 0 314 ... 0 0 157 0 0 157 0 0 157 0 157 0
#> [[10]] 0 151 302 302 604 0 302 0 0 151 0 0 ... 151 0 0 151 0 151 0 151 0 151 0
#> ...
#> <1852 more elements>
intensity(sciex_im)
#> NumericList of length 1862
#> [[1]] 0 412 0 0 412 0 0 412 0 0 412 0 0 ... 0 412 0 0 412 0 0 412 0 0 412 412 0
#> [[2]] 0 140 0 0 140 0 0 419 0 0 140 0 0 ... 0 140 0 0 140 0 0 140 0 0 279 140 0
#> [[3]] 0 132 263 263 132 132 0 0 132 132 0 0 ... 0 0 132 0 0 132 0 0 132 0 132 0
#> [[4]] 0 139 139 0 0 139 0 0 139 139 0 139 0 ... 0 0 139 0 0 277 0 0 139 0 139 0
#> [[5]] 0 164 0 0 328 0 164 0 0 164 0 0 164 ... 164 0 0 164 0 0 164 0 164 0 328 0
#> [[6]] 0 146 146 146 0 0 146 0 0 146 0 0 ... 146 0 0 146 146 0 0 146 0 0 146 0
#> [[7]] 0 296 0 296 0 0 148 0 0 148 0 0 148 ... 0 0 148 0 0 148 0 0 148 0 0 148 0
#> [[8]] 0 170 0 170 170 170 0 170 0 0 170 0 ... 170 0 0 170 0 0 170 0 0 170 170 0
#> [[9]] 0 157 0 314 0 0 157 0 0 157 0 0 314 ... 0 0 157 0 0 157 0 0 157 0 157 0
#> [[10]] 0 151 302 302 604 0 302 0 0 151 0 0 ... 151 0 0 151 0 151 0 151 0 151 0
#> ...
#> <1852 more elements>
## Get the m/z for the first spectrum.
mz(data)[[1]]
#> [1] 100.0 103.2 104.3 106.5
## Get the peak data (m/z and intensity values).
pks <- peaksData(data)
pks
#> List of length 2
pks[[1]]
#> mz intensity
#> [1,] 100.0 200.0
#> [2,] 103.2 400.0
#> [3,] 104.3 34.2
#> [4,] 106.5 17.0
pks[[2]]
#> mz intensity
#> [1,] 45.6 12.3
#> [2,] 120.4 15.2
#> [3,] 190.2 6.8
## Note that we could get the same resulb by coercing the `Spectra` to
## a `list` or `SimpleList`:
as(data, "list")
#> [[1]]
#> mz intensity
#> [1,] 100.0 200.0
#> [2,] 103.2 400.0
#> [3,] 104.3 34.2
#> [4,] 106.5 17.0
#>
#> [[2]]
#> mz intensity
#> [1,] 45.6 12.3
#> [2,] 120.4 15.2
#> [3,] 190.2 6.8
#>
as(data, "SimpleList")
#> List of length 2
## List all available spectra variables (i.e. spectrum data and metadata).
spectraVariables(data)
#> [1] "msLevel" "rtime"
#> [3] "acquisitionNum" "scanIndex"
#> [5] "dataStorage" "dataOrigin"
#> [7] "centroided" "smoothed"
#> [9] "polarity" "precScanNum"
#> [11] "precursorMz" "precursorIntensity"
#> [13] "precursorCharge" "collisionEnergy"
#> [15] "isolationWindowLowerMz" "isolationWindowTargetMz"
#> [17] "isolationWindowUpperMz"
## For all *core* spectrum variables accessor functions are available. These
## return NA if the variable was not set.
centroided(data)
#> [1] NA NA
dataStorage(data)
#> [1] "<memory>" "<memory>"
rtime(data)
#> [1] 1.1 1.2
precursorMz(data)
#> [1] NA NA
## The core spectra variables are:
coreSpectraVariables()
#> msLevel rtime acquisitionNum
#> "integer" "numeric" "integer"
#> scanIndex mz intensity
#> "integer" "NumericList" "NumericList"
#> dataStorage dataOrigin centroided
#> "character" "character" "logical"
#> smoothed polarity precScanNum
#> "logical" "integer" "integer"
#> precursorMz precursorIntensity precursorCharge
#> "numeric" "numeric" "integer"
#> collisionEnergy isolationWindowLowerMz isolationWindowTargetMz
#> "numeric" "numeric" "numeric"
#> isolationWindowUpperMz
#> "numeric"
## Add an additional metadata column.
data$spectrum_id <- c("sp_1", "sp_2")
## List spectra variables, "spectrum_id" is now also listed
spectraVariables(data)
#> [1] "msLevel" "rtime"
#> [3] "acquisitionNum" "scanIndex"
#> [5] "dataStorage" "dataOrigin"
#> [7] "centroided" "smoothed"
#> [9] "polarity" "precScanNum"
#> [11] "precursorMz" "precursorIntensity"
#> [13] "precursorCharge" "collisionEnergy"
#> [15] "isolationWindowLowerMz" "isolationWindowTargetMz"
#> [17] "isolationWindowUpperMz" "spectrum_id"
## Get the values for the new spectra variable
data$spectrum_id
#> [1] "sp_1" "sp_2"
## Extract specific spectra variables.
spectraData(data, columns = c("spectrum_id", "msLevel"))
#> DataFrame with 2 rows and 2 columns
#> spectrum_id msLevel
#> <character> <integer>
#> 1 sp_1 1
#> 2 sp_2 2
## Drop spectra variable data and/or columns.
res <- selectSpectraVariables(data, c("mz", "intensity"))
## This removed the additional columns "spectrum_id" and deleted all values
## for all spectra variables, except "mz" and "intensity".
spectraData(res)
#> DataFrame with 2 rows and 17 columns
#> msLevel rtime acquisitionNum scanIndex dataStorage dataOrigin
#> <integer> <numeric> <integer> <integer> <character> <character>
#> 1 NA NA NA NA <memory> NA
#> 2 NA NA NA NA <memory> NA
#> centroided smoothed polarity precScanNum precursorMz precursorIntensity
#> <logical> <logical> <integer> <integer> <numeric> <numeric>
#> 1 NA NA NA NA NA NA
#> 2 NA NA NA NA NA NA
#> precursorCharge collisionEnergy isolationWindowLowerMz
#> <integer> <numeric> <numeric>
#> 1 NA NA NA
#> 2 NA NA NA
#> isolationWindowTargetMz isolationWindowUpperMz
#> <numeric> <numeric>
#> 1 NA NA
#> 2 NA NA
## Compared to the data before selectSpectraVariables.
spectraData(data)
#> DataFrame with 2 rows and 18 columns
#> msLevel rtime acquisitionNum scanIndex dataStorage dataOrigin
#> <integer> <numeric> <integer> <integer> <character> <character>
#> 1 1 1.1 NA NA <memory> NA
#> 2 2 1.2 NA NA <memory> NA
#> centroided smoothed polarity precScanNum precursorMz precursorIntensity
#> <logical> <logical> <integer> <integer> <numeric> <numeric>
#> 1 NA NA NA NA NA NA
#> 2 NA NA NA NA NA NA
#> precursorCharge collisionEnergy isolationWindowLowerMz
#> <integer> <numeric> <numeric>
#> 1 NA NA NA
#> 2 NA NA NA
#> isolationWindowTargetMz isolationWindowUpperMz spectrum_id
#> <numeric> <numeric> <character>
#> 1 NA NA sp_1
#> 2 NA NA sp_2
## ---- SUBSETTING, FILTERING AND COMBINING
## Subset to all MS2 spectra.
data[msLevel(data) == 2]
#> MSn data (Spectra) with 1 spectra in a MsBackendMemory backend:
#> msLevel rtime scanIndex
#> <integer> <numeric> <integer>
#> 1 2 1.2 NA
#> ... 17 more variables/columns.
## Same with the filterMsLevel function
filterMsLevel(data, 2)
#> MSn data (Spectra) with 1 spectra in a MsBackendMemory backend:
#> msLevel rtime scanIndex
#> <integer> <numeric> <integer>
#> 1 2 1.2 NA
#> ... 17 more variables/columns.
#> Processing:
#> Filter: select MS level(s) 2 [Fri Sep 13 07:09:45 2024]
## Below we combine the `data` and `sciex_im` objects into a single one.
data_comb <- c(data, sciex_im)
## The combined Spectra contains a union of all spectra variables:
head(data_comb$spectrum_id)
#> [1] "sp_1" "sp_2" NA NA NA NA
head(data_comb$rtime)
#> [1] 1.100 1.200 0.280 0.559 0.838 1.117
head(data_comb$dataStorage)
#> [1] "<memory>" "<memory>" "<memory>" "<memory>" "<memory>" "<memory>"
head(data_comb$dataOrigin)
#> [1] NA
#> [2] NA
#> [3] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [4] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [5] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [6] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
## Filter a Spectra for a target precursor m/z with a tolerance of 10ppm
spd$precursorMz <- c(323.4, 543.2302)
data_filt <- Spectra(spd)
filterPrecursorMzRange(data_filt, mz = 543.23 + ppm(c(-543.23, 543.23), 10))
#> MSn data (Spectra) with 0 spectra in a MsBackendMemory backend:
#> Processing:
#> Filter: select spectra with a precursor m/z within [543.2354323, 543.2354323] [Fri Sep 13 07:09:45 2024]
## Filter a Spectra keeping only peaks matching certain m/z values
sps_sub <- filterMzValues(data, mz = c(103, 104), tolerance = 0.3)
mz(sps_sub)
#> NumericList of length 2
#> [[1]] 103.2 104.3
#> [[2]] numeric(0)
## This function can also be used to remove specific peaks from a spectrum
## by setting `keep = FALSE`.
sps_sub <- filterMzValues(data, mz = c(103, 104),
tolerance = 0.3, keep = FALSE)
mz(sps_sub)
#> NumericList of length 2
#> [[1]] 100 106.5
#> [[2]] 45.6 120.4 190.2
## Note that `filterMzValues()` keeps or removes all peaks with a matching
## m/z given the provided `ppm` and `tolerance` parameters.
## Filter a Spectra keeping only peaks within a m/z range
sps_sub <- filterMzRange(data, mz = c(100, 300))
mz(sps_sub)
#> NumericList of length 2
#> [[1]] 100 103.2 104.3 106.5
#> [[2]] 120.4 190.2
## Remove empty spectra variables
sciex_noNA <- dropNaSpectraVariables(sciex)
## Available spectra variables before and after `dropNaSpectraVariables()`
spectraVariables(sciex)
#> [1] "msLevel" "rtime"
#> [3] "acquisitionNum" "scanIndex"
#> [5] "dataStorage" "dataOrigin"
#> [7] "centroided" "smoothed"
#> [9] "polarity" "precScanNum"
#> [11] "precursorMz" "precursorIntensity"
#> [13] "precursorCharge" "collisionEnergy"
#> [15] "isolationWindowLowerMz" "isolationWindowTargetMz"
#> [17] "isolationWindowUpperMz" "peaksCount"
#> [19] "totIonCurrent" "basePeakMZ"
#> [21] "basePeakIntensity" "ionisationEnergy"
#> [23] "lowMZ" "highMZ"
#> [25] "mergedScan" "mergedResultScanNum"
#> [27] "mergedResultStartScanNum" "mergedResultEndScanNum"
#> [29] "injectionTime" "filterString"
#> [31] "spectrumId" "ionMobilityDriftTime"
#> [33] "scanWindowLowerLimit" "scanWindowUpperLimit"
spectraVariables(sciex_noNA)
#> [1] "msLevel" "rtime"
#> [3] "acquisitionNum" "scanIndex"
#> [5] "dataStorage" "dataOrigin"
#> [7] "centroided" "smoothed"
#> [9] "polarity" "precScanNum"
#> [11] "precursorMz" "precursorIntensity"
#> [13] "precursorCharge" "collisionEnergy"
#> [15] "isolationWindowLowerMz" "isolationWindowTargetMz"
#> [17] "isolationWindowUpperMz" "peaksCount"
#> [19] "totIonCurrent" "basePeakMZ"
#> [21] "basePeakIntensity" "ionisationEnergy"
#> [23] "lowMZ" "highMZ"
#> [25] "injectionTime" "spectrumId"
## Adding new spectra variables
sciex1 <- filterDataOrigin(sciex, dataOrigin(sciex)[1])
spv <- DataFrame(spectrumId = sciex1$spectrumId[3:12], ## used for merging
var1 = rnorm(10),
var2 = sample(letters, 10))
spv
#> DataFrame with 10 rows and 3 columns
#> spectrumId var1 var2
#> <character> <numeric> <character>
#> 1 sample=1 period=1 cy.. -1.40004352 w
#> 2 sample=1 period=1 cy.. 0.25531705 k
#> 3 sample=1 period=1 cy.. -2.43726361 y
#> 4 sample=1 period=1 cy.. -0.00557129 e
#> 5 sample=1 period=1 cy.. 0.62155272 u
#> 6 sample=1 period=1 cy.. 1.14841161 l
#> 7 sample=1 period=1 cy.. -1.82181766 b
#> 8 sample=1 period=1 cy.. -0.24732530 f
#> 9 sample=1 period=1 cy.. -0.24419961 d
#> 10 sample=1 period=1 cy.. -0.28270545 p
sciex2 <- joinSpectraData(sciex1, spv, by.y = "spectrumId")
spectraVariables(sciex2)
#> [1] "msLevel" "rtime"
#> [3] "acquisitionNum" "scanIndex"
#> [5] "dataStorage" "dataOrigin"
#> [7] "centroided" "smoothed"
#> [9] "polarity" "precScanNum"
#> [11] "precursorMz" "precursorIntensity"
#> [13] "precursorCharge" "collisionEnergy"
#> [15] "isolationWindowLowerMz" "isolationWindowTargetMz"
#> [17] "isolationWindowUpperMz" "peaksCount"
#> [19] "totIonCurrent" "basePeakMZ"
#> [21] "basePeakIntensity" "ionisationEnergy"
#> [23] "lowMZ" "highMZ"
#> [25] "mergedScan" "mergedResultScanNum"
#> [27] "mergedResultStartScanNum" "mergedResultEndScanNum"
#> [29] "injectionTime" "filterString"
#> [31] "spectrumId" "ionMobilityDriftTime"
#> [33] "scanWindowLowerLimit" "scanWindowUpperLimit"
#> [35] "var1" "var2"
spectraData(sciex2)[1:13, c("spectrumId", "var1", "var2")]
#> DataFrame with 13 rows and 3 columns
#> spectrumId var1 var2
#> <character> <numeric> <character>
#> 1 sample=1 period=1 cy.. NA NA
#> 2 sample=1 period=1 cy.. NA NA
#> 3 sample=1 period=1 cy.. -1.400044 w
#> 4 sample=1 period=1 cy.. 0.255317 k
#> 5 sample=1 period=1 cy.. -2.437264 y
#> ... ... ... ...
#> 9 sample=1 period=1 cy.. -1.821818 b
#> 10 sample=1 period=1 cy.. -0.247325 f
#> 11 sample=1 period=1 cy.. -0.244200 d
#> 12 sample=1 period=1 cy.. -0.282705 p
#> 13 sample=1 period=1 cy.. NA NA
## Removing fourier transform artefacts seen in Orbitra data.
## Loading an Orbitrap spectrum with artefacts.
data(fft_spectrum)
plotSpectra(fft_spectrum, xlim = c(264.5, 265.5))
plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6))
fft_spectrum <- filterFourierTransformArtefacts(fft_spectrum)
fft_spectrum
#> MSn data (Spectra) with 1 spectra in a MsBackendDataFrame backend:
#> msLevel rtime scanIndex
#> <integer> <numeric> <integer>
#> 1 1 367.665 195
#> ... 33 more variables/columns.
#> Lazy evaluation queue: 1 processing step(s)
#> Processing:
#> Switch backend from MsBackendMzR to MsBackendDataFrame [Mon Nov 22 14:14:45 2021]
#> Remove fast fourier artefacts. [Fri Sep 13 07:09:45 2024]
plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6))
## Using a few examples peaks in your data you can optimize the parameters
fft_spectrum_filtered <- filterFourierTransformArtefacts(fft_spectrum,
halfWindowSize = 0.2,
threshold = 0.005,
keepIsotopes = TRUE,
maxCharge = 5,
isotopeTolerance = 0.005
)
fft_spectrum_filtered
#> MSn data (Spectra) with 1 spectra in a MsBackendDataFrame backend:
#> msLevel rtime scanIndex
#> <integer> <numeric> <integer>
#> 1 1 367.665 195
#> ... 33 more variables/columns.
#> Lazy evaluation queue: 2 processing step(s)
#> Processing:
#> Switch backend from MsBackendMzR to MsBackendDataFrame [Mon Nov 22 14:14:45 2021]
#> Remove fast fourier artefacts. [Fri Sep 13 07:09:45 2024]
#> Remove fast fourier artefacts. [Fri Sep 13 07:09:45 2024]
length(mz(fft_spectrum_filtered)[[1]])
#> [1] 297
plotSpectra(fft_spectrum_filtered, xlim = c(264.5, 265.5), ylim = c(0, 5e6))
## Using filterRanges to filter spectra object based on variables available
## in `spectraData`.
## First, determine the variable(s) on which to base the filtering:
sv <- c("rtime", "precursorMz", "peaksCount")
## Note that ANY variables can be chosen here, and as many as wanted.
## Define the ranges (pairs of values with lower and upper boundary) to be
## used for the individual spectra variables. The first two values will be
## used for the first spectra variable (e.g., rtime here), the next two for
## the second (e.g. precursorMz here) and so on:
ranges <- c(30, 350, 200,500, 350, 600)
## Input the parameters within the filterRanges function:
filt_spectra <- filterRanges(sciex, spectraVariables = sv,
ranges = ranges)
## Using `filterRanges()` to filter spectra object with multiple ranges for
## the same `spectraVariable` (e.g, here rtime)
sv <- c("rtime", "rtime")
ranges <- c(30, 100, 200, 300)
filt_spectra <- filterRanges(sciex, spectraVariables = sv,
ranges = ranges, match = "any")
## Using filterValues in a similar way to a filter spectra object based on
## variables available in `spectraData`. However, this time not based on
## ranges but similarities to user input single values with given
## tolerance/ppm
## First determine the variable(s) on which to base the filtering:
sv <- c("rtime", "precursorMz")
## Note that ANY variables can be chosen here, and as many as wanted.
## Define the values that will be used to filter the spectra based on their
## similarities to their respective spectraVariables.
## The first values in the parameters values, tolerance and ppm will be
## used for the first spectra variable (e.g. rtime here), the next for the
## second (e.g. precursorMz here) and so on:
values <- c(350, 400)
tolerance <- c(100, 0)
ppm <- c(0,50)
## Input the parameters within the `filterValues()` function:
filt_spectra <- filterValues(sciex, spectraVariables = sv,
values = values, tolerance = tolerance, ppm = ppm)
## ---- DATA MANIPULATIONS AND OTHER OPERATIONS ----
## Set the data to be centroided
centroided(data) <- TRUE
## Replace peak intensities below 40 with 3.
res <- replaceIntensitiesBelow(data, threshold = 40, value = 3)
res
#> MSn data (Spectra) with 2 spectra in a MsBackendMemory backend:
#> msLevel rtime scanIndex
#> <integer> <numeric> <integer>
#> 1 1 1.1 NA
#> 2 2 1.2 NA
#> ... 17 more variables/columns.
#> Lazy evaluation queue: 1 processing step(s)
#> Processing:
#> Signal <= 40 in MS level(s) 1, 2 set to 0 [Fri Sep 13 07:09:45 2024]
## Get the intensities of the first and second spectrum.
intensity(res)[[1]]
#> [1] 200 400 3 3
intensity(res)[[2]]
#> [1] 3 3 3
## Remove all peaks with an intensity below 40.
res <- filterIntensity(res, intensity = c(40, Inf))
## Get the intensities of the first and second spectrum.
intensity(res)[[1]]
#> [1] 200 400
intensity(res)[[2]]
#> numeric(0)
## Lengths of spectra is now different
lengths(mz(res))
#> [1] 2 0
lengths(mz(data))
#> [1] 4 3
## In addition it is possible to pass a function to `filterIntensity()`: in
## the example below we want to keep only peaks that have an intensity which
## is larger than one third of the maximal peak intensity in that spectrum.
keep_peaks <- function(x, prop = 3) {
x > max(x, na.rm = TRUE) / prop
}
res2 <- filterIntensity(data, intensity = keep_peaks)
intensity(res2)[[1L]]
#> [1] 200 400
intensity(data)[[1L]]
#> [1] 200.0 400.0 34.2 17.0
## We can also change the proportion by simply passing the `prop` parameter
## to the function. To keep only peaks that have an intensity which is
## larger than half of the maximum intensity:
res2 <- filterIntensity(data, intensity = keep_peaks, prop = 2)
intensity(res2)[[1L]]
#> intensity
#> 400
intensity(data)[[1L]]
#> [1] 200.0 400.0 34.2 17.0
## Since data manipulation operations are by default not directly applied to
## the data but only added to the internal lazy evaluation queue, it is also
## possible to remove these data manipulations with the `reset()` function:
res_rest <- reset(res)
res_rest
#> MSn data (Spectra) with 2 spectra in a MsBackendMemory backend:
#> msLevel rtime scanIndex
#> <integer> <numeric> <integer>
#> 1 1 1.1 NA
#> 2 2 1.2 NA
#> ... 17 more variables/columns.
#> Processing:
#> Signal <= 40 in MS level(s) 1, 2 set to 0 [Fri Sep 13 07:09:45 2024]
#> Remove peaks with intensities outside [40, Inf] in spectra of MS level(s) 1, 2. [Fri Sep 13 07:09:45 2024]
#> Reset object. [Fri Sep 13 07:09:45 2024]
lengths(mz(res_rest))
#> [1] 4 3
lengths(mz(res))
#> [1] 2 0
lengths(mz(data))
#> [1] 4 3
## `reset()` after a `applyProcessing()` can not restore the data, because
## the data in the backend was changed. Similarly, `reset()` after any
## filter operations can not restore data for a `Spectra` with a
## `MsBackendMemory` or `MsBackendDataFrame`.
res_2 <- applyProcessing(res)
res_rest <- reset(res_2)
lengths(mz(res))
#> [1] 2 0
lengths(mz(res_rest))
#> [1] 2 0
## Compare spectra: comparing spectra 2 and 3 against spectra 10:20 using
## the normalized dotproduct method.
res <- compareSpectra(sciex_im[2:3], sciex_im[10:20])
## first row contains comparisons of spectrum 2 with spectra 10 to 20 and
## the second row comparisons of spectrum 3 with spectra 10 to 20
res
#> 10 11 12 13 14 15 16
#> 2 0.8583577 0.8603472 0.8588292 0.8434391 0.8581683 0.8515891 0.8568266
#> 3 0.8620081 0.8609868 0.8604669 0.8481934 0.8637094 0.8586378 0.8618227
#> 17 18 19 20
#> 2 0.8563933 0.8559511 0.8546560 0.8538058
#> 3 0.8559523 0.8652169 0.8585325 0.8639445
## To use a simple Pearson correlation instead we can define a function
## that takes the two peak matrices and calculates the correlation for
## their second columns (containing the intensity values).
correlateSpectra <- function(x, y, use = "pairwise.complete.obs", ...) {
cor(x[, 2], y[, 2], use = use)
}
res <- compareSpectra(sciex_im[2:3], sciex_im[10:20],
FUN = correlateSpectra)
res
#> 10 11 12 13 14 15 16
#> 2 0.9974395 0.9989580 0.9973735 0.9974334 0.9987225 0.9992307 0.9978621
#> 3 0.9986971 0.9976861 0.9964951 0.9965942 0.9982105 0.9984607 0.9982808
#> 17 18 19 20
#> 2 0.9950464 0.9987036 0.9988265 0.9970657
#> 3 0.9921029 0.9990331 0.9975400 0.9955439
## Use compareSpectra to determine the number of common (matching) peaks
## with a ppm of 10:
## type = "inner" uses a *inner join* to match peaks, i.e. keeps only
## peaks that can be mapped betwen both spectra. The provided FUN returns
## simply the number of matching peaks.
compareSpectra(sciex_im[2:3], sciex_im[10:20], ppm = 10, type = "inner",
FUN = function(x, y, ...) nrow(x))
#> 10 11 12 13 14 15 16 17 18 19 20
#> 2 530 501 484 576 592 515 549 578 539 542 550
#> 3 526 548 524 592 639 502 589 571 575 564 613
## Apply an arbitrary function to each spectrum in a Spectra.
## In the example below we calculate the mean intensity for each spectrum
## in a subset of the sciex_im data. Note that we can access all variables
## of each individual spectrum either with the `$` operator or the
## corresponding method.
res <- spectrapply(sciex_im[1:20], FUN = function(x) mean(x$intensity[[1]]))
head(res)
#> $`1`
#> [1] 1553.953
#>
#> $`2`
#> [1] 678.2289
#>
#> $`3`
#> [1] 684.3569
#>
#> $`4`
#> [1] 682.1004
#>
#> $`5`
#> [1] 780.6867
#>
#> $`6`
#> [1] 737.5087
#>
## It is however important to note that dedicated methods to access the
## data (such as `intensity`) are much more efficient than using `lapply()`:
res <- lapply(intensity(sciex_im[1:20]), mean)
head(res)
#> [[1]]
#> [1] 1553.953
#>
#> [[2]]
#> [1] 678.2289
#>
#> [[3]]
#> [1] 684.3569
#>
#> [[4]]
#> [1] 682.1004
#>
#> [[5]]
#> [1] 780.6867
#>
#> [[6]]
#> [1] 737.5087
#>
## As an alternative, applying a function `FUN` to a `Spectra` can be
## performed *chunk-wise*. The advantage of this is, that only the data for
## one chunk at a time needs to be loaded into memory reducing the memory
## demand. This type of processing can be performed by specifying the size
## of the chunks (i.e. number of spectra per chunk) with the `chunkSize`
## parameter
spectrapply(sciex_im[1:20], lengths, chunkSize = 5L)
#> [1] 578 1529 1600 1664 1417 1602 1468 1440 1496 1431 1489 1349 1650 1697 1413
#> [16] 1593 1560 1477 1536 1491
## ---- DATA EXPORT ----
## Some `MsBackend` classes provide an `export()` method to export the data
## to the file format supported by the backend.
## The `MsBackendMzR` for example allows to export MS data to mzML or
## mzXML file(s), the `MsBackendMgf` (defined in the MsBackendMgf R package)
## would allow to export the data in mgf file format.
## Below we export the MS data in `data`. We call the `export()` method on
## this object, specify the backend that should be used to export the data
## (and which also defines the output format) and provide a file name.
fl <- tempfile()
export(data, MsBackendMzR(), file = fl)
## This exported our data in mzML format. Below we read the first 6 lines
## from that file.
readLines(fl, n = 6)
#> [1] "<?xml version=\"1.0\" encoding=\"utf-8\"?>"
#> [2] "<indexedmzML xmlns=\"http://psi.hupo.org/ms/mzml\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.2_idx.xsd\">"
#> [3] " <mzML xmlns=\"http://psi.hupo.org/ms/mzml\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd\" id=\"\" version=\"1.1.0\">"
#> [4] " <cvList count=\"2\">"
#> [5] " <cv id=\"MS\" fullName=\"Proteomics Standards Initiative Mass Spectrometry Ontology\" version=\"4.1.117\" URI=\"https://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo\"/>"
#> [6] " <cv id=\"UO\" fullName=\"Unit Ontology\" version=\"09:04:2014\" URI=\"https://raw.githubusercontent.com/bio-ontology-research-group/unit-ontology/master/unit.obo\"/>"
## If only a single file name is provided, all spectra are exported to that
## file. To export data with the `MsBackendMzR` backend to different files, a
## file name for each individual spectrum has to be provided.
## Below we export each spectrum to its own file.
fls <- c(tempfile(), tempfile())
export(data, MsBackendMzR(), file = fls)
## Reading the data from the first file
res <- Spectra(backendInitialize(MsBackendMzR(), fls[1]))
mz(res)
#> NumericList of length 1
#> [[1]] 100 103.2 104.3 106.5
mz(data)
#> NumericList of length 2
#> [[1]] 100 103.2 104.3 106.5
#> [[2]] 45.6 120.4 190.2
## ---- PEAKS VARIABLES AND DATA ----
## Some `MsBackend` classes provide support for arbitrary peaks variables
## (in addition to the mandatory `"mz"` and `"intensity"` values. Below
## we create a simple data frame with an additional peak variable `"pk_ann"`
## and create a `Spectra` with a `MsBackendMemory` for that data.
## Importantly the number of values (per spectrum) need to be the same
## for all peak variables.
tmp <- data.frame(msLevel = c(2L, 2L), rtime = c(123.2, 123.5))
tmp$mz <- list(c(103.1, 110.4, 303.1), c(343.2, 453.1))
tmp$intensity <- list(c(130.1, 543.1, 40), c(0.9, 0.45))
tmp$pk_ann <- list(c(NA_character_, "A", "P"), c("B", "P"))
## Create the Spectra. With parameter `peaksVariables` we can define
## the columns in `tmp` that contain peaks variables.
sps <- Spectra(tmp, source = MsBackendMemory(),
peaksVariables = c("mz", "intensity", "pk_ann"))
peaksVariables(sps)
#> [1] "mz" "intensity" "pk_ann"
## Extract just the m/z and intensity values
peaksData(sps)[[1L]]
#> mz intensity
#> [1,] 103.1 130.1
#> [2,] 110.4 543.1
#> [3,] 303.1 40.0
## Extract the full peaks data
peaksData(sps, columns = peaksVariables(sps))[[1L]]
#> mz intensity pk_ann
#> 1 103.1 130.1 <NA>
#> 2 110.4 543.1 A
#> 3 303.1 40.0 P
## Access just the pk_ann variable
sps$pk_ann
#> [[1]]
#> [1] NA "A" "P"
#>
#> [[2]]
#> [1] "B" "P"
#>
## Convert a subset of the Spectra object to a long DataFrame.
asDataFrame(sciex, i = 1:3, spectraVars = c("rtime", "msLevel"))
#> DataFrame with 3707 rows and 4 columns
#> mz intensity rtime msLevel
#> <numeric> <numeric> <numeric> <integer>
#> 1 105.043 0 0.28 1
#> 2 105.045 412 0.28 1
#> 3 105.046 0 0.28 1
#> 4 107.055 0 0.28 1
#> 5 107.057 412 0.28 1
#> ... ... ... ... ...
#> 3703 133.984 0 0.838 1
#> 3704 133.985 132 0.838 1
#> 3705 133.987 0 0.838 1
#> 3706 133.989 132 0.838 1
#> 3707 133.990 0 0.838 1