Skip to contents

The Spectra class encapsules spectral mass spectrometry data and related metadata.

It supports multiple data backends, e.g. in-memory (MsBackendMemory, MsBackendDataFrame()), on-disk as mzML (MsBackendMzR()) or HDF5 (MsBackendHdf5Peaks()).

Usage

applyProcessing(
  object,
  f = processingChunkFactor(object),
  BPPARAM = bpparam(),
  ...
)

concatenateSpectra(x, ...)

combineSpectra(
  x,
  f = x$dataStorage,
  p = x$dataStorage,
  FUN = combinePeaksData,
  ...,
  BPPARAM = bpparam()
)

joinSpectraData(x, y, by.x = "spectrumId", by.y, suffix.y = ".y")

processingLog(x)

deisotopeSpectra(
  x,
  substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL"),
  tolerance = 0,
  ppm = 20,
  charge = 1
)

reduceSpectra(x, tolerance = 0, ppm = 20)

filterPrecursorMaxIntensity(x, tolerance = 0, ppm = 20)

filterPrecursorIsotopes(
  x,
  tolerance = 0,
  ppm = 20,
  substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL")
)

scalePeaks(x, by = sum, msLevel. = uniqueMsLevels(x))

filterPrecursorPeaks(
  object,
  tolerance = 0,
  ppm = 20,
  mz = c("==", ">="),
  msLevel. = uniqueMsLevels(object)
)

# S4 method for missing
Spectra(
  object,
  processingQueue = list(),
  metadata = list(),
  ...,
  backend = MsBackendMemory(),
  BPPARAM = bpparam()
)

# S4 method for MsBackend
Spectra(
  object,
  processingQueue = list(),
  metadata = list(),
  ...,
  BPPARAM = bpparam()
)

# S4 method for character
Spectra(
  object,
  processingQueue = list(),
  metadata = list(),
  source = MsBackendMzR(),
  backend = source,
  ...,
  BPPARAM = bpparam()
)

# S4 method for ANY
Spectra(
  object,
  processingQueue = list(),
  metadata = list(),
  source = MsBackendMemory(),
  backend = source,
  ...,
  BPPARAM = bpparam()
)

# S4 method for Spectra,MsBackend
setBackend(
  object,
  backend,
  f = processingChunkFactor(object),
  ...,
  BPPARAM = bpparam()
)

# S4 method for Spectra
c(x, ...)

# S4 method for Spectra,ANY
split(x, f, drop = FALSE, ...)

# S4 method for Spectra
export(object, backend, ...)

# S4 method for Spectra
acquisitionNum(object)

# S4 method for Spectra
peaksData(
  object,
  columns = c("mz", "intensity"),
  f = processingChunkFactor(object),
  ...,
  BPPARAM = bpparam()
)

# S4 method for Spectra
peaksVariables(object)

# S4 method for Spectra
centroided(object)

# S4 method for Spectra
centroided(object) <- value

# S4 method for Spectra
collisionEnergy(object)

# S4 method for Spectra
collisionEnergy(object) <- value

# S4 method for Spectra
dataOrigin(object)

# S4 method for Spectra
dataOrigin(object) <- value

# S4 method for Spectra
dataStorage(object)

# S4 method for Spectra
dropNaSpectraVariables(object)

# S4 method for Spectra
intensity(object, f = processingChunkFactor(object), ...)

# S4 method for Spectra
ionCount(object)

# S4 method for Spectra
isCentroided(object, ...)

# S4 method for Spectra
isEmpty(x)

# S4 method for Spectra
isolationWindowLowerMz(object)

# S4 method for Spectra
isolationWindowLowerMz(object) <- value

# S4 method for Spectra
isolationWindowTargetMz(object)

# S4 method for Spectra
isolationWindowTargetMz(object) <- value

# S4 method for Spectra
isolationWindowUpperMz(object)

# S4 method for Spectra
isolationWindowUpperMz(object) <- value

# S4 method for Spectra
containsMz(
  object,
  mz = numeric(),
  tolerance = 0,
  ppm = 20,
  which = c("any", "all"),
  BPPARAM = bpparam()
)

# S4 method for Spectra
containsNeutralLoss(
  object,
  neutralLoss = 0,
  tolerance = 0,
  ppm = 20,
  BPPARAM = bpparam()
)

# S4 method for Spectra
spectrapply(
  object,
  FUN,
  ...,
  chunkSize = integer(),
  f = factor(),
  BPPARAM = SerialParam()
)

# S4 method for Spectra
length(x)

# S4 method for Spectra
msLevel(object)

# S4 method for Spectra
mz(object, f = processingChunkFactor(object), ...)

# S4 method for Spectra
lengths(x, use.names = FALSE)

# S4 method for Spectra
polarity(object)

# S4 method for Spectra
polarity(object) <- value

# S4 method for Spectra
precScanNum(object)

# S4 method for Spectra
precursorCharge(object)

# S4 method for Spectra
precursorIntensity(object)

# S4 method for Spectra
precursorMz(object)

# S4 method for Spectra
rtime(object)

# S4 method for Spectra
rtime(object) <- value

# S4 method for Spectra
scanIndex(object)

# S4 method for Spectra
selectSpectraVariables(
  object,
  spectraVariables = union(spectraVariables(object), peaksVariables(object))
)

# S4 method for Spectra
smoothed(object)

# S4 method for Spectra
smoothed(object) <- value

# S4 method for Spectra
spectraData(object, columns = spectraVariables(object))

# S4 method for Spectra
spectraData(object) <- value

# S4 method for Spectra
spectraNames(object)

# S4 method for Spectra
spectraNames(object) <- value

# S4 method for Spectra
spectraVariables(object)

# S4 method for Spectra
tic(object, initial = TRUE)

# S4 method for Spectra
$(x, name)

# S4 method for Spectra
$(x, name) <- value

# S4 method for Spectra
[[(x, i, j, ...)

# S4 method for Spectra
[[(x, i, j, ...) <- value

# S4 method for Spectra
[(x, i, j, ..., drop = FALSE)

# S4 method for Spectra
filterAcquisitionNum(
  object,
  n = integer(),
  dataStorage = character(),
  dataOrigin = character()
)

# S4 method for Spectra
filterEmptySpectra(object)

# S4 method for Spectra
filterDataOrigin(object, dataOrigin = character())

# S4 method for Spectra
filterDataStorage(object, dataStorage = character())

# S4 method for Spectra
filterFourierTransformArtefacts(
  object,
  halfWindowSize = 0.05,
  threshold = 0.2,
  keepIsotopes = TRUE,
  maxCharge = 5,
  isotopeTolerance = 0.005
)

# S4 method for Spectra
filterIntensity(
  object,
  intensity = c(0, Inf),
  msLevel. = uniqueMsLevels(object),
  ...
)

# S4 method for Spectra
filterIsolationWindow(object, mz = numeric())

# S4 method for Spectra
filterMsLevel(object, msLevel. = integer())

# S4 method for Spectra
filterMzRange(
  object,
  mz = numeric(),
  msLevel. = uniqueMsLevels(object),
  keep = TRUE
)

# S4 method for Spectra
filterMzValues(
  object,
  mz = numeric(),
  tolerance = 0,
  ppm = 20,
  msLevel. = uniqueMsLevels(object),
  keep = TRUE
)

# S4 method for Spectra
filterPolarity(object, polarity = integer())

# S4 method for Spectra
filterPrecursorMz(object, mz = numeric())

# S4 method for Spectra
filterPrecursorMzRange(object, mz = numeric())

# S4 method for Spectra
filterPrecursorMzValues(object, mz = numeric(), ppm = 20, tolerance = 0)

# S4 method for Spectra
filterPrecursorCharge(object, z = integer())

# S4 method for Spectra
filterPrecursorScan(object, acquisitionNum = integer(), f = dataOrigin(object))

# S4 method for Spectra
filterRt(object, rt = numeric(), msLevel. = uniqueMsLevels(object))

# S4 method for Spectra
reset(object, ...)

# S4 method for Spectra
filterRanges(
  object,
  spectraVariables = character(),
  ranges = numeric(),
  match = c("all", "any")
)

# S4 method for Spectra
filterValues(
  object,
  spectraVariables = character(),
  values = numeric(),
  ppm = 0,
  tolerance = 0,
  match = c("all", "any")
)

# S4 method for Spectra
bin(
  x,
  binSize = 1L,
  breaks = NULL,
  msLevel. = uniqueMsLevels(x),
  FUN = sum,
  zero.rm = TRUE
)

# S4 method for Spectra,Spectra
compareSpectra(
  x,
  y,
  MAPFUN = joinPeaks,
  tolerance = 0,
  ppm = 20,
  FUN = ndotproduct,
  ...,
  SIMPLIFY = TRUE
)

# S4 method for Spectra,missing
compareSpectra(
  x,
  y = NULL,
  MAPFUN = joinPeaks,
  tolerance = 0,
  ppm = 20,
  FUN = ndotproduct,
  ...,
  SIMPLIFY = TRUE
)

# S4 method for Spectra
pickPeaks(
  object,
  halfWindowSize = 2L,
  method = c("MAD", "SuperSmoother"),
  snr = 0,
  k = 0L,
  descending = FALSE,
  threshold = 0,
  msLevel. = uniqueMsLevels(object),
  ...
)

# S4 method for Spectra
replaceIntensitiesBelow(
  object,
  threshold = min,
  value = 0,
  msLevel. = uniqueMsLevels(object)
)

# S4 method for Spectra
smooth(
  x,
  halfWindowSize = 2L,
  method = c("MovingAverage", "WeightedMovingAverage", "SavitzkyGolay"),
  msLevel. = uniqueMsLevels(x),
  ...
)

# S4 method for Spectra
addProcessing(object, FUN, ..., spectraVariables = character())

coreSpectraVariables()

# S4 method for Spectra
uniqueMsLevels(object, ...)

# S4 method for Spectra
backendBpparam(object, BPPARAM = bpparam())

# S4 method for Spectra
combinePeaks(
  object,
  tolerance = 0,
  ppm = 20,
  intensityFun = base::mean,
  mzFun = base::mean,
  weighted = TRUE,
  msLevel. = uniqueMsLevels(object),
  ...
)

# S4 method for Spectra
entropy(object, normalized = TRUE)

# S4 method for ANY
entropy(object, ...)

Arguments

object

For Spectra(): either a DataFrame or missing. See section on creation of Spectra objects for details. For all other methods a Spectra object.

f

For split(): factor defining how to split x. See base::split() for details. For setBackend(): factor defining how to split the data for parallelized copying of the spectra data to the new backend. For some backends changing this parameter can lead to errors. For combineSpectra(): factor defining the grouping of the spectra that should be combined. For spectrapply(): factor how object should be splitted. For filterPrecursorScan(): defining which spectra belong to the same original data file (sample): Defaults to f = dataOrigin(x). For intensity(), mz() and peaksData(): factor defining how data should be chunk-wise loaded an processed. Defaults to processingChunkFactor().

BPPARAM

Parallel setup configuration. See bpparam() for more information. This is passed directly to the backendInitialize() method of the MsBackend.

...

Additional arguments.

x

A Spectra object.

p

For combineSpectra(): factor defining how to split the input Spectra for parallel processing. Defaults to x$dataStorage, i.e., depending on the used backend, per-file parallel processing will be performed.

FUN

For addProcessing(): function to be applied to the peak matrix of each spectrum in object. For compareSpectra(): function to compare intensities of peaks between two spectra with each other. For combineSpectra(): function to combine the (peak matrices) of the spectra. See section Data manipulations and examples below for more details. For bin(): function to aggregate intensity values of peaks falling into the same bin. Defaults to FUN = sum thus summing up intensities. For spectrapply() and chunkapply(): function to be applied to Spectra.

y

A Spectra object. A DataFrame for joinSpectraData().

by.x

A character(1) specifying the spectra variable used for merging. Default is "spectrumId".

by.y

A character(1) specifying the column used for merging. Set to by.x if missing.

suffix.y

A character(1) specifying the suffix to be used for making the names of columns in the merged spectra variables unique. This suffix will be used to amend names(y), while spectraVariables(x) will remain unchanged.

substDefinition

For deisotopeSpectra() and filterPrecursorIsotopes(): matrix or data.frame with definitions of isotopic substitutions. Uses by default isotopic substitutions defined from all compounds in the Human Metabolome Database (HMDB). See isotopologues() or isotopicSubstitutionMatrix() for details.

tolerance

For compareSpectra(), containsMz(), deisotopeSpectra(), filterMzValues() and reduceSpectra(): numeric(1) allowing to define a constant maximal accepted difference between m/z values for peaks to be matched (or grouped). For containsMz() it can also be of length equal mz to specify a different tolerance for each m/z value. For filterPrecursorMaxIntensity(): numeric(1) defining the (constant) maximal accepted difference of precursor m/z values of spectra for grouping them into precursor groups. For filterPrecursorIsotopes(): passed directly to the isotopologues() function. For filterValues(): numeric of any length allowing to define a maximal accepted difference between user input values and the spectraVariables values. If it is not equal to the length of the value provided with parameter spectraVariables, tolerance[1] will be recycled. Default is tolerance = 0

ppm

For compareSpectra(), containsMz(), deisotopeSpectra(), filterMzValues() and reduceSpectra(): numeric(1) defining a relative, m/z-dependent, maximal accepted difference between m/z values for peaks to be matched (or grouped). For filterPrecursorMaxIntensity(): numeric(1) defining the relative maximal accepted difference of precursor m/z values of spectra for grouping them into precursor groups. For filterPrecursorIsotopes(): passed directly to the isotopologues() function. For filterValues(): numeric of any length allowing to define a maximal accepted difference between user input values and the spectraVariables values. If it is not equal to the length of the value provided with parameter spectraVariables, ppm[1] will be recycled.

charge

For deisotopeSpectra(): expected charge of the ionized compounds. See isotopologues() for details.

by

For scalePeaks(): function to calculate a single numeric from intensity values of a spectrum by which all intensities (of that spectrum) should be divided by. The default by = sum will divide intensities of each spectrum by the sum of intensities of that spectrum.

msLevel.

integer defining the MS level(s) of the spectra to which the function should be applied (defaults to all MS levels of object. For filterMsLevel(): the MS level to which object should be subsetted.

mz

For filterIsolationWindow(): numeric(1) with the m/z value to filter the object. For filterPrecursorMz() and filterMzRange(): numeric(2) defining the lower and upper m/z boundary. For filterMzValues() and filterPrecursorMzValues(): numeric with the m/z values to match peaks or precursor m/z against.

processingQueue

For Spectra(): optional list of ProcessingStep objects.

metadata

For Spectra(): optional list with metadata information.

backend

For Spectra(): MsBackend to be used as backend. See section on creation of Spectra objects for details. For setBackend(): instance of MsBackend that supports setBackend() (i.e. for which supportsSetBackend() returns TRUE). Such backends have a parameter data in their backendInitialize() function that support passing the full spectra data to the initialize method. See section on creation of Spectra objects for details. For export(): MsBackend to be used to export the data.

source

For Spectra(): instance of MsBackend that can be used to import spectrum data from the provided files. See section Creation of objects, conversion and changing the backend for more details.

drop

For [, split(): not considered.

columns

For spectraData() accessor: optional character with column names (spectra variables) that should be included in the returned DataFrame. By default, all columns are returned. For peaksData() accessor: optional character with requested columns in the individual matrix of the returned list. Defaults to c("mz", "value") but any values returned by peaksVariables(object) with object being the Spectra object are supported.

value

replacement value for <- methods. See individual method description or expected data type.

which

for containsMz(): either "any" or "all" defining whether any (the default) or all provided mz have to be present in the spectrum.

neutralLoss

for containsNeutralLoss(): numeric(1) defining the value which should be subtracted from the spectrum's precursor m/z.

chunkSize

For spectrapply(): size of the chunks into which Spectra should be split. This parameter overrides parameters f and BPPARAM.

use.names

For lengths(): ignored.

spectraVariables
  • For selectSpectraVariables(): character with the names of the spectra variables to which the backend should be subsetted.

    • For addProcessing(): character with additional spectra variables that should be passed along to the function defined with FUN. See function description for details.

    • For filterRanges() and filterValues(): character vector specifying the column(s) from spectraData(object) on which to filter the data and that correspond to the the names of the spectra variables that should be used for the filtering.

initial

For tic(): logical(1) whether the initially reported total ion current should be reported, or whether the total ion current should be (re)calculated on the actual data (initial = FALSE, same as ionCount()).

name

For $ and $<-: the name of the spectra variable to return or set.

i

For [: integer, logical or character to subset the object.

j

For [: not supported.

n

for filterAcquisitionNum(): integer with the acquisition numbers to filter for.

dataStorage

For filterDataStorage(): character to define which spectra to keep. For filterAcquisitionNum(): optionally specify if filtering should occur only for spectra of selected dataStorage.

dataOrigin

For filterDataOrigin(): character to define which spectra to keep. For filterAcquisitionNum(): optionally specify if filtering should occurr only for spectra of selected dataOrigin.

halfWindowSize
  • For pickPeaks(): integer(1), used in the identification of the mass peaks: a local maximum has to be the maximum in the window from (i - halfWindowSize):(i + halfWindowSize).

    • For smooth(): integer(1), used in the smoothing algorithm, the window reaches from (i - halfWindowSize):(i + halfWindowSize).

    • For filterFourierTransformArtefacts(): numeric(1) defining the m/z window left and right of a peak where to remove fourier transform artefacts.

threshold
  • For pickPeaks(): a double(1) defining the proportion of the maximal peak intensity. Just values above are used for the weighted mean calculation.

    • For replaceIntensitiesBelow(): a numeric(1) defining the threshold or a function to calculate the threshold for each spectrum on its intensity values. Defaults to threshold = min.

    • For filterFourierTransformArtefacts(): the relative intensity (to a peak) below which peaks are considered fourier artefacts. Defaults to threshold = 0.2 hence removing peaks that have an intensity below 0.2 times the intensity of the tested peak (within the selected halfWindowSize).

keepIsotopes

For filterFourierTransformArtefacts(): whether isotope peaks should not be removed as fourier artefacts.

maxCharge

For filterFourierTransformArtefacts(): the maximum charge to be considered for isotopes.

isotopeTolerance

For filterFourierTransformArtefacts(): the m/z tolerance to be used to define whether peaks might be isotopes of the current tested peak.

intensity

For filterIntensity(): numeric of length 1 or 2 defining either the lower or the lower and upper intensity limit for the filtering, or a function that takes the intensities as input and returns a logical (same length then peaks in the spectrum) whether the peak should be retained or not. Defaults to intensity = c(0, Inf) thus only peaks with NA intensity are removed.

keep

For filterMzValues() and filterMzRange(): logical(1) whether the matching peaks should be retained (keep = TRUE, the default) or dropped (keep = FALSE).

polarity

for filterPolarity(): integer specifying the polarity to to subset object.

z

For filterPrecursorCharge(): integer() with the precursor charges to be used as filter.

acquisitionNum

for filterPrecursorScan(): integer with the acquisition number of the spectra to which the object should be subsetted.

rt

for filterRt(): numeric(2) defining the retention time range to be used to subset/filter object.

ranges

for filterRanges(): A numeric vector of paired values (upper and lower boundary) that define the ranges to filter the object. These paired values need to be in the same order as the spectraVariables parameter (see below).

match

For filterRanges() and filterValues(): character(1) defining whether the condition has to match for all provided ranges/values (match = "all"; the default), or for any of them (match = "any") for spectra to be retained.

values

for filterValues(): A numeric vector that define the values to filter the Spectra data. These values need to be in the same order as the spectraVariables parameter.

binSize

For bin(): numeric(1) defining the size for the m/z bins. Defaults to binSize = 1.

breaks

For bin(): numeric defining the m/z breakpoints between bins.

zero.rm

logical. For bin(): indicating whether to remove bins with zero intensity. Defaults to TRUE, meaning the function will discard bins created with an intensity of 0 to enhance memory efficiency.

MAPFUN

For compareSpectra(): function to map/match peaks between the two compared spectra. See joinPeaks() for more information and possible functions.

SIMPLIFY

For compareSpectra() whether the result matrix should be simplified to a numeric if possible (i.e. if either x or y is of length 1).

method
  • For pickPeaks(): character(1), the noise estimators that should be used, currently the the Median Absolute Deviation (method = "MAD") and Friedman's Super Smoother (method = "SuperSmoother") are supported.

    • For smooth(): character(1), the smoothing function that should be used, currently, the Moving-Average- (method = "MovingAverage"), Weighted-Moving-Average- (method = "WeightedMovingAverage"), Savitzky-Golay-Smoothing (method = "SavitzkyGolay") are supported.

snr

For pickPeaks(): double(1) defining the Signal-to-Noise-Ratio. The intensity of a local maximum has to be higher than snr * noise to be considered as peak.

k

For pickPeaks(): integer(1), number of values left and right of the peak that should be considered in the weighted mean calculation.

descending

For pickPeaks(): logical, if TRUE just values between the nearest valleys around the peak centroids are used.

intensityFun

For combinePeaks(): function to be used to aggregate intensities for all peaks in each peak group into a single intensity value.

mzFun

For combinePeaks(): function to aggregate m/z values for all peaks within each peak group into a single m/z value. This parameter is ignored if weighted = TRUE (the default).

weighted

For combinePeaks(): logical(1) whether m/z values of peaks within each peak group should be aggregated into a single m/z value using an intensity-weighted mean. Defaults to weighted = TRUE.

normalized

for entropy(): logical(1) whether the normalized entropy should be calculated (default). See also nentropy() for details.

Value

See individual method description for the return value.

Details

The Spectra class uses by default a lazy data manipulation strategy, i.e. data manipulations such as performed with replaceIntensitiesBelow() are not applied immediately to the data, but applied on-the-fly to the spectrum data once it is retrieved. For some backends that allow to write data back to the data storage (such as the MsBackendMemory(), MsBackendDataFrame() and MsBackendHdf5Peaks()) it is possible to apply to queue with the applyProcessing function. See the *Data manipulation and analysis methods section below for more details.

For more information on parallel or chunk-wise processing (especially helpful for very large data sets) see processingChunkSize().

To apply arbitrary functions to a Spectra use the spectrapply() function (or directly chunkapply() for chunk-wise processing). See description of the spectrapply() function below for details.

For details on plotting spectra, see plotSpectra().

Clarifications regarding scan/acquisition numbers and indices:

  • A spectrumId (or spectrumID) is a vendor specific field in the mzML file that contains some information about the run/spectrum, e.g.: controllerType=0 controllerNumber=1 scan=5281 file=2

  • acquisitionNum is a more a less sanitize spectrum id generated from the spectrumId field by mzR (see here).

  • scanIndex is the mzR generated sequence number of the spectrum in the raw file (which doesn't have to be the same as the acquisitionNum)

See also this issue.

Creation of objects, conversion, changing the backend and export

Spectra classes can be created with the Spectra() constructor function which supports the following formats:

  • parameter object is a data.frame or DataFrame containing the spectrum data. The provided backend (by default a MsBackendMemory) will be initialized with that data.

  • parameter object is a MsBackend (assumed to be already initialized).

  • parameter object is missing, in which case it is supposed that the data is provided by the MsBackend class passed along with the backend argument.

  • parameter object is of type character and is expected to be the file names(s) from which spectra should be imported. Parameter source allows to define a MsBackend that is able to import the data from the provided source files. The default value for source is MsBackendMzR() which allows to import spectra data from mzML, mzXML or CDF files.

With ... additional arguments can be passed to the backend's backendInitialize() method. Parameter backend allows to specify which MsBackend should be used for data storage.

The backend of a Spectra object can be changed with the setBackend() method that takes an instance of the new backend as second parameter backend. A call to setBackend(sps, backend = MsBackendDataFrame()) would for example change the backend of sps to the in-memory MsBackendDataFrame. Changing to a backend is only supported if that backend has a data parameter in its backendInitialize() method and if supportsSetBackend() returns TRUE for that backend. setBackend() will transfer the full spectra data from the originating backend as a DataFrame to the new backend. Most read-only backends do not support setBackend(). It is for example not possible to change the backend to a read-only backend (such as the MsBackendMzR() backend).

The definition of the function is: setBackend(object, backend, ..., f = dataStorage(object), BPPARAM = bpparam()) and its parameters are:

  • parameter object: the Spectra object.

  • parameter backend: an instance of the new backend, e.g. [MsBackendMemory()].

  • parameter f: factor allowing to parallelize the change of the backends. By default the process of copying the spectra data from the original to the new backend is performed separately (and in parallel) for each file. Users are advised to use the default setting.

  • parameter ...: optional additional arguments passed to the backendInitialize() method of the new backend.

  • parameter BPPARAM: setup for the parallel processing. See bpparam() for details.

Data from a Spectra object can be exported to a file with the export() function. The actual export of the data has to be performed by the export method of the MsBackend class defined with the mandatory parameter backend. Note however that not all backend classes support export of data. From the MsBackend classes in the Spectra package currently only the MsBackendMzR backend supports data export (to mzML/mzXML file(s)); see the help page of the MsBackend for information on its arguments or the examples below or the vignette for examples.

The definition of the function is export(object, backend, ...) and its parameters are:

  • object: the Spectra object to be exported.

  • backend: instance of a class extending MsBackend which supports export of the data (i.e. which has a defined export method).

  • ...: additional parameters specific for the MsBackend passed with parameter backend.

Accessing spectra data

  • $, $<-: gets (or sets) a spectra variable for all spectra in object. See examples for details. Note that replacing values of a peaks variable is not supported with a non-empty processing queue, i.e. if any filtering or data manipulations on the peaks data was performed. In these cases applyProcessing() needs to be called first to apply all cached data operations.

  • [[, [[<-: access or set/add a single spectrum variable (column) in the backend.

  • acquisitionNum(): returns the acquisition number of each spectrum. Returns an integer of length equal to the number of spectra (with NA_integer_ if not available).

  • centroided(), centroided<-: gets or sets the centroiding information of the spectra. centroided() returns a logical vector of length equal to the number of spectra with TRUE if a spectrum is centroided, FALSE if it is in profile mode and NA if it is undefined. See also isCentroided() for estimating from the spectrum data whether the spectrum is centroided. value for centroided<- is either a single logical or a logical of length equal to the number of spectra in object.

  • collisionEnergy(), collisionEnergy<-: gets or sets the collision energy for all spectra in object. collisionEnergy() returns a numeric with length equal to the number of spectra (NA_real_ if not present/defined), collisionEnergy<- takes a numeric of length equal to the number of spectra in object.

  • coreSpectraVariables(): returns the core spectra variables along with their expected data type.

  • dataOrigin(), dataOrigin<-: gets or sets the data origin for each spectrum. dataOrigin() returns a character vector (same length than object) with the origin of the spectra. dataOrigin<- expects a character vector (same length than object) with the replacement values for the data origin of each spectrum.

  • dataStorage(): returns a character vector (same length than object) with the data storage location of each spectrum.

  • intensity(): gets the intensity values from the spectra. Returns a NumericList() of numeric vectors (intensity values for each spectrum). The length of the list is equal to the number of spectra in object.

  • ionCount(): returns a numeric with the sum of intensities for each spectrum. If the spectrum is empty (see isEmpty()), NA_real_ is returned.

  • isCentroided(): a heuristic approach assessing if the spectra in object are in profile or centroided mode. The function takes the qtlth quantile top peaks, then calculates the difference between adjacent m/z value and returns TRUE if the first quartile is greater than k. (See Spectra:::.isCentroided() for the code.)

  • isEmpty(): checks whether a spectrum in object is empty (i.e. does not contain any peaks). Returns a logical vector of length equal number of spectra.

  • isolationWindowLowerMz(), isolationWindowLowerMz<-: gets or sets the lower m/z boundary of the isolation window.

  • isolationWindowTargetMz(), isolationWindowTargetMz<-: gets or sets the target m/z of the isolation window.

  • isolationWindowUpperMz(), isolationWindowUpperMz<-: gets or sets the upper m/z boundary of the isolation window.

  • containsMz(): checks for each of the spectra whether they contain mass peaks with an m/z equal to mz (given acceptable difference as defined by parameters tolerance and ppm - see common() for details). Parameter which allows to define whether any (which = "any", the default) or all (which = "all") of the mz have to match. The function returns NA if mz is of length 0 or is NA.

  • containsNeutralLoss(): checks for each spectrum in object if it has a peak with an m/z value equal to its precursor m/z - neutralLoss (given acceptable difference as defined by parameters tolerance and ppm). Returns NA for MS1 spectra (or spectra without a precursor m/z).

  • length(): gets the number of spectra in the object.

  • lengths(): gets the number of peaks (m/z-intensity values) per spectrum. Returns an integer vector (length equal to the number of spectra). For empty spectra, 0 is returned.

  • msLevel(): gets the spectra's MS level. Returns an integer vector (names being spectrum names, length equal to the number of spectra) with the MS level for each spectrum.

  • mz(): gets the mass-to-charge ratios (m/z) from the spectra. Returns a NumericList() or length equal to the number of spectra, each element a numeric vector with the m/z values of one spectrum.

  • peaksData(): gets the peaks data for all spectra in object. Peaks data consist of the m/z and intensity values as well as possible additional annotations (variables) of all peaks of each spectrum. The function returns a SimpleList() of two dimensional arrays (either matrix or data.frame), with each array providing the values for the requested peak variables (by default "mz" and "intensity"). Optional parameter columns is passed to the backend's peaksData() function to allow the selection of specific (or additional) peaks variables (columns) that should be extracted (if available). Importantly, it is not guaranteed that each backend supports this parameter (while each backend must support extraction of "mz" and "intensity" columns). Parameter columns defaults to c("mz", "intensity") but any value returned by peaksVariables(object) is supported. Note also that it is possible to extract the peak data with as(x, "list") and as(x, "SimpleList") as a list and SimpleList, respectively. Note however that, in contrast to peaksData(), as() does not support the parameter columns.

  • peaksVariables(): lists the available variables for mass peaks provided by the backend. Default peak variables are "mz" and "intensity" (which all backends need to support and provide), but some backends might provide additional variables. These variables correspond to the column names of the peak data array returned by peaksData().

  • polarity(), polarity<-: gets or sets the polarity for each spectrum. polarity() returns an integer vector (length equal to the number of spectra), with 0 and 1 representing negative and positive polarities, respectively. polarity<- expects an integer vector of length 1 or equal to the number of spectra.

  • precursorCharge(), precursorIntensity(), precursorMz(), precScanNum(), precAcquisitionNum(): gets the charge (integer), intensity (numeric), m/z (numeric), scan index (integer) and acquisition number (interger) of the precursor for MS level > 2 spectra from the object. Returns a vector of length equal to the number of spectra in object. NA are reported for MS1 spectra of if no precursor information is available.

  • rtime(), rtime<-: gets or sets the retention times (in seconds) for each spectrum. rtime() returns a numeric vector (length equal to the number of spectra) with the retention time for each spectrum. rtime<- expects a numeric vector with length equal to the number of spectra.

  • scanIndex(): returns an integer vector with the scan index for each spectrum. This represents the relative index of the spectrum within each file. Note that this can be different to the acquisitionNum of the spectrum which represents the index of the spectrum during acquisition/measurement (as reported in the mzML file).

  • smoothed(),smoothed<-: gets or sets whether a spectrum is smoothed. smoothed() returns a logical vector of length equal to the number of spectra. smoothed<- takes a logical vector of length 1 or equal to the number of spectra in object.

  • spectraData(): gets general spectrum metadata (annotation, also called header). spectraData() returns a DataFrame. Note that this method does by default not return m/z or intensity values.

  • spectraData<-: replaces the full spectra data of the Spectra object with the one provided with value. The spectraData<- function expects a DataFrame to be passed as value with the same number of rows as there a spectra in object. Note that replacing values of peaks variables is not supported with a non-empty processing queue, i.e. if any filtering or data manipulations on the peaks data was performed. In these cases applyProcessing() needs to be called first to apply all cached data operations and empty the processing queue.

  • spectraNames(), spectraNames<-: gets or sets the spectra names.

  • spectraVariables(): returns a character vector with the available spectra variables (columns, fields or attributes of each spectrum) available in object. Note that spectraVariables() does not list the peak variables ("mz", "intensity" and eventual additional annotations for each MS peak). Peak variables are returned by peaksVariables().

  • tic(): gets the total ion current/count (sum of signal of a spectrum) for all spectra in object. By default, the value reported in the original raw data file is returned. For an empty spectrum, 0 is returned.

  • uniqueMsLevels(): get the unique MS levels available in object. This function is supposed to be more efficient than unique(msLevel(object)).

Data subsetting, filtering and merging

Subsetting and filtering of Spectra objects can be performed with the below listed methods.

  • [: subsets the spectra keeping only selected elements (i). The method always returns a Spectra object.

  • deisotopeSpectra(): deisotopes each spectrum keeping only the monoisotopic peak for groups of isotopologues. Isotopologues are estimated using the isotopologues() function from the MetaboCoreUtils package. Note that the default parameters for isotope prediction/detection have been determined using data from the Human Metabolome Database (HMDB) and isotopes for elements other than CHNOPS might not be detected. See parameter substDefinition in the documentation of isotopologues() for more information. The approach and code to define the parameters for isotope prediction is described here.

  • dropNaSpectraVariables(): removes spectra variables (i.e. columns in the object's spectraData that contain only missing values (NA). Note that while columns with only NAs are removed, a spectraData() call after dropNaSpectraVariables() might still show columns containing NA values for core spectra variables.

  • filterAcquisitionNum(): filters the object keeping only spectra matching the provided acquisition numbers (argument n). If dataOrigin or dataStorage is also provided, object is subsetted to the spectra with an acquisition number equal to n in spectra with matching dataOrigin or dataStorage values retaining all other spectra. Returns the filtered Spectra.

  • filterDataOrigin(): filters the object retaining spectra matching the provided dataOrigin. Parameter dataOrigin has to be of type character and needs to match exactly the data origin value of the spectra to subset. Returns the filtered Spectra object (with spectra ordered according to the provided dataOrigin parameter).

  • filterDataStorage(): filters the object retaining spectra stored in the specified dataStorage. Parameter dataStorage has to be of type character and needs to match exactly the data storage value of the spectra to subset. Returns the filtered Spectra object (with spectra ordered according to the provided dataStorage parameter).

  • filterEmptySpectra(): removes empty spectra (i.e. spectra without peaks). Returns the filtered Spectra object (with spectra in their original order).

  • filterFourierTransformArtefacts(): removes (Orbitrap) fast fourier artefact peaks from spectra (see examples below). The function iterates through all intensity ordered peaks in a spectrum and removes all peaks with an m/z within +/- halfWindowSize of the current peak if their intensity is lower than threshold times the current peak's intensity. Additional parameters keepIsotopes, maxCharge and isotopeTolerance allow to avoid removing of potential [13]C isotope peaks (maxCharge being the maximum charge that should be considered and isotopeTolerance the absolute acceptable tolerance for matching their m/z). See filterFourierTransformArtefacts() for details and background and deisitopeSpectra() for an alternative.

  • filterIntensity(): filters each spectrum keeping only peaks with intensities that are within the provided range or match the criteria of the provided function. For the former, parameter intensity has to be a numeric defining the intensity range, for the latter a function that takes the intensity values of the spectrum and returns a logical whether the peak should be retained or not (see examples below for details) - additional parameters to the function can be passed with .... To remove only peaks with intensities below a certain threshold, say 100, use intensity = c(100, Inf). Note: also a single value can be passed with the intensity parameter in which case an upper limit of Inf is used. Note that this function removes also peaks with missing intensities (i.e. an intensity of NA). Parameter msLevel. allows to restrict the filtering to spectra of the specified MS level(s).

  • filterIsolationWindow(): retains spectra that contain mz in their isolation window m/z range (i.e. with an isolationWindowLowerMz <= mz and isolationWindowUpperMz >= mz. Returns the filtered Spectra object (with spectra in their original order).

  • filterMsLevel(): filters object by MS level keeping only spectra matching the MS level specified with argument msLevel. Returns the filtered Spectra (with spectra in their original order).

  • filterMzRange(): filters the object keeping or removing peaks in each spectrum that are within the provided m/z range. Whether peaks are retained or removed can be configured with parameter keep (default keep = TRUE).

  • filterMzValues(): filters the object keeping all peaks in each spectrum that match the provided m/z value(s) (for keep = TRUE, the default) or removing all of them (for keep = FALSE). The m/z matching considers also the absolute tolerance and m/z-relative ppm values. tolerance and ppm have to be of length 1.

  • filterPolarity(): filters the object keeping only spectra matching the provided polarity. Returns the filtered Spectra (with spectra in their original order).

  • filterPrecursorCharge(): retains spectra with the defined precursor charge(s).

  • filterPrecursorIsotopes(): groups MS2 spectra based on their precursor m/z and precursor intensity into predicted isotope groups and keep for each only the spectrum representing the monoisotopic precursor. MS1 spectra are returned as is. See documentation for deisotopeSpectra() below for details on isotope prediction and parameter description.

  • filterPrecursorMaxIntensity(): filters the Spectra keeping for groups of (MS2) spectra with similar precursor m/z values (given parameters ppm and tolerance) the one with the highest precursor intensity. The function filters only MS2 spectra and returns all MS1 spectra. If precursor intensities are NA for all spectra within a spectra group, the first spectrum of that groups is returned. Note: some manufacturers don't provide precursor intensities. These can however also be estimated with estimatePrecursorIntensity().

  • filterPrecursorMzRange() (previously filterPrecursorMz() which is now deprecated): retains spectra with a precursor m/z within the provided m/z range. See examples for details on selecting spectra with a precursor m/z for a target m/z accepting a small difference in ppm.

  • filterPrecursorMzValues(): retains spectra with precursor m/z matching any of the provided m/z values (given ppm and tolerance). Spectra with missing precursor m/z value (e.g. MS1 spectra) are dropped.

  • filterPrecursorPeaks(): removes peaks from each spectrum in object with an m/z equal or larger than the m/z of the precursor, depending on the value of parameter mz: for mz = ==" (the default) peaks with matching m/z (considering an absolute and relative acceptable difference depending on toleranceandppm, respectively) are removed. For mz = ">="all peaks with an m/z larger or equal to the precursor m/z (minustoleranceand theppmof the precursor m/z) are removed. ParametermsLevel.allows to restrict the filter to certain MS levels (by default the filter is applied to all MS levels). Note that no peaks are removed if the precursor m/z isNA` (e.g. typically for MS1 spectra).

  • filterPrecursorScan(): retains parent (e.g. MS1) and children scans (e.g. MS2) of acquisition number acquisitionNum. Returns the filtered Spectra (with spectra in their original order). Parameter f allows to define which spectra belong to the same sample or original data file ( defaults to f = dataOrigin(object)).

  • filterRt(): retains spectra of MS level msLevel with retention times (in seconds) within (>=) rt[1] and (<=) rt[2]. Returns the filtered Spectra (with spectra in their original order).

  • filterRanges(): allows filtering of the Spectra object based on user defined numeric ranges (parameter ranges) for one or more available spectra variables in object (spectra variable names can be specified with parameter spectraVariables). Spectra for which the value of a spectra variable is within it's defined range are retained. If multiple ranges/spectra variables are defined, the match parameter can be used to specify whether all conditions (match = "all"; the default) or if any of the conditions must match (match = "any"; all spectra for which values are within any of the provided ranges are retained).

  • filterValues(): allows filtering of the Spectra object based on similarities of numeric values of one or more spectraVariables(object) (parameter spectraVariables) to provided values (parameter values) given acceptable differences (parameters tolerance and ppm). If multiple values/spectra variables are defined, the match parameter can be used to specify whether all conditions (match = "all"; the default) or if any of the conditions must match (match = "any"; all spectra for which values are within any of the provided ranges are retained).

  • reduceSpectra(): for groups of peaks within highly similar m/z values within each spectrum (given ppm and tolerance), this function keeps only the peak with the highest intensity removing all other peaks hence reducing each spectrum to the highest intensity peaks per peak group. Peak groups are defined using the group() function from the MsCoreUtils package.

  • reset(): restores the data to its original state (as much as possible): removes any processing steps from the lazy processing queue and calls reset() on the backend which, depending on the backend, can also undo e.g. data filtering operations. Note that a reset*( call after applyProcessing() will not have any effect. See examples below for more information.

  • selectSpectraVariables(): reduces the information within the object to the selected spectra variables: all data for variables not specified will be dropped. For mandatory columns (i.e., those listed by coreSpectraVariables(), such as msLevel, rtime ...) only the values will be dropped but not the variable itself. Additional (or user defined) spectra variables will be completely removed. Returns the filtered Spectra.

  • split(): splits the Spectra object based on parameter f into a list of Spectra objects.

  • joinSpectraData(): Individual spectra variables can be directly added with the $<- or [[<- syntax. The joinSpectraData() function allows to merge a DataFrame to the existing spectra data. This function diverges from the merge() method in two main ways:

    • The by.x and by.y column names must be of length 1.

    • If variable names are shared in x and y, the spectra variables of x are not modified. It's only the y variables that are appended the suffix defined in suffix.y. This is to avoid modifying any core spectra variables that would lead to an invalid object.

    • Duplicated Spectra keys (i.e. x[[by.x]]) are not allowed. Duplicated keys in the DataFrame (i.e y[[by.y]]) throw a warning and only the last occurrence is kept. These should be explored and ideally be removed using for QFeatures::reduceDataFrame(), PMS::reducePSMs() or similar functions.

Several Spectra objects can be concatenated into a single object with the c() or the concatenateSpectra() function. Concatenation will fail if the processing queue of any of the Spectra objects is not empty or if different backends are used in the Spectra objects. The spectra variables of the resulting Spectra object is the union of the spectra variables of the individual Spectra objects.

Data manipulation and analysis methods

Many data manipulation operations, such as those listed in this section, are not applied immediately to the spectra, but added to a lazy processing/manipulation queue. Operations stored in this queue are applied on-the-fly to spectra data each time it is accessed. This lazy execution guarantees the same functionality for Spectra objects with any backend, i.e. backends supporting to save changes to spectrum data (MsBackendMemory(), MsBackendDataFrame() or MsBackendHdf5Peaks()) as well as read-only backends (such as the MsBackendMzR()). Note that for the former it is possible to apply the processing queue and write the modified peak data back to the data storage with the applyProcessing() function.

  • addProcessing(): adds an arbitrary function that should be applied to the peaks matrix of every spectrum in object. The function (can be passed with parameter FUN) is expected to take a peaks matrix as input and to return a peaks matrix. A peaks matrix is a numeric matrix with two columns, the first containing the m/z values of the peaks and the second the corresponding intensities. The function has to have ... in its definition. Additional arguments can be passed with .... With parameter spectraVariables it is possible to define additional spectra variables from object that should be passed to the function FUN. These will be passed by their name (e.g. specifying spectraVariables = "precursorMz" will pass the spectra's precursor m/z as a parameter named precursorMz to the function. The only exception is the spectra's MS level, these will be passed to the function as a parameter called spectrumMsLevel (i.e. with spectraVariables = "msLevel" the MS levels of each spectrum will be submitted to the function as a parameter called spectrumMsLevel). Examples are provided in the package vignette.

  • applyProcessing(): for Spectra objects that use a writeable backend only: apply all steps from the lazy processing queue to the peak data and write it back to the data storage. Parameter f allows to specify how object should be split for parallel processing. This should either be equal to the dataStorage, or f = rep(1, length(object)) to disable parallel processing alltogether. Other partitionings might result in errors (especially if a MsBackendHdf5Peaks backend is used).

  • bin(): aggregates individual spectra into discrete (m/z) bins. Binning is performed only on spectra of the specified MS level(s) (parameter msLevel, by default all MS levels of x). The bins can be defined with parameter breaks which by default are equally sized bins, with size being defined by parameter binSize, from the minimal to the maximal m/z of all spectra (of MS level msLevel) within x. The same bins are used for all spectra in x. All intensity values for peaks falling into the same bin are aggregated using the function provided with parameter FUN (defaults to FUN = sum, i.e. all intensities are summed up). Note that the binning operation is applied to the peak data on-the-fly upon data access and it is possible to revert the operation with the reset() function (see description of reset() above).

  • combinePeaks(): combines mass peaks within each spectrum with a difference in their m/z values that is smaller than the maximal acceptable difference defined by ppm and tolerance. Parameters intensityFun and mzFun allow to define functions to aggregate the intensity and m/z values for each such group of peaks. With weighted = TRUE (the default), the m/z value of the combined peak is calculated using an intensity-weighted mean and parameter mzFun is ignored. The MsCoreUtils::group() function is used for the grouping of mass peaks. Parameter msLevel. allows to define selected MS levels for which peaks should be combined. This function returns a Spectra with the same number of spectra than the input object, but with possibly combined peaks within each spectrum. dropped (i.e. their values are replaced with NA) for combined peaks unless they are constant across the combined peaks. See also reduceSpectra() for a function to select a single representative mass peak for each peak group.

  • combineSpectra(): combines sets of spectra into a single spectrum per set. For each spectrum group (set), spectra variables from the first spectrum are used and the peak matrices are combined using the function specified with FUN, which defaults to combinePeaksData(). Please refer to the combinePeaksData() help page for details and options of the actual combination of peaks across the sets of spectra and to the package vignette for examples and alternative ways to aggregate spectra. The sets of spectra can be specified with parameter f. In addition it is possible to define, with parameter p if and how to split the input data for parallel processing. This defaults to p = x$dataStorage and hence a per-file parallel processing is applied for Spectra with file-based backends (such as the MsBackendMzR()). Prior combination of the spectra all processings queued in the lazy evaluation queue are applied. Be aware that calling combineSpectra() on a Spectra object with certain backends that allow modifications might overwrite the original data. This does not happen with a MsBackendMemory or MsBackendDataFrame backend, but with a MsBackendHdf5Peaks backend the m/z and intensity values in the original hdf5 file(s) will be overwritten. The function returns a Spectra of length equal to the unique levels of f.

  • compareSpectra(): compares each spectrum in x with each spectrum in y using the function provided with FUN (defaults to ndotproduct()). If y is missing, each spectrum in x is compared with each other spectrum in x. The matching/mapping of peaks between the compared spectra is done with the MAPFUN function. The default joinPeaks() matches peaks of both spectra and allows to keep all peaks from the first spectrum (type = "left"), from the second (type = "right"), from both (type = "outer") and to keep only matching peaks (type = "inner"); see joinPeaks() for more information and examples). The MAPFUN function should have parameters x, y, xPrecursorMz and yPrecursorMz as these values are passed to the function. In addition to joinPeaks() also joinPeaksGnps() is supported for GNPS-like similarity score calculations. Note that joinPeaksGnps() should only be used in combination with FUN = MsCoreUtils::gnps (see joinPeaksGnps() for more information and details). Use MAPFUN = joinPeaksNone to disable internal peak matching/mapping if a similarity scoring function is used that performs the matching internally. FUN is supposed to be a function to compare intensities of (matched) peaks of the two spectra that are compared. The function needs to take two matrices with columns "mz" and "intensity" as input and is supposed to return a single numeric as result. In addition to the two peak matrices the spectra's precursor m/z values are passed to the function as parameters xPrecursorMz (precursor m/z of the x peak matrix) and yPrecursorMz (precursor m/z of the y peak matrix). Additional parameters to functions FUN and MAPFUN can be passed with .... Parameters ppm and tolerance are passed to both MAPFUN and FUN. The function returns a matrix with the results of FUN for each comparison, number of rows equal to length(x) and number of columns equal length(y) (i.e. element in row 2 and column 3 is the result from the comparison of x[2] with y[3]). If SIMPLIFY = TRUE the matrix is simplified to a numeric if length of x or y is one. See also the vignette for additional examples, such as using spectral entropy similarity in the scoring.

  • deisotopeSpectra(): deisotopes each spectrum keeping only the monoisotopic peak for groups of isotopologues. Isotopologues are estimated using the isotopologues() function from the MetaboCoreUtils package. Note that the default parameters for isotope prediction/detection have been determined using data from the Human Metabolome Database (HMDB) and isotopes for elements other than CHNOPS might not be detected. See parameter substDefinition in the documentation of isotopologues() for more information. The approach and code to define the parameters for isotope prediction is described here.

  • entropy(): calculates the entropy of each spectra based on the metrics suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z). See also nentropy() in the MsCoreUtils package for details.

  • estimatePrecursorIntensity(): defines the precursor intensities for MS2 spectra using the intensity of the matching MS1 peak from the closest MS1 spectrum (i.e. the last MS1 spectrum measured before the respective MS2 spectrum). With method = "interpolation" it is also possible to calculate the precursor intensity based on an interpolation of intensity values (and retention times) of the matching MS1 peaks from the previous and next MS1 spectrum. See estimatePrecursorIntensity() for examples and more details.

  • estimatePrecursorMz(): for DDA data: allows to estimate a fragment spectra's precursor m/z based on the reported precursor m/z and the data from the previous MS1 spectrum. See estimatePrecursorMz() for details.

  • neutralLoss(): calculates neutral loss spectra for fragment spectra. See neutralLoss() for detailed documentation.

  • processingLog(): returns a character vector with the processing log messages.

  • reduceSpectra(): keeps for groups of peaks with similar m/z values in (given ppm and tolerance) in each spectrum only the peak with the highest intensity removing all other peaks hence reducing each spectrum to the highest intensity peaks per peak group. Peak groups are defined using the group() function from the MsCoreUtils package. See also the combinePeaks() function for an alternative function to combine peaks within each spectrum.

  • scalePeaks(): scales intensities of peaks within each spectrum depending on parameter by. With by = sum (the default) peak intensities are divided by the sum of peak intensities within each spectrum. The sum of intensities is thus 1 for each spectrum after scaling. Parameter msLevel. allows to apply the scaling of spectra of a certain MS level. By default (msLevel. = uniqueMsLevels(x)) intensities for all spectra will be scaled.

  • spectrapply(): applies a given function to each individual spectrum or sets of a Spectra object. By default, the Spectra is split into individual spectra (i.e. Spectra of length 1) and the function FUN is applied to each of them. An alternative splitting can be defined with parameter f. Parameters for FUN can be passed using .... The returned result and its order depend on the function FUN and how object is split (hence on f, if provided). Parallel processing is supported and can be configured with parameter BPPARAM, is however only suggested for computational intense FUN. As an alternative to the (eventual parallel) processing of the full Spectra, spectrapply() supports also a chunk-wise processing. For this, parameter chunkSize needs to be specified. object is then split into chunks of size chunkSize which are then (stepwise) processed by FUN. This guarantees a lower memory demand (especially for on-disk backends) since only the data for one chunk needs to be loaded into memory in each iteration. Note that by specifying chunkSize, parameters f and BPPARAM will be ignored. See also chunkapply() or examples below for details on chunk-wise processing.

  • smooth(): smooths individual spectra using a moving window-based approach (window size = 2 * halfWindowSize). Currently, the Moving-Average- (method = "MovingAverage"), Weighted-Moving-Average- (method = "WeightedMovingAverage"), weights depending on the distance of the center and calculated 1/2^(-halfWindowSize:halfWindowSize)) and Savitzky-Golay-Smoothing (method = "SavitzkyGolay") are supported. For details how to choose the correct halfWindowSize please see MsCoreUtils::smooth().

  • pickPeaks(): picks peaks on individual spectra using a moving window-based approach (window size = 2 * halfWindowSize). For noisy spectra there are currently two different noise estimators available, the Median Absolute Deviation (method = "MAD") and Friedman's Super Smoother (method = "SuperSmoother"), as implemented in the MsCoreUtils::noise(). The method supports also to optionally refine the m/z value of the identified centroids by considering data points that belong (most likely) to the same mass peak. Therefore the m/z value is calculated as an intensity weighted average of the m/z values within the peak region. The peak region is defined as the m/z values (and their respective intensities) of the 2 * k closest signals to the centroid or the closest valleys (descending = TRUE) in the 2 * k region. For the latter the k has to be chosen general larger. See MsCoreUtils::refineCentroids() for details. If the ratio of the signal to the highest intensity of the peak is below threshold it will be ignored for the weighted average.

  • replaceIntensitiesBelow(): replaces intensities below a specified threshold with the provided value. Parameter threshold can be either a single numeric value or a function which is applied to all non-NA intensities of each spectrum to determine a threshold value for each spectrum. The default is threshold = min which replaces all values which are <= the minimum intensity in a spectrum with value (the default for value is 0). Note that the function specified with threshold is expected to have a parameter na.rm since na.rm = TRUE will be passed to the function. If the spectrum is in profile mode, ranges of successive non-0 peaks <= threshold are set to 0. Parameter msLevel. allows to apply this to only spectra of certain MS level(s).

Author

Nir Shahaf, Johannes Rainer

Nir Shahaf

Johannes Rainer

Sebastian Gibb, Johannes Rainer, Laurent Gatto

Examples


## Create a Spectra providing a `DataFrame` containing the spectrum data.

spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2))
spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2))
spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8))

data <- Spectra(spd)
data
#> MSn data (Spectra) with 2 spectra in a MsBackendMemory backend:
#>     msLevel     rtime scanIndex
#>   <integer> <numeric> <integer>
#> 1         1       1.1        NA
#> 2         2       1.2        NA
#>  ... 16 more variables/columns.

## Get the number of spectra
length(data)
#> [1] 2

## Get the number of peaks per spectrum
lengths(data)
#> [1] 4 3

## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk
## backend.
sciex_file <- dir(system.file("sciex", package = "msdata"),
    full.names = TRUE)
sciex <- Spectra(sciex_file, backend = MsBackendMzR())
sciex
#> MSn data (Spectra) with 1862 spectra in a MsBackendMzR backend:
#>        msLevel     rtime scanIndex
#>      <integer> <numeric> <integer>
#> 1            1     0.280         1
#> 2            1     0.559         2
#> 3            1     0.838         3
#> 4            1     1.117         4
#> 5            1     1.396         5
#> ...        ...       ...       ...
#> 1858         1   258.636       927
#> 1859         1   258.915       928
#> 1860         1   259.194       929
#> 1861         1   259.473       930
#> 1862         1   259.752       931
#>  ... 33 more variables/columns.
#> 
#> file(s):
#> 20171016_POOL_POS_1_105-134.mzML
#> 20171016_POOL_POS_3_105-134.mzML

## The MS data is on disk and will be read into memory on-demand. We can
## however change the backend to a MsBackendMemory backend which will
## keep all of the data in memory.
sciex_im <- setBackend(sciex, MsBackendMemory())
sciex_im
#> MSn data (Spectra) with 1862 spectra in a MsBackendMemory backend:
#>        msLevel     rtime scanIndex
#>      <integer> <numeric> <integer>
#> 1            1     0.280         1
#> 2            1     0.559         2
#> 3            1     0.838         3
#> 4            1     1.117         4
#> 5            1     1.396         5
#> ...        ...       ...       ...
#> 1858         1   258.636       927
#> 1859         1   258.915       928
#> 1860         1   259.194       929
#> 1861         1   259.473       930
#> 1862         1   259.752       931
#>  ... 33 more variables/columns.
#> Processing:
#>  Switch backend from MsBackendMzR to MsBackendMemory [Thu Mar 28 09:08:01 2024] 

## The `MsBackendMemory()` supports the `setBackend()` method:
supportsSetBackend(MsBackendMemory())
#> [1] TRUE

## Thus, it is possible to change to that backend with `setBackend()`. Most
## read-only backends however don't support that, such as the
## `MsBackendMzR` and `setBackend()` would fail to change to that backend.
supportsSetBackend(MsBackendMzR())
#> [1] FALSE

## The on-disk object `sciex` is light-weight, because it does not keep the
## MS peak data in memory. The `sciex_im` object in contrast keeps all the
## data in memory and its size is thus much larger.
object.size(sciex)
#> 395976 bytes
object.size(sciex_im)
#> 55802328 bytes

## The spectra variable `dataStorage` returns for each spectrum the location
## where the data is stored. For in-memory objects:
head(dataStorage(sciex_im))
#> [1] "<memory>" "<memory>" "<memory>" "<memory>" "<memory>" "<memory>"

## While objects that use an on-disk backend will list the files where the
## data is stored.
head(dataStorage(sciex))
#> [1] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [2] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [3] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [4] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [5] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [6] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"

## The spectra variable `dataOrigin` returns for each spectrum the *origin*
## of the data. If the data is read from e.g. mzML files, this will be the
## original mzML file name:
head(dataOrigin(sciex))
#> [1] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [2] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [3] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [4] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [5] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [6] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
head(dataOrigin(sciex_im))
#> [1] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [2] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [3] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [4] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [5] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [6] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"


## ---- ACCESSING AND ADDING DATA ----

## Get the MS level for each spectrum.
msLevel(data)
#> [1] 1 2

## Alternatively, we could also use $ to access a specific spectra variable.
## This could also be used to add additional spectra variables to the
## object (see further below).
data$msLevel
#> [1] 1 2

## Get the intensity and m/z values.
intensity(data)
#> NumericList of length 2
#> [[1]] 200 400 34.2 17
#> [[2]] 12.3 15.2 6.8
mz(data)
#> NumericList of length 2
#> [[1]] 100 103.2 104.3 106.5
#> [[2]] 45.6 120.4 190.2

## Determine whether one of the spectra has a specific m/z value
containsMz(data, mz = 120.4)
#> [1] FALSE  TRUE

## Accessing spectra variables works for all backends:
intensity(sciex)
#> NumericList of length 1862
#> [[1]] 0 412 0 0 412 0 0 412 0 0 412 0 0 ... 0 412 0 0 412 0 0 412 0 0 412 412 0
#> [[2]] 0 140 0 0 140 0 0 419 0 0 140 0 0 ... 0 140 0 0 140 0 0 140 0 0 279 140 0
#> [[3]] 0 132 263 263 132 132 0 0 132 132 0 0 ... 0 0 132 0 0 132 0 0 132 0 132 0
#> [[4]] 0 139 139 0 0 139 0 0 139 139 0 139 0 ... 0 0 139 0 0 277 0 0 139 0 139 0
#> [[5]] 0 164 0 0 328 0 164 0 0 164 0 0 164 ... 164 0 0 164 0 0 164 0 164 0 328 0
#> [[6]] 0 146 146 146 0 0 146 0 0 146 0 0 ... 146 0 0 146 146 0 0 146 0 0 146 0
#> [[7]] 0 296 0 296 0 0 148 0 0 148 0 0 148 ... 0 0 148 0 0 148 0 0 148 0 0 148 0
#> [[8]] 0 170 0 170 170 170 0 170 0 0 170 0 ... 170 0 0 170 0 0 170 0 0 170 170 0
#> [[9]] 0 157 0 314 0 0 157 0 0 157 0 0 314 ... 0 0 157 0 0 157 0 0 157 0 157 0
#> [[10]] 0 151 302 302 604 0 302 0 0 151 0 0 ... 151 0 0 151 0 151 0 151 0 151 0
#> ...
#> <1852 more elements>
intensity(sciex_im)
#> NumericList of length 1862
#> [[1]] 0 412 0 0 412 0 0 412 0 0 412 0 0 ... 0 412 0 0 412 0 0 412 0 0 412 412 0
#> [[2]] 0 140 0 0 140 0 0 419 0 0 140 0 0 ... 0 140 0 0 140 0 0 140 0 0 279 140 0
#> [[3]] 0 132 263 263 132 132 0 0 132 132 0 0 ... 0 0 132 0 0 132 0 0 132 0 132 0
#> [[4]] 0 139 139 0 0 139 0 0 139 139 0 139 0 ... 0 0 139 0 0 277 0 0 139 0 139 0
#> [[5]] 0 164 0 0 328 0 164 0 0 164 0 0 164 ... 164 0 0 164 0 0 164 0 164 0 328 0
#> [[6]] 0 146 146 146 0 0 146 0 0 146 0 0 ... 146 0 0 146 146 0 0 146 0 0 146 0
#> [[7]] 0 296 0 296 0 0 148 0 0 148 0 0 148 ... 0 0 148 0 0 148 0 0 148 0 0 148 0
#> [[8]] 0 170 0 170 170 170 0 170 0 0 170 0 ... 170 0 0 170 0 0 170 0 0 170 170 0
#> [[9]] 0 157 0 314 0 0 157 0 0 157 0 0 314 ... 0 0 157 0 0 157 0 0 157 0 157 0
#> [[10]] 0 151 302 302 604 0 302 0 0 151 0 0 ... 151 0 0 151 0 151 0 151 0 151 0
#> ...
#> <1852 more elements>

## Get the m/z for the first spectrum.
mz(data)[[1]]
#> [1] 100.0 103.2 104.3 106.5

## Get the peak data (m/z and intensity values).
pks <- peaksData(data)
pks
#> List of length 2
pks[[1]]
#>         mz intensity
#> [1,] 100.0     200.0
#> [2,] 103.2     400.0
#> [3,] 104.3      34.2
#> [4,] 106.5      17.0
pks[[2]]
#>         mz intensity
#> [1,]  45.6      12.3
#> [2,] 120.4      15.2
#> [3,] 190.2       6.8

## Note that we could get the same resulb by coercing the `Spectra` to
## a `list` or `SimpleList`:
as(data, "list")
#> [[1]]
#>         mz intensity
#> [1,] 100.0     200.0
#> [2,] 103.2     400.0
#> [3,] 104.3      34.2
#> [4,] 106.5      17.0
#> 
#> [[2]]
#>         mz intensity
#> [1,]  45.6      12.3
#> [2,] 120.4      15.2
#> [3,] 190.2       6.8
#> 
as(data, "SimpleList")
#> List of length 2

## List all available spectra variables (i.e. spectrum data and metadata).
spectraVariables(data)
#>  [1] "msLevel"                 "rtime"                  
#>  [3] "acquisitionNum"          "scanIndex"              
#>  [5] "dataStorage"             "dataOrigin"             
#>  [7] "centroided"              "smoothed"               
#>  [9] "polarity"                "precScanNum"            
#> [11] "precursorMz"             "precursorIntensity"     
#> [13] "precursorCharge"         "collisionEnergy"        
#> [15] "isolationWindowLowerMz"  "isolationWindowTargetMz"
#> [17] "isolationWindowUpperMz" 

## For all *core* spectrum variables accessor functions are available. These
## return NA if the variable was not set.
centroided(data)
#> [1] NA NA
dataStorage(data)
#> [1] "<memory>" "<memory>"
rtime(data)
#> [1] 1.1 1.2
precursorMz(data)
#> [1] NA NA

## The core spectra variables are:
coreSpectraVariables()
#>                 msLevel                   rtime          acquisitionNum 
#>               "integer"               "numeric"               "integer" 
#>               scanIndex                      mz               intensity 
#>               "integer"           "NumericList"           "NumericList" 
#>             dataStorage              dataOrigin              centroided 
#>             "character"             "character"               "logical" 
#>                smoothed                polarity             precScanNum 
#>               "logical"               "integer"               "integer" 
#>             precursorMz      precursorIntensity         precursorCharge 
#>               "numeric"               "numeric"               "integer" 
#>         collisionEnergy  isolationWindowLowerMz isolationWindowTargetMz 
#>               "numeric"               "numeric"               "numeric" 
#>  isolationWindowUpperMz 
#>               "numeric" 

## Add an additional metadata column.
data$spectrum_id <- c("sp_1", "sp_2")

## List spectra variables, "spectrum_id" is now also listed
spectraVariables(data)
#>  [1] "msLevel"                 "rtime"                  
#>  [3] "acquisitionNum"          "scanIndex"              
#>  [5] "dataStorage"             "dataOrigin"             
#>  [7] "centroided"              "smoothed"               
#>  [9] "polarity"                "precScanNum"            
#> [11] "precursorMz"             "precursorIntensity"     
#> [13] "precursorCharge"         "collisionEnergy"        
#> [15] "isolationWindowLowerMz"  "isolationWindowTargetMz"
#> [17] "isolationWindowUpperMz"  "spectrum_id"            

## Get the values for the new spectra variable
data$spectrum_id
#> [1] "sp_1" "sp_2"

## Extract specific spectra variables.
spectraData(data, columns = c("spectrum_id", "msLevel"))
#> DataFrame with 2 rows and 2 columns
#>   spectrum_id   msLevel
#>   <character> <integer>
#> 1        sp_1         1
#> 2        sp_2         2

## Drop spectra variable data and/or columns.
res <- selectSpectraVariables(data, c("mz", "intensity"))

## This removed the additional columns "spectrum_id" and deleted all values
## for all spectra variables, except "mz" and "intensity".
spectraData(res)
#> DataFrame with 2 rows and 17 columns
#>     msLevel     rtime acquisitionNum scanIndex dataStorage  dataOrigin
#>   <integer> <numeric>      <integer> <integer> <character> <character>
#> 1        NA        NA             NA        NA    <memory>          NA
#> 2        NA        NA             NA        NA    <memory>          NA
#>   centroided  smoothed  polarity precScanNum precursorMz precursorIntensity
#>    <logical> <logical> <integer>   <integer>   <numeric>          <numeric>
#> 1         NA        NA        NA          NA          NA                 NA
#> 2         NA        NA        NA          NA          NA                 NA
#>   precursorCharge collisionEnergy isolationWindowLowerMz
#>         <integer>       <numeric>              <numeric>
#> 1              NA              NA                     NA
#> 2              NA              NA                     NA
#>   isolationWindowTargetMz isolationWindowUpperMz
#>                 <numeric>              <numeric>
#> 1                      NA                     NA
#> 2                      NA                     NA

## Compared to the data before selectSpectraVariables.
spectraData(data)
#> DataFrame with 2 rows and 18 columns
#>     msLevel     rtime acquisitionNum scanIndex dataStorage  dataOrigin
#>   <integer> <numeric>      <integer> <integer> <character> <character>
#> 1         1       1.1             NA        NA    <memory>          NA
#> 2         2       1.2             NA        NA    <memory>          NA
#>   centroided  smoothed  polarity precScanNum precursorMz precursorIntensity
#>    <logical> <logical> <integer>   <integer>   <numeric>          <numeric>
#> 1         NA        NA        NA          NA          NA                 NA
#> 2         NA        NA        NA          NA          NA                 NA
#>   precursorCharge collisionEnergy isolationWindowLowerMz
#>         <integer>       <numeric>              <numeric>
#> 1              NA              NA                     NA
#> 2              NA              NA                     NA
#>   isolationWindowTargetMz isolationWindowUpperMz spectrum_id
#>                 <numeric>              <numeric> <character>
#> 1                      NA                     NA        sp_1
#> 2                      NA                     NA        sp_2


## ---- SUBSETTING, FILTERING AND COMBINING

## Subset to all MS2 spectra.
data[msLevel(data) == 2]
#> MSn data (Spectra) with 1 spectra in a MsBackendMemory backend:
#>     msLevel     rtime scanIndex
#>   <integer> <numeric> <integer>
#> 1         2       1.2        NA
#>  ... 17 more variables/columns.

## Same with the filterMsLevel function
filterMsLevel(data, 2)
#> MSn data (Spectra) with 1 spectra in a MsBackendMemory backend:
#>     msLevel     rtime scanIndex
#>   <integer> <numeric> <integer>
#> 1         2       1.2        NA
#>  ... 17 more variables/columns.
#> Processing:
#>  Filter: select MS level(s) 2 [Thu Mar 28 09:08:03 2024] 

## Below we combine the `data` and `sciex_im` objects into a single one.
data_comb <- c(data, sciex_im)

## The combined Spectra contains a union of all spectra variables:
head(data_comb$spectrum_id)
#> [1] "sp_1" "sp_2" NA     NA     NA     NA    
head(data_comb$rtime)
#> [1] 1.100 1.200 0.280 0.559 0.838 1.117
head(data_comb$dataStorage)
#> [1] "<memory>" "<memory>" "<memory>" "<memory>" "<memory>" "<memory>"
head(data_comb$dataOrigin)
#> [1] NA                                                                
#> [2] NA                                                                
#> [3] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [4] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [5] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
#> [6] "/__w/_temp/Library/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"

## Filter a Spectra for a target precursor m/z with a tolerance of 10ppm
spd$precursorMz <- c(323.4, 543.2302)
data_filt <- Spectra(spd)
filterPrecursorMzRange(data_filt, mz = 543.23 + ppm(c(-543.23, 543.23), 10))
#> MSn data (Spectra) with 0 spectra in a MsBackendMemory backend:
#> Processing:
#>  Filter: select spectra with a precursor m/z within [543.2354323, 543.2354323] [Thu Mar 28 09:08:03 2024] 

## Filter a Spectra keeping only peaks matching certain m/z values
sps_sub <- filterMzValues(data, mz = c(103, 104), tolerance = 0.3)
mz(sps_sub)
#> NumericList of length 2
#> [[1]] 103.2 104.3
#> [[2]] numeric(0)

## This function can also be used to remove specific peaks from a spectrum
## by setting `keep = FALSE`.
sps_sub <- filterMzValues(data, mz = c(103, 104),
    tolerance = 0.3, keep = FALSE)
mz(sps_sub)
#> NumericList of length 2
#> [[1]] 100 106.5
#> [[2]] 45.6 120.4 190.2

## Note that `filterMzValues()` keeps or removes all peaks with a matching
## m/z given the provided `ppm` and `tolerance` parameters.

## Filter a Spectra keeping only peaks within a m/z range
sps_sub <- filterMzRange(data, mz = c(100, 300))
mz(sps_sub)
#> NumericList of length 2
#> [[1]] 100 103.2 104.3 106.5
#> [[2]] 120.4 190.2

## Remove empty spectra variables
sciex_noNA <- dropNaSpectraVariables(sciex)

## Available spectra variables before and after `dropNaSpectraVariables()`
spectraVariables(sciex)
#>  [1] "msLevel"                  "rtime"                   
#>  [3] "acquisitionNum"           "scanIndex"               
#>  [5] "dataStorage"              "dataOrigin"              
#>  [7] "centroided"               "smoothed"                
#>  [9] "polarity"                 "precScanNum"             
#> [11] "precursorMz"              "precursorIntensity"      
#> [13] "precursorCharge"          "collisionEnergy"         
#> [15] "isolationWindowLowerMz"   "isolationWindowTargetMz" 
#> [17] "isolationWindowUpperMz"   "peaksCount"              
#> [19] "totIonCurrent"            "basePeakMZ"              
#> [21] "basePeakIntensity"        "ionisationEnergy"        
#> [23] "lowMZ"                    "highMZ"                  
#> [25] "mergedScan"               "mergedResultScanNum"     
#> [27] "mergedResultStartScanNum" "mergedResultEndScanNum"  
#> [29] "injectionTime"            "filterString"            
#> [31] "spectrumId"               "ionMobilityDriftTime"    
#> [33] "scanWindowLowerLimit"     "scanWindowUpperLimit"    
spectraVariables(sciex_noNA)
#>  [1] "msLevel"                 "rtime"                  
#>  [3] "acquisitionNum"          "scanIndex"              
#>  [5] "dataStorage"             "dataOrigin"             
#>  [7] "centroided"              "smoothed"               
#>  [9] "polarity"                "precScanNum"            
#> [11] "precursorMz"             "precursorIntensity"     
#> [13] "precursorCharge"         "collisionEnergy"        
#> [15] "isolationWindowLowerMz"  "isolationWindowTargetMz"
#> [17] "isolationWindowUpperMz"  "peaksCount"             
#> [19] "totIonCurrent"           "basePeakMZ"             
#> [21] "basePeakIntensity"       "ionisationEnergy"       
#> [23] "lowMZ"                   "highMZ"                 
#> [25] "injectionTime"           "spectrumId"             


## Adding new spectra variables
sciex1 <- filterDataOrigin(sciex, dataOrigin(sciex)[1])
spv <- DataFrame(spectrumId = sciex1$spectrumId[3:12], ## used for merging
                 var1 = rnorm(10),
                 var2 = sample(letters, 10))
spv
#> DataFrame with 10 rows and 3 columns
#>                spectrumId        var1        var2
#>               <character>   <numeric> <character>
#> 1  sample=1 period=1 cy.. -1.40004352           w
#> 2  sample=1 period=1 cy..  0.25531705           k
#> 3  sample=1 period=1 cy.. -2.43726361           y
#> 4  sample=1 period=1 cy.. -0.00557129           e
#> 5  sample=1 period=1 cy..  0.62155272           u
#> 6  sample=1 period=1 cy..  1.14841161           l
#> 7  sample=1 period=1 cy.. -1.82181766           b
#> 8  sample=1 period=1 cy.. -0.24732530           f
#> 9  sample=1 period=1 cy.. -0.24419961           d
#> 10 sample=1 period=1 cy.. -0.28270545           p

sciex2 <- joinSpectraData(sciex1, spv, by.y = "spectrumId")

spectraVariables(sciex2)
#>  [1] "msLevel"                  "rtime"                   
#>  [3] "acquisitionNum"           "scanIndex"               
#>  [5] "dataStorage"              "dataOrigin"              
#>  [7] "centroided"               "smoothed"                
#>  [9] "polarity"                 "precScanNum"             
#> [11] "precursorMz"              "precursorIntensity"      
#> [13] "precursorCharge"          "collisionEnergy"         
#> [15] "isolationWindowLowerMz"   "isolationWindowTargetMz" 
#> [17] "isolationWindowUpperMz"   "peaksCount"              
#> [19] "totIonCurrent"            "basePeakMZ"              
#> [21] "basePeakIntensity"        "ionisationEnergy"        
#> [23] "lowMZ"                    "highMZ"                  
#> [25] "mergedScan"               "mergedResultScanNum"     
#> [27] "mergedResultStartScanNum" "mergedResultEndScanNum"  
#> [29] "injectionTime"            "filterString"            
#> [31] "spectrumId"               "ionMobilityDriftTime"    
#> [33] "scanWindowLowerLimit"     "scanWindowUpperLimit"    
#> [35] "var1"                     "var2"                    
spectraData(sciex2)[1:13, c("spectrumId", "var1", "var2")]
#> DataFrame with 13 rows and 3 columns
#>                 spectrumId      var1        var2
#>                <character> <numeric> <character>
#> 1   sample=1 period=1 cy..        NA          NA
#> 2   sample=1 period=1 cy..        NA          NA
#> 3   sample=1 period=1 cy.. -1.400044           w
#> 4   sample=1 period=1 cy..  0.255317           k
#> 5   sample=1 period=1 cy.. -2.437264           y
#> ...                    ...       ...         ...
#> 9   sample=1 period=1 cy.. -1.821818           b
#> 10  sample=1 period=1 cy.. -0.247325           f
#> 11  sample=1 period=1 cy.. -0.244200           d
#> 12  sample=1 period=1 cy.. -0.282705           p
#> 13  sample=1 period=1 cy..        NA          NA

## Removing fourier transform artefacts seen in Orbitra data.

## Loading an Orbitrap spectrum with artefacts.
data(fft_spectrum)
plotSpectra(fft_spectrum, xlim = c(264.5, 265.5))

plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6))


fft_spectrum <- filterFourierTransformArtefacts(fft_spectrum)
fft_spectrum
#> MSn data (Spectra) with 1 spectra in a MsBackendDataFrame backend:
#>     msLevel     rtime scanIndex
#>   <integer> <numeric> <integer>
#> 1         1   367.665       195
#>  ... 33 more variables/columns.
#> Lazy evaluation queue: 1 processing step(s)
#> Processing:
#>  Switch backend from MsBackendMzR to MsBackendDataFrame [Mon Nov 22 14:14:45 2021]
#>  Remove fast fourier artefacts. [Thu Mar 28 09:08:03 2024] 
plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6))


## Using a few examples peaks in your data you can optimize the parameters
fft_spectrum_filtered <- filterFourierTransformArtefacts(fft_spectrum,
                                               halfWindowSize = 0.2,
                                               threshold = 0.005,
                                               keepIsotopes = TRUE,
                                               maxCharge = 5,
                                               isotopeTolerance = 0.005
                                               )

fft_spectrum_filtered
#> MSn data (Spectra) with 1 spectra in a MsBackendDataFrame backend:
#>     msLevel     rtime scanIndex
#>   <integer> <numeric> <integer>
#> 1         1   367.665       195
#>  ... 33 more variables/columns.
#> Lazy evaluation queue: 2 processing step(s)
#> Processing:
#>  Switch backend from MsBackendMzR to MsBackendDataFrame [Mon Nov 22 14:14:45 2021]
#>  Remove fast fourier artefacts. [Thu Mar 28 09:08:03 2024]
#>  Remove fast fourier artefacts. [Thu Mar 28 09:08:03 2024] 
length(mz(fft_spectrum_filtered)[[1]])
#> [1] 297
plotSpectra(fft_spectrum_filtered, xlim = c(264.5, 265.5), ylim = c(0, 5e6))


## Using filterRanges to filter spectra object based on variables available
## in `spectraData`.
## First, determine the variable(s) on which to base the filtering:
sv <- c("rtime", "precursorMz", "peaksCount")
## Note that ANY variables can be chosen here, and as many as wanted.

## Define the ranges (pairs of values with lower and upper boundary) to be
## used for the individual spectra variables. The first two values will be
## used for the first spectra variable (e.g., rtime here), the next two for
## the second (e.g. precursorMz here) and so on:
ranges <- c(30, 350, 200,500, 350, 600)

## Input the parameters within the filterRanges function:
filt_spectra <- filterRanges(sciex, spectraVariables = sv,
                ranges = ranges)

## Using `filterRanges()` to filter spectra object with multiple ranges for
## the same `spectraVariable` (e.g, here rtime)
sv <- c("rtime", "rtime")
ranges <- c(30, 100, 200, 300)
filt_spectra <- filterRanges(sciex, spectraVariables = sv,
                ranges = ranges, match = "any")

## Using filterValues in a similar way to a filter spectra object based on
## variables available in `spectraData`. However, this time not based on
## ranges but similarities to user input single values with given
## tolerance/ppm
## First determine the variable(s) on which to base the filtering:
sv <- c("rtime", "precursorMz")
## Note that ANY variables can be chosen here, and as many as wanted.

## Define the values that will be used to filter the spectra based on their
## similarities to their respective spectraVariables.
## The first values in the parameters values, tolerance and ppm will be
## used for the first spectra variable (e.g. rtime here), the next for the
## second (e.g. precursorMz here) and so on:
values <- c(350, 400)
tolerance <- c(100, 0)
ppm <- c(0,50)

## Input the parameters within the `filterValues()` function:
filt_spectra <- filterValues(sciex, spectraVariables = sv,
                values = values, tolerance = tolerance, ppm = ppm)

## ---- DATA MANIPULATIONS AND OTHER OPERATIONS ----

## Set the data to be centroided
centroided(data) <- TRUE

## Replace peak intensities below 40 with 3.
res <- replaceIntensitiesBelow(data, threshold = 40, value = 3)
res
#> MSn data (Spectra) with 2 spectra in a MsBackendMemory backend:
#>     msLevel     rtime scanIndex
#>   <integer> <numeric> <integer>
#> 1         1       1.1        NA
#> 2         2       1.2        NA
#>  ... 17 more variables/columns.
#> Lazy evaluation queue: 1 processing step(s)
#> Processing:
#>  Signal <= 40 in MS level(s) 1, 2 set to 0 [Thu Mar 28 09:08:03 2024] 

## Get the intensities of the first and second spectrum.
intensity(res)[[1]]
#> [1] 200 400   3   3
intensity(res)[[2]]
#> [1] 3 3 3

## Remove all peaks with an intensity below 40.
res <- filterIntensity(res, intensity = c(40, Inf))

## Get the intensities of the first and second spectrum.
intensity(res)[[1]]
#> [1] 200 400
intensity(res)[[2]]
#> numeric(0)

## Lengths of spectra is now different
lengths(mz(res))
#> [1] 2 0
lengths(mz(data))
#> [1] 4 3

## In addition it is possible to pass a function to `filterIntensity()`: in
## the example below we want to keep only peaks that have an intensity which
## is larger than one third of the maximal peak intensity in that spectrum.
keep_peaks <- function(x, prop = 3) {
    x > max(x, na.rm = TRUE) / prop
}
res2 <- filterIntensity(data, intensity = keep_peaks)
intensity(res2)[[1L]]
#> [1] 200 400
intensity(data)[[1L]]
#> [1] 200.0 400.0  34.2  17.0

## We can also change the proportion by simply passing the `prop` parameter
## to the function. To keep only peaks that have an intensity which is
## larger than half of the maximum intensity:
res2 <- filterIntensity(data, intensity = keep_peaks, prop = 2)
intensity(res2)[[1L]]
#> intensity 
#>       400 
intensity(data)[[1L]]
#> [1] 200.0 400.0  34.2  17.0

## Since data manipulation operations are by default not directly applied to
## the data but only added to the internal lazy evaluation queue, it is also
## possible to remove these data manipulations with the `reset()` function:
res_rest <- reset(res)
res_rest
#> MSn data (Spectra) with 2 spectra in a MsBackendMemory backend:
#>     msLevel     rtime scanIndex
#>   <integer> <numeric> <integer>
#> 1         1       1.1        NA
#> 2         2       1.2        NA
#>  ... 17 more variables/columns.
#> Processing:
#>  Signal <= 40 in MS level(s) 1, 2 set to 0 [Thu Mar 28 09:08:03 2024]
#>  Remove peaks with intensities outside [40, Inf] in spectra of MS level(s) 1, 2. [Thu Mar 28 09:08:03 2024]
#>  Reset object. [Thu Mar 28 09:08:03 2024] 
lengths(mz(res_rest))
#> [1] 4 3
lengths(mz(res))
#> [1] 2 0
lengths(mz(data))
#> [1] 4 3

## `reset()` after a `applyProcessing()` can not restore the data, because
## the data in the backend was changed. Similarly, `reset()` after any
## filter operations can not restore data for a `Spectra` with a
## `MsBackendMemory` or `MsBackendDataFrame`.
res_2 <- applyProcessing(res)
res_rest <- reset(res_2)
lengths(mz(res))
#> [1] 2 0
lengths(mz(res_rest))
#> [1] 2 0


## Compare spectra: comparing spectra 2 and 3 against spectra 10:20 using
## the normalized dotproduct method.
res <- compareSpectra(sciex_im[2:3], sciex_im[10:20])
## first row contains comparisons of spectrum 2 with spectra 10 to 20 and
## the second row comparisons of spectrum 3 with spectra 10 to 20
res
#>          10        11        12        13        14        15        16
#> 2 0.8583577 0.8603472 0.8588292 0.8434391 0.8581683 0.8515891 0.8568266
#> 3 0.8620081 0.8609868 0.8604669 0.8481934 0.8637094 0.8586378 0.8618227
#>          17        18        19        20
#> 2 0.8563933 0.8559511 0.8546560 0.8538058
#> 3 0.8559523 0.8652169 0.8585325 0.8639445

## To use a simple Pearson correlation instead we can define a function
## that takes the two peak matrices and calculates the correlation for
## their second columns (containing the intensity values).
correlateSpectra <- function(x, y, use = "pairwise.complete.obs", ...) {
    cor(x[, 2], y[, 2], use = use)
}
res <- compareSpectra(sciex_im[2:3], sciex_im[10:20],
    FUN = correlateSpectra)
res
#>          10        11        12        13        14        15        16
#> 2 0.9974395 0.9989580 0.9973735 0.9974334 0.9987225 0.9992307 0.9978621
#> 3 0.9986971 0.9976861 0.9964951 0.9965942 0.9982105 0.9984607 0.9982808
#>          17        18        19        20
#> 2 0.9950464 0.9987036 0.9988265 0.9970657
#> 3 0.9921029 0.9990331 0.9975400 0.9955439

## Use compareSpectra to determine the number of common (matching) peaks
## with a ppm of 10:
## type = "inner" uses a *inner join* to match peaks, i.e. keeps only
## peaks that can be mapped betwen both spectra. The provided FUN returns
## simply the number of matching peaks.
compareSpectra(sciex_im[2:3], sciex_im[10:20], ppm = 10, type = "inner",
    FUN = function(x, y, ...) nrow(x))
#>    10  11  12  13  14  15  16  17  18  19  20
#> 2 530 501 484 576 592 515 549 578 539 542 550
#> 3 526 548 524 592 639 502 589 571 575 564 613

## Apply an arbitrary function to each spectrum in a Spectra.
## In the example below we calculate the mean intensity for each spectrum
## in a subset of the sciex_im data. Note that we can access all variables
## of each individual spectrum either with the `$` operator or the
## corresponding method.
res <- spectrapply(sciex_im[1:20], FUN = function(x) mean(x$intensity[[1]]))
head(res)
#> $`1`
#> [1] 1553.953
#> 
#> $`2`
#> [1] 678.2289
#> 
#> $`3`
#> [1] 684.3569
#> 
#> $`4`
#> [1] 682.1004
#> 
#> $`5`
#> [1] 780.6867
#> 
#> $`6`
#> [1] 737.5087
#> 

## It is however important to note that dedicated methods to access the
## data (such as `intensity`) are much more efficient than using `lapply()`:
res <- lapply(intensity(sciex_im[1:20]), mean)
head(res)
#> [[1]]
#> [1] 1553.953
#> 
#> [[2]]
#> [1] 678.2289
#> 
#> [[3]]
#> [1] 684.3569
#> 
#> [[4]]
#> [1] 682.1004
#> 
#> [[5]]
#> [1] 780.6867
#> 
#> [[6]]
#> [1] 737.5087
#> 

## As an alternative, applying a function `FUN` to a `Spectra` can be
## performed *chunk-wise*. The advantage of this is, that only the data for
## one chunk at a time needs to be loaded into memory reducing the memory
## demand. This type of processing can be performed by specifying the size
## of the chunks (i.e. number of spectra per chunk) with the `chunkSize`
## parameter
spectrapply(sciex_im[1:20], lengths, chunkSize = 5L)
#>  [1]  578 1529 1600 1664 1417 1602 1468 1440 1496 1431 1489 1349 1650 1697 1413
#> [16] 1593 1560 1477 1536 1491

## ---- DATA EXPORT ----

## Some `MsBackend` classes provide an `export()` method to export the data
## to the file format supported by the backend.
## The `MsBackendMzR` for example allows to export MS data to mzML or
## mzXML file(s), the `MsBackendMgf` (defined in the MsBackendMgf R package)
## would allow to export the data in mgf file format.
## Below we export the MS data in `data`. We call the `export()` method on
## this object, specify the backend that should be used to export the data
## (and which also defines the output format) and provide a file name.
fl <- tempfile()
export(data, MsBackendMzR(), file = fl)

## This exported our data in mzML format. Below we read the first 6 lines
## from that file.
readLines(fl, n = 6)
#> [1] "<?xml version=\"1.0\" encoding=\"utf-8\"?>"                                                                                                                                                                                            
#> [2] "<indexedmzML xmlns=\"http://psi.hupo.org/ms/mzml\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.2_idx.xsd\">"                 
#> [3] "  <mzML xmlns=\"http://psi.hupo.org/ms/mzml\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd\" id=\"\" version=\"1.1.0\">"
#> [4] "    <cvList count=\"2\">"                                                                                                                                                                                                              
#> [5] "      <cv id=\"MS\" fullName=\"Proteomics Standards Initiative Mass Spectrometry Ontology\" version=\"4.1.117\" URI=\"https://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo\"/>"                                      
#> [6] "      <cv id=\"UO\" fullName=\"Unit Ontology\" version=\"09:04:2014\" URI=\"https://raw.githubusercontent.com/bio-ontology-research-group/unit-ontology/master/unit.obo\"/>"                                                           

## If only a single file name is provided, all spectra are exported to that
## file. To export data with the `MsBackendMzR` backend to different files, a
## file name for each individual spectrum has to be provided.
## Below we export each spectrum to its own file.
fls <- c(tempfile(), tempfile())
export(data, MsBackendMzR(), file = fls)

## Reading the data from the first file
res <- Spectra(backendInitialize(MsBackendMzR(), fls[1]))

mz(res)
#> NumericList of length 1
#> [[1]] 100 103.2 104.3 106.5
mz(data)
#> NumericList of length 2
#> [[1]] 100 103.2 104.3 106.5
#> [[2]] 45.6 120.4 190.2

## ---- PEAKS VARIABLES AND DATA ----

## Some `MsBackend` classes provide support for arbitrary peaks variables
## (in addition to the mandatory `"mz"` and `"intensity"` values. Below
## we create a simple data frame with an additional peak variable `"pk_ann"`
## and create a `Spectra` with a `MsBackendMemory` for that data.
## Importantly the number of values (per spectrum) need to be the same
## for all peak variables.

tmp <- data.frame(msLevel = c(2L, 2L), rtime = c(123.2, 123.5))
tmp$mz <- list(c(103.1, 110.4, 303.1), c(343.2, 453.1))
tmp$intensity <- list(c(130.1, 543.1, 40), c(0.9, 0.45))
tmp$pk_ann <- list(c(NA_character_, "A", "P"), c("B", "P"))

## Create the Spectra. With parameter `peaksVariables` we can define
## the columns in `tmp` that contain peaks variables.
sps <- Spectra(tmp, source = MsBackendMemory(),
    peaksVariables = c("mz", "intensity", "pk_ann"))
peaksVariables(sps)
#> [1] "mz"        "intensity" "pk_ann"   

## Extract just the m/z and intensity values
peaksData(sps)[[1L]]
#>         mz intensity
#> [1,] 103.1     130.1
#> [2,] 110.4     543.1
#> [3,] 303.1      40.0

## Extract the full peaks data
peaksData(sps, columns = peaksVariables(sps))[[1L]]
#>      mz intensity pk_ann
#> 1 103.1     130.1   <NA>
#> 2 110.4     543.1      A
#> 3 303.1      40.0      P

## Access just the pk_ann variable
sps$pk_ann
#> [[1]]
#> [1] NA  "A" "P"
#> 
#> [[2]]
#> [1] "B" "P"
#>