diff --git a/DESCRIPTION b/DESCRIPTION index 855f9341952c7d03c5b4a22c6d9eb7f2a6e3bc0f..5e4961f976d57082891aa25b18f0bb9a3515b702 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,6 +35,7 @@ Suggests: ontoCAT, RUnit Collate: + 'alternateAnalyze.R' 'createMassBank.R' 'formulaCalculator.R' 'leCsvAccess.R' @@ -54,5 +55,6 @@ Collate: 'validateMassBank.R' 'tools.R' 'msmsRead.R' + 'Isotopic_Annotation.R' 'zzz.R' diff --git a/NAMESPACE b/NAMESPACE index d07a6625d936af0a564602321f5344aec8d788cf..6a3f364be3237a7583d554af88623668e65afe33 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,4 @@ -# Generated by roxygen2 (4.1.0): do not edit by hand +# Generated by roxygen2 (4.1.1): do not edit by hand S3method(c,msmsWSspecs) export(CTS.externalIdSubset) @@ -13,9 +13,11 @@ export(addProperty) export(aggregateSpectra) export(analyzeMsMs) export(analyzeMsMs.formula) +export(analyzeMsMs.formula.optimized) export(analyzeMsMs.intensity) export(annotator.default) export(archiveResults) +export(checkIsotopes) export(checkSpectra) export(cleanElnoise) export(combineMultiplicities) @@ -40,6 +42,7 @@ export(findMsMsHR.direct) export(findMsMsHR.mass) export(findMsMsHR.ticms2) export(findMsMsHR.ticms2.d) +export(findMsMsHRperxcms) export(findMsMsHRperxcms.direct) export(findMz) export(findMz.formula) diff --git a/R/Isotopic_Annotation.R b/R/Isotopic_Annotation.R new file mode 100644 index 0000000000000000000000000000000000000000..544f97fd424c454d897f5c0717055264338ee011 --- /dev/null +++ b/R/Isotopic_Annotation.R @@ -0,0 +1,377 @@ +#' Checks for isotopes in a \code{msmsWorkspace} +#' +#' @param w A \code{msmsWorkspace} to work with. +#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +#' @param intensity_cutoff The cutoff (as an absolute intensity value) under which isotopic peaks shouldn't be checked for or accepted as valid. +#' @param intensity_precision The difference that is accepted between the calculated and observed intensity of a possible isotopic peak. Further details down below. +#' @param conflict Either "isotopic"(Peak formulas are always chosen if they fit the requirements for an isotopic peak) +#' or "strict"(Peaks are only marked as isotopic when there hasn't been a formula assigned before.) +#' @param isolationWindow The isolation window in Da +#' @param evalMode Currently no function yet, but planned +#' @param plotSpectrum A boolean specifiying whether the spectrumshould be plotted +#' @param settings Options to be used for processing. Defaults to the options loaded via +#' \code{\link{loadRmbSettings}} et al. Refer to there for specific settings. +#' @details text describing parameter inputs in more detail. +#' \itemize{ +#' \item{\code{intensity_precision}}{This parameter determines how strict the intensity values should adhere to the calculated intensity in relation to the parent peak. +#' Options for this parameter are \code{"none"}, where the intensity is irrelevant, \code{"low"}, which has an error margin of 70\% and \code{"high"}, where the error margin +#' is set to 35\%. The recommended setting is \code{"low"}, but can be changed to adjust to the intensity precision of the mass spectrometer.} +# \item{\code{evalMode}}{This parameter sets what should be done after the isotopic check. The option "add" adds failpeaks if they are isotopic peaks of previously matched peaks. +# "check" checks matched peaks with formulas for isotopes and removes them if no isotopic peaks have been found for all formulas. The formula is also +# adjusted, if the one with matching isotopes doesn't have the lowest dppm. "complete" does both.} +#' } +#' @return The \code{msmsWorkspace} with . +#' @author Michael Stravs, Eawag <michael.stravs@@eawag.ch> +#' @author Erik Mueller, UFZ +#' @export +checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 5000, intensity_precision = "low", conflict = "dppm", + isolationWindow = 4, evalMode = "complete", plotSpectrum = TRUE, settings = getOption("RMassBank")){ + + # Load library and data + require(enviPat) + data("isotopes") + + + if(!(intensity_precision %in% c("none","low","high"))){ + stop('intensity_precision must be specified as either "none", "low" or "high"') + } + + switch(intensity_precision, + none={ + precisionVal <- Inf + }, + low={ + precisionVal <- 0.7 + }, + high={ + precisionVal <- 0.35 + } + ) + + # Load filtersettings + filterSettings = settings$filterSettings + + # Assign formula additions according to code + if(mode == "pH") { + allowed_additions <- "H" + mode.charge <- 1 + } else if(mode == "pNa") { + allowed_additions <- "Na" + mode.charge <- 1 + } else if(mode == "pM") { + allowed_additions <- "" + mode.charge <- 1 + } else if(mode == "mM") { + allowed_additions <- "" + mode.charge <- -1 + } else if(mode == "mH") { + allowed_additions <- "H-1" + mode.charge <- -1 + } else if(mode == "mFA") { + allowed_additions <- "C2H3O2" + mode.charge <- -1 + } else if(mode == "pNH4") { + allowed_additions <- "NH4" + mode.charge <- 1 + } else{ + stop("mode = \"", mode, "\" not defined") + } + + if(!(evalMode %in% c("complete"))){ + stop('evalMode must currently be specified as "complete"') + } + + evalMode <- c("add","check") + + + # "default" isotopes (i.e. those with the highest abundance) + defIsotopes <- c("107Ag", "27Al", "40Ar", "75As", "197Au", "11B", "138Ba", "9Be", "209Bi", + "79Br", "12C", "40Ca", "114Cd", "140Ce", "35Cl", "59Co", "52Cr", "133Cs", "63Cu", "164Dy", + "166Er", "153Eu", "19F", "56Fe", "69Ga", "158Gd", "74Ge", "1H", "4He", "180Hf", "202Hg", + "165Ho", "127I", "115In", "193Ir", "39K", "84Kr", "139La", "7Li", "175Lu", "24Mg", "55Mn", + "98Mo", "14N", "23Na", "93Nb", "142Nd", "20Ne", "58Ni", "16O", "192Os", "31P", "231Pa", + "208Pb", "106Pd", "141Pr", "195Pt", "85Rb", "187Re", "103Rh", "102Ru", "32S", "121Sb", + "45Sc", "80Se", "28Si", "152Sm", "120Sn", "86Sr", "88Sr", "181Ta", "159Tb", "130Te", + "232Th", "48Ti", "205Tl", "169Tm", "238U", "51V", "184W", "132Xe", "89Y", "174Yb", + "64Zn", "90Zr") + + + # Get the ppm limit from the settings + ppmlimit <- 2.25 * filterSettings$ppmFine + + # Get the cutoff from the settings + cut <- filterSettings$fineCut + cut_ratio <- filterSettings$fineCutRatio + + # Extract matched and unmatched peaks from the aggregated peaks + matchedPeaks <- peaksMatched(w) + unmatchedPeaks <- peaksUnmatched(w) + + wEnv <- environment() + + # lapply over all runs + lapply(w@spectra, function(spec){ + + # Find parent formula and cpdID + parent_formula <- add.formula(spec@formula, allowed_additions) + id <- as.numeric(spec@id) + + + # lapply over all extracted MS2 spectra + lapply(spec@children, function(msmsdata){ + + # Extract currently relevant peaks + currentMPeaks <- matchedPeaks[(matchedPeaks$cpdID == id) & (matchedPeaks$scan == msmsdata@acquisitionNum),,drop=FALSE] + currentUPeaks <- unmatchedPeaks[(unmatchedPeaks$cpdID == id) & (unmatchedPeaks$scan == msmsdata@acquisitionNum),,drop=FALSE] + + rownames(currentMPeaks) <- 1:nrow(currentMPeaks) + rownames(currentUPeaks) <- 1:nrow(currentUPeaks) + + # Generate isotopic patterns of the matched peaks + # Sort pattern by abundance + isoMPatterns <- lapply(currentMPeaks$formula, function(formula){ + # Find pattern + pattern <- as.data.frame(isopattern(formula, isotopes = isotopes)[[1]]) + mass <- findMz.formula(formula,"")$mzCenter + + # Find index of nonisotopic molecule + mainIndex <- which.min(abs(pattern[,"m/z"]-mass)) + + isoCols <- colnames(pattern)[3:ncol(pattern)] + + pattern[,"abundance"] <- round(pattern[,"abundance"] * (100/pattern[mainIndex,"abundance"]),3) + defAtoms <- which(isoCols %in% defIsotopes) + + # Indices of isotope notation string to find atomname + + atomIndices <- regexpr("[A-Z][a-z]?",isoCols) + + + newnames <- vector() + + for(i in 1:length(atomIndices)){ + if(i %in% defAtoms){ + newnames[i] <- substr(isoCols[i],atomIndices[i],nchar(isoCols[i])) + }else{ + lhs <- paste0('^(.{', atomIndices[i]-1, '})(.+)$') + newnames[i] <- gsub(lhs, '\\1]\\2', isoCols[i]) + newnames[i] <- gsub('^(.{0})(.+)$', '\\1[\\2', newnames[i]) + } + } + + pattern$formula <- apply(pattern,1,function(p){ + paste0(sapply(3:ncol(pattern),function(isoIndex){ + if(p[isoIndex] == 0){ + return("") + } + return(paste0(newnames[isoIndex-2],p[isoIndex])) + }),collapse="") + }) + pattern <- pattern[order(pattern[,"abundance"],decreasing = T),][-mainIndex,] + }) + + names(isoMPatterns) <- currentMPeaks$formula + + + # Order patterns by abundance and make sure that the normal abundance is at 100% + + # Calculate the expected intensities according to the abundance + # See which expected isotope peaks have an intensity higher than the cutoff + + for(foundAnnotation in 1:nrow(currentMPeaks)){ + intensities <- vector() + + for(pattern in 1:nrow(isoMPatterns[[foundAnnotation]])){ + intensities[pattern] <- currentMPeaks$int[foundAnnotation] * isoMPatterns[[foundAnnotation]]$abundance[pattern]/100 + } + + roundedInt <- round(intensities,digits=-2) + keepIndex <- which(roundedInt >= intensity_cutoff) + isoMPatterns[[foundAnnotation]] <- isoMPatterns[[foundAnnotation]][keepIndex,,drop=FALSE] + if(nrow(isoMPatterns[[foundAnnotation]])){ + isoMPatterns[[foundAnnotation]]$minintensity <- roundedInt[keepIndex] - roundedInt[keepIndex] * precisionVal + isoMPatterns[[foundAnnotation]]$maxintensity <- roundedInt[keepIndex] + roundedInt[keepIndex] * precisionVal + # Calculate the expected mz range of the isotope peak + isoMPatterns[[foundAnnotation]] <- cbind(isoMPatterns[[foundAnnotation]],t(sapply(isoMPatterns[[foundAnnotation]][,"m/z"], ppm, dppm=ppmlimit,l=T))) + } + } + + # Which isotope patterns still have theoretical intensities above the cutoff? + peaksToCheck <- which(as.logical(sapply(isoMPatterns,nrow))) + + # Now, look for isotopic patterns in unmatched peaks with all specified parameters + if("add" %in% evalMode){ + # Which peaks have no correct formula annotation as of now? + currentUPeaks <- currentUPeaks[which(is.na(currentUPeaks$dppm) | abs(currentUPeaks$dppmBest) > ppmlimit/2.25),] + + # If there are any peaks without annotation: + if(nrow(currentUPeaks)){ + j <- 1 + UPList <- list() + # Iterate over all patterns that have relevant intensities + for(i in peaksToCheck){ + UPList[[j]] <- list() + # Iterate over every isotopic permutation that is still in the pattern + for(k in 1:nrow(isoMPatterns[[i]])){ + # Find peaks that fit the specified intensities and m/z + pIndex <- which(currentUPeaks[,"mzFound"] < isoMPatterns[[i]][k,"1"] & currentUPeaks[,"mzFound"] > isoMPatterns[[i]][k,"2"] + & currentUPeaks[,"intensity"] < isoMPatterns[[i]][k,"maxintensity"] & currentUPeaks[,"intensity"] > isoMPatterns[[i]][k,"minintensity"] + ) + # Note these Peaks + UPList[[j]][[k]] <- currentUPeaks[pIndex,] + # If there are any: Change parameters in the aggregated matrix + if(nrow(UPList[[j]][[k]])){ + UPList[[j]][[k]]$dppm <- (currentUPeaks[pIndex,]$mzFound / isoMPatterns[[i]][k,"m/z"] - 1) * 1e6 + UPList[[j]][[k]]$mzCalc <- isoMPatterns[[i]][k,"m/z"] + UPList[[j]][[k]]$formula <- isoMPatterns[[i]][k,"formula"] + UPList[[j]][[k]]$matchedReanalysis <- NA + UPList[[j]][[k]]$filterOK <- TRUE + UPList[[j]][[k]]$good <- TRUE + UPList[[j]][[k]]$dppmBest <- UPList[[j]][[k]]$dppm + UPList[[j]][[k]]$formulaCount <- 1 + } + } + j <- j + 1 + } + } else{ + # If there are no peaks, fake an empty dataset + UPList <- list(list(data.frame()),list(data.frame())) + } + + + additionMatrix <- do.call(rbind, unlist(UPList,recursive=FALSE)) + if(nrow(additionMatrix)){ + # Add all the peaks that could be annotated as isotopic + wEnv$w@aggregated[additionMatrix$index,] <- additionMatrix + } + + # If conflict is "strict", end here + if(conflict == "strict"){ + if(plotSpectrum){ + plot(currentMPeaks$mzFound, currentMPeaks$intensity,type="h", main=paste(id,findName(id)), col="black", xlab="m/z", ylab="intensity", lwd=3) + if(nrow(additionMatrix)){ + points(additionMatrix$mzFound, additionMatrix$intensity,type="h", col="green", lwd=3) + } + } + return(0) + } + } + + + # Now check the matched peaks for the patterns + j <- 1 + MPList <- list() + for(i in peaksToCheck){ + MPList[[j]] <- list() + for(k in 1:nrow(isoMPatterns[[i]])){ + pIndex <- which(currentMPeaks[,"mzFound"] < isoMPatterns[[i]][k,"1"] & currentMPeaks[,"mzFound"] > isoMPatterns[[i]][k,"2"] + & currentMPeaks[,"intensity"] < isoMPatterns[[i]][k,"maxintensity"] & currentMPeaks[,"intensity"] > isoMPatterns[[i]][k,"minintensity"] + & currentMPeaks[,"filterOK"]) + MPList[[j]][[k]] <- currentMPeaks[pIndex,] + + if(nrow(MPList[[j]][[k]])){ + MPList[[j]][[k]]$dppm <- (currentMPeaks[pIndex,]$mzFound / isoMPatterns[[i]][k,"m/z"] - 1) * 1e6 + MPList[[j]][[k]]$mzCalc <- isoMPatterns[[i]][k,"m/z"] + MPList[[j]][[k]]$formula <- isoMPatterns[[i]][k,"formula"] + MPList[[j]][[k]]$matchedReanalysis <- NA + MPList[[j]][[k]]$filterOK <- TRUE + MPList[[j]][[k]]$good <- TRUE + MPList[[j]][[k]]$dppmBest <- MPList[[j]][[k]]$dppm + MPList[[j]][[k]]$formulaCount <- 1 + } + } + j <- j + 1 + } + + # Peakindices of peaksToCheck where no isotopes have been found in Unmatched Peaks + noIsoPeaksUindex <- which(!sapply(UPList, function(x) any(sapply(x,nrow)))) + + # Peakindices of peaksToCheck where isotopes have been found in Unmatched Peaks + IsoPeaksUindex <- setdiff(1:length(peaksToCheck),noIsoPeaksUindex) + + # Peakindices of peaksToCheck where no isotopes have been found in Matched Peaks + noIsoPeaksMindex <- which(!sapply(MPList, function(x) any(sapply(x,nrow)))) + + # Peakindices of peaksToCheck where isotopes have been found in Matched Peaks + IsoPeaksMindex <- setdiff(1:length(peaksToCheck),noIsoPeaksMindex) + + if("add" %in% evalMode && conflict=="isotopic"){ + correctionMatrix <- do.call(rbind, unlist(MPList,recursive=FALSE)) + correctionMatrix <- correctionMatrix[order(abs(correctionMatrix$dppm)),] + for(ind in unique(correctionMatrix$index)){ + if(length(which(correctionMatrix$index==ind))>1){ + correctionMatrix <- correctionMatrix[-which(correctionMatrix$index==ind)[-1],] + } + } + + if(nrow(correctionMatrix)){ + + # Peaks that are changed but also seem to have isotopes themselves + confPeaksIndex <- which(correctionMatrix$index %in% currentMPeaks[peaksToCheck[IsoPeaksMindex],"index"]) + conflictedMatrix <- correctionMatrix[confPeaksIndex,,drop=FALSE] + + if(length(confPeaksIndex)){ + correctionMatrix <- correctionMatrix[-confPeaksIndex,] + } + + + if(nrow(correctionMatrix)){ + wEnv$w@aggregated[correctionMatrix$index,] <- correctionMatrix + } + } + + if(!("check" %in% evalMode)){ + if(plotSpectrum){ + plot(currentMPeaks$mzFound, currentMPeaks$intensity,type="h", main=paste(id,findName(id)), col="black", xlab="m/z", ylab="intensity", lwd=3) + if(nrow(additionMatrix)){ + points(additionMatrix$mzFound, additionMatrix$intensity,type="h", col="green", lwd=3) + } + if(nrow(correctionMatrix)){ + points(correctionMatrix$mzFound, correctionMatrix$intensity,type="h", col="yellow", lwd=3) + } + if(nrow(conflictedMatrix)){ + points(conflictedMatrix$mzFound, conflictedMatrix$intensity,type="h", col="red", lwd=3) + } + } + return(0) + } + } + + if("check" %in% evalMode){ + # Matched Peaks where no isotopes have been found + noIsoPeaksMatrix <- currentMPeaks[peaksToCheck[noIsoPeaksMindex[which(noIsoPeaksMindex %in% noIsoPeaksUindex)]],] + + # Matched Peaks where no isotopes have been found and which haven't been marked as isotopic themselves + noIsoPeaksMatrix <- noIsoPeaksMatrix[which(!grepl("[",wEnv$w@aggregated$formula[noIsoPeaksMatrix$index],fixed=TRUE)),] + if(plotSpectrum){ + plot(currentMPeaks$mzFound, currentMPeaks$intensity,type="h", main=paste(id,findName(id)), col="black", xlab="m/z", ylab="intensity", lwd=3) + } + + if(nrow(noIsoPeaksMatrix)){ + wEnv$w@aggregated[noIsoPeaksMatrix$index,]$good <- FALSE + wEnv$w@aggregated[noIsoPeaksMatrix$index,]$filterOK <- FALSE + wEnv$w@aggregated[noIsoPeaksMatrix$index,]$matchedReanalysis <- FALSE + if(plotSpectrum){ + points(noIsoPeaksMatrix$mzFound, noIsoPeaksMatrix$intensity, type="h", col="orange", lwd=3) + } + } + + if("add" %in% evalMode && conflict=="isotopic" && plotSpectrum){ + if(nrow(additionMatrix)){ + points(additionMatrix$mzFound, additionMatrix$intensity,type="h", col="green", lwd=3) + } + if(nrow(correctionMatrix)){ + points(correctionMatrix$mzFound, correctionMatrix$intensity,type="h", col="yellow", lwd=3) + } + if(nrow(conflictedMatrix)){ + points(conflictedMatrix$mzFound, conflictedMatrix$intensity,type="h", col="red", lwd=3) + } + } + } + return(0) + + }) + }) + return(w) +} diff --git a/R/alternateAnalyze.R b/R/alternateAnalyze.R new file mode 100644 index 0000000000000000000000000000000000000000..98f8856f2f168cc106a78bb665946c733df2636e --- /dev/null +++ b/R/alternateAnalyze.R @@ -0,0 +1,471 @@ +fragDataIndexing <- function(fragData){ + index <- vector() + fragDataWholeMass <- floor(fragData$mass*100) + + k <- 0 + + for(i in 1:length(fragDataWholeMass)){ + while(k <= fragDataWholeMass[i]){ + index[k] <- i + k <- k + 1 + } + } + + return(index) +} + +is.sub.formula <- function(formula, targetAtomList){ + atomList <- formulastring.to.list(formula) + if(!all(names(atomList) %in% names(targetAtomList))){ + return(FALSE) + } + + for(atom in names(atomList)){ + if(atomList[[atom]] > targetAtomList[[atom]]){ + return(FALSE) + } + } + return(TRUE) +} + +newStep2WorkFlow <- function(w, mode="pH", + confirmMode=FALSE, progressbar = "progressBarHook", + settings = getOption("RMassBank"), analyzeMethod="formula", fragdataFile = NA){ + ##Load the fragment data (locally or from RMassBank package) + if(is.na(fragdataFile)){ + fragData <- nfragData + } else{ + fragData <- read.csv(fragdataFile,colClasses = c("character","numeric")) + } + + ##Progress bar + nLen <- length(w@files) + nProg <- 0 + message("msmsWorkflow: Step 2. First analysis pre recalibration") + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) + + ##Index the fragment data (for time reasons, "which" is very slow for large matrices) + fragDataIndex <- fragDataIndexing(fragData) + + + + w@analyzedSpecs <- lapply(w@specs, function(spec) { + #print(spec$id) + s <- analyzeMsMs.optimized(spec, mode=mode, detail=TRUE, run="preliminary", + filterSettings = settings$filterSettings, + spectraList = settings$spectraList, method = analyzeMethod, fragData = fragData, + fragDataIndex = fragDataIndex) + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(s) + }) + + for(f in w@files) + w@analyzedSpecs[[basename(as.character(f))]]$name <- basename(as.character(f)) + + suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE))) + return(w) +} + + +analyzeMsMs.optimized <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", + filterSettings = getOption("RMassBank")$filterSettings, + spectraList = getOption("RMassBank")$spectraList, method="formula", fragData, + fragDataIndex) +{ + .checkMbSettings() + + if(msmsPeaks$foundOK == FALSE) + return(list(foundOK = FALSE, id=msmsPeaks$id)) + + if(method=="formula") + { + return(analyzeMsMs.formula.optimized(msmsPeaks, mode, detail, run, filterSettings, + spectraList, fragData, fragDataIndex + )) + } + else if(method == "intensity") + { + return(analyzeMsMs.intensity(msmsPeaks, mode, detail, run, filterSettings, + spectraList + )) + } +} + + +#' @export +analyzeMsMs.formula.optimized <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", + filterSettings = getOption("RMassBank")$filterSettings, + spectraList = getOption("RMassBank")$spectraList, fragData, fragDataIndex) +{ + cut <- 0 + cut_ratio <- 0 + if(run=="preliminary") + { + mzColname <- "mz" + filterMode <- "coarse" + cut <- filterSettings$prelimCut + if(is.na(cut)) + { + if(mode %in% c("pH", "pM", "pNa", "pNH4")) + cut <- 1e4 + else if(mode %in% c("mH", "mFA","mM")) + cut <- 0 + else stop(paste("The ionization mode", mode, "is unknown.")) + } + cutRatio <- filterSettings$prelimCutRatio + } else{ + mzColname <- "mzRecal" + filterMode <- "fine" + cut <- filterSettings$fineCut + cut_ratio <- filterSettings$fineCutRatio + if(is.na(cut)) cut <- 0 + } + + # find whole spectrum of parent peak, so we have reasonable data to feed into + # MolgenMsMs + parentSpectrum <- msmsPeaks$parentPeak + + + # Check whether the spectra can be fitted to the spectra list correctly! + if(nrow(msmsPeaks$childHeaders) != length(spectraList)) + { + warning(paste0( + "The spectra count of the substance ", msmsPeaks$id, " (", nrow(msmsPeaks$childHeaders), " spectra) doesn't match the provided spectra list (", length(spectraList), " spectra)." + )) + return(list(specOK=FALSE)) + + } + + # here follows the Rcdk analysis + #------------------------------------ + parentPeaks <- data.frame(mzFound=msmsPeaks$mz$mzCenter, + formula=msmsPeaks$formula, + dppm=0, + x1=0,x2=0,x3=0) + + # define the adduct additions + if(mode == "pH") { + allowed_additions <- "H" + mode.charge <- 1 + } else if(mode == "pNa") { + allowed_additions <- "Na" + mode.charge <- 1 + } else if(mode == "pM") { + allowed_additions <- "" + mode.charge <- 1 + } else if(mode == "mM") { + allowed_additions <- "" + mode.charge <- -1 + } else if(mode == "mH") { + allowed_additions <- "H-1" + mode.charge <- -1 + } else if(mode == "mFA") { + allowed_additions <- "C2H3O2" + mode.charge <- -1 + } else if(mode == "pNH4") { + allowed_additions <- "NH4" + mode.charge <- 1 + } else{ + stop("mode = \"", mode, "\" not defined") + } + + + # the ppm range is two-sided here. + # The range is slightly expanded because dppm calculation of + # generate.formula starts from empirical mass, but dppm cal- + # culation of the evaluation starts from theoretical mass. + # So we don't miss the points on 'the border'. + + if(run=="preliminary") + ppmlimit <- 2 * max(filterSettings$ppmLowMass, filterSettings$ppmHighMass) + else + ppmlimit <- 2.25 * filterSettings$ppmFine + + parent_formula <- add.formula(msmsPeaks$formula, allowed_additions) + dbe_parent <- dbe(parent_formula) + # check whether the formula is valid, i.e. has no negative or zero element numbers. + #print(parent_formula) + if(!is.valid.formula(parent_formula)) + return(list(specOK=FALSE)) + limits <- to.limits.rcdk(parent_formula) + + parentAtomList <- formulastring.to.list(parent_formula) + + rAtoms <- which(c(!grepl("Br",parent_formula),!grepl("N",parent_formula), !grepl("S",parent_formula),!grepl("Cl",parent_formula))) + + if(length(rAtoms)){ + newfragData <- fragData[-which(apply(occurrenceMatrix[,rAtoms,drop=FALSE], 1, any)),] + newfragDataIndex <- fragDataIndexing(newfragData) + } else{ + newfragData <- fragData + newfragDataIndex <- fragDataIndex + } + + + # On each spectrum the following function analyzeTandemShot will be applied. + # It takes the raw peaks matrix as argument (mz, int) and processes the spectrum by + # filtering out low-intensity (<1e4) and shoulder peaks (deltam/z < 0.5, intensity + # < 5%) and subsequently matching the peaks to formulas using Rcdk, discarding peaks + # with insufficient match accuracy or no match. + + analyzeTandemShot <- function(shot_mat) + { + shot <- as.data.frame(shot_mat) + shot_orig <- shot + # Filter out low intensity peaks: + shot_lo <- shot[(shot$int < cut) | (shot$int < max(shot$int) * cut_ratio),] + shot <- shot[(shot$int >= cut) & (shot$int > max(shot$int) * cut_ratio),] + shot_full <- shot + + # Is there still anything left? + if(nrow(shot)==0) + return(list(specOK=FALSE)) + + # Filter out satellite peaks: + shot <- filterPeakSatellites(shot, filterSettings) + shot_satellite_n <- setdiff(row.names(shot_full), row.names(shot)) + shot_satellite <- shot_full[shot_satellite_n,] + + # Is there still anything left? + if(nrow(shot)==0) + return(list(specOK=FALSE)) + + if(max(shot$int) < as.numeric(filterSettings$specOkLimit)) + return(list(specOK=FALSE)) + # Crop to 4 digits (necessary because of the recalibrated values) + shot[,mzColname] <- round(shot[,mzColname], 5) + + + # check whether formula can be fitted into precalculated fragment data + usePrecalcData <- is.sub.formula(parent_formula,list("C"=50,"H"=70,"N"=50,"O"=50,"S"=10,"Cl"=10,"Br"=10)) + + + if(usePrecalcData){ + + # if yes, split the peaks into 2 sets: Those which can be identified using the + # fragment data, and those which can (by maximum fragment Data m/z - 0.5) + + precalcIndex <- which(shot[,mzColname] < (max(newfragData$mass)-0.5)) + smallShots <- shot[precalcIndex,] + bigShots <- shot[-precalcIndex,] + + # for smaller peaks use the precalculated fragment data + if(nrow(smallShots)){ + system.time(peakmatrixSmall <- lapply(smallShots[,mzColname],function(mass){ + mass.calc <- mass + mode.charge * .emass + maxminMass <- ppm(mass.calc, ppmlimit, l=TRUE) + # retrieve only possibly relevant rows of the fragment Data (use the index) + indices <- newfragDataIndex[floor(maxminMass[2]*100)]:newfragDataIndex[ceiling(maxminMass[1]*100)] + + fragmentedFragmentData <- newfragData[indices,] + + # narrow it down further using the ppm + fragmentedFragmentData <- fragmentedFragmentData[which(fragmentedFragmentData$mass > maxminMass[2] & fragmentedFragmentData $mass < maxminMass[1]),] + + # return nothing if the narrowed down data is empty at this point + if(!nrow(fragmentedFragmentData)){ + return(t(c(mzFound=as.numeric(as.character(mass)), + formula=NA, mzCalc=NA))) + } + + + # find out if the narrowed down fragments have a sub-formula of the parentformula + # return the indexes of relevant formulas + fragmentedFragmentData <- fragmentedFragmentData[which(sapply(fragmentedFragmentData$formula, function(currentFormula) is.sub.formula(currentFormula,parentAtomList))),] + + # return nothing if the narrowed down data is empty at this point + if(!nrow(fragmentedFragmentData)){ + return(t(c(mzFound=as.numeric(as.character(mass)), + formula=NA, mzCalc=NA))) + } + + return(t(sapply(1:nrow(fragmentedFragmentData), function(f) + { + mzCalc <- fragmentedFragmentData$mass[f] - mode.charge * .emass + c(mzFound=as.numeric(as.character(mass)), + formula=fragmentedFragmentData$formula[f], + mzCalc=mzCalc) + }))) + + })) + } else{ + peakmatrixSmall <- list() + } + + # for bigger peaks use generate.formula from rcdk + if(nrow(bigShots)){ + system.time(peakmatrixBig <- lapply(bigShots[,mzColname], function(mass){ + # Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae + # finally back-correct calculated masses for the charge + mass.calc <- mass + mode.charge * .emass + peakformula <- tryCatch(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), + limits, charge=0), error=function(e) NA) + #peakformula <- tryCatch( + # generate.formula(mass, + # ppm(mass, ppmlimit, p=TRUE), + # limits, charge=1), + #error= function(e) list()) + if(!is.list(peakformula)){ + return(t(c(mzFound=as.numeric(as.character(mass)), + formula=NA, mzCalc=NA))) + }else{ + return(t(sapply(peakformula, function(f) + { + mzCalc <- f@mass - mode.charge * .emass + c(mzFound=as.numeric(as.character(mass)), + formula=f@string, + mzCalc=mzCalc) + }))) + } + })) + } else{ + peakmatrixBig <- list() + } + + peakmatrix <- c(peakmatrixSmall,peakmatrixBig) + + } else{ + system.time(peakmatrix <- lapply(shot[,mzColname], function(mass){ + # Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae + # finally back-correct calculated masses for the charge + mass.calc <- mass + mode.charge * .emass + peakformula <- tryCatch(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), + limits, charge=0), error=function(e) NA) + #peakformula <- tryCatch( + # generate.formula(mass, + # ppm(mass, ppmlimit, p=TRUE), + # limits, charge=1), + #error= function(e) list()) + if(!is.list(peakformula)){ + return(t(c(mzFound=as.numeric(as.character(mass)), + formula=NA, mzCalc=NA))) + }else + { + return(t(sapply(peakformula, function(f) + { + mzCalc <- f@mass - mode.charge * .emass + c(mzFound=mass, + formula=f@string, + mzCalc=mzCalc) + }))) + } + })) + } + + childPeaks <- as.data.frame(do.call(rbind, peakmatrix)) + childPeaks$mzFound <- as.numeric(as.character(childPeaks$mzFound)) + childPeaks$formula <- as.character(childPeaks$formula) + childPeaks$mzCalc <- as.numeric(as.character(childPeaks$mzCalc)) + childPeaks$dppm <- (childPeaks$mzFound / childPeaks$mzCalc - 1) * 1e6 + # delete the NA data out again, because MolgenMsMs doesn't have them + # here and they will be re-added later + # (this is just left like this for "historical" reasons) + childPeaks <- childPeaks[!is.na(childPeaks$formula),] + # check if a peak was recognized (here for the first time, + # otherwise the next command would fail) + if(nrow(childPeaks)==0) + return(list(specOK=FALSE)) + + # now apply the rule-based filters to get rid of total junk: + # dbe >= -0.5, dbe excess over mother cpd < 3 + childPeaks$dbe <- unlist(lapply(childPeaks$formula, dbe)) + #iff_rcdk_pM_eln$maxvalence <- unlist(lapply(diff_rcdk_pM_eln$formula.rcdk, maxvalence)) + childPeaks <- childPeaks[childPeaks$dbe >= filterSettings$dbeMinLimit,] + # & dbe < dbe_parent + 3) + + # check if a peak was recognized + if(nrow(childPeaks)==0) + return(list(specOK=FALSE)) + + # trim mz to 5 digits + shot[,mzColname] <- round(shot[,mzColname], 5) + + childPeaksInt <- merge(childPeaks, shot, by.x = "mzFound", by.y = mzColname, all.x = TRUE, all.y = FALSE ) + # find the best ppm value + bestPpm <- aggregate(childPeaksInt$dppm, list(childPeaksInt$mzFound), + function(dppm) dppm[[which.min(abs(dppm))]]) + colnames(bestPpm) <- c("mzFound", "dppmBest") + childPeaksInt <- merge(childPeaksInt, bestPpm, by="mzFound", all.x=TRUE) + # count formulas found per mass + countFormulasTab <- xtabs( ~formula + mzFound, data=childPeaksInt) + countFormulas <- colSums(countFormulasTab) + childPeaksInt$formulaCount <- countFormulas[as.character(childPeaksInt$mzFound)] + # filter results + childPeaksFilt <- filterLowaccResults(childPeaksInt, filterMode, filterSettings) + childPeaksGood <- childPeaksFilt[["TRUE"]] + childPeaksBad <- childPeaksFilt[["FALSE"]] + # count formulas within new limits + # (the results of the "old" count stay in childPeaksInt and are returned + # in $childPeaks) + if(!is.null(childPeaksGood)) + { + countFormulasTab <- xtabs( ~formula + mzFound, data=childPeaksGood) + countFormulas <- colSums(countFormulasTab) + childPeaksGood$formulaCount <- countFormulas[as.character(childPeaksGood$mzFound)] + } + + childPeaksUnmatched <- merge(childPeaks, shot, by.x = "mzFound", by.y = mzColname, + all.x = TRUE, all.y = TRUE ) + childPeaksUnmatched$dppmBest <- NA + childPeaksUnmatched$formulaCount <- 0 + childPeaksUnmatched$good <- FALSE + childPeaksUnmatched <- childPeaksUnmatched[is.na(childPeaksUnmatched$mzCalc),] + + # return list: + rl <- list( + specOK = !is.null(childPeaksGood), + parent = parentPeaks, + childFilt = childPeaksGood, + childRaw=shot_orig + ) + # if "detail" is set to TRUE, return more detailed results including + # all the deleted peaks and the stages when they were culled + if(detail) + { + rl$childRawLow = shot_lo + rl$childRawSatellite = shot_satellite + rl$childRawOK = shot + rl$child =childPeaksInt + rl$childBad = childPeaksBad + rl$childUnmatched = childPeaksUnmatched + } + return(rl) + } + shots <- lapply(msmsPeaks$peaks, analyzeTandemShot) + #browser() + shots <- mapply(function(shot, scan, info) + { + shot$scan <- scan + shot$info <- info + shot$header <- msmsPeaks$childHeaders[as.character(scan),] + return(shot) + }, shots, msmsPeaks$childScans, spectraList, SIMPLIFY=FALSE) + + mzranges <- t(sapply(shots, function(p) { + if(!is.null(p$childRaw)){ + return(range(p$childRaw[,mzColname])) + } else { + return(c(NA,NA)) + } + })) + + mzmin <- min(mzranges[,1], na.rm=TRUE) + mzmax <- max(mzranges[,2], na.rm=TRUE) + + return(list( + msmsdata=shots, + mzrange=c(mzmin, mzmax), + id=msmsPeaks$id, + mode=mode, + parentHeader = msmsPeaks$parentHeader, + parentMs = msmsPeaks$parentPeak, + formula = msmsPeaks$formula, + foundOK = TRUE)) +} + +findPeaksInFracData <- function(mass, fracData, fracDataIndex){ + +} \ No newline at end of file diff --git a/R/sysData.rda b/R/sysData.rda new file mode 100644 index 0000000000000000000000000000000000000000..7b22ce3cc76d4eb12ad0aadb20665a4f755212a4 Binary files /dev/null and b/R/sysData.rda differ diff --git a/man/checkIsotopes.Rd b/man/checkIsotopes.Rd new file mode 100644 index 0000000000000000000000000000000000000000..dd30f7184e80d24c34990cc6b799e291978782bc --- /dev/null +++ b/man/checkIsotopes.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/Isotopic_Annotation.R +\name{checkIsotopes} +\alias{checkIsotopes} +\title{Checks for isotopes in a \code{msmsWorkspace}} +\usage{ +checkIsotopes(w, mode = "pH", intensity_cutoff = 5000, + intensity_precision = "low", conflict = "dppm", isolationWindow = 4, + evalMode = "complete", plotSpectrum = TRUE, + settings = getOption("RMassBank")) +} +\arguments{ +\item{w}{A \code{msmsWorkspace} to work with.} + +\item{mode}{\code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-).} + +\item{intensity_cutoff}{The cutoff (as an absolute intensity value) under which isotopic peaks shouldn't be checked for or accepted as valid.} + +\item{intensity_precision}{The difference that is accepted between the calculated and observed intensity of a possible isotopic peak. Further details down below.} + +\item{conflict}{Either "isotopic"(Peak formulas are always chosen if they fit the requirements for an isotopic peak) +or "strict"(Peaks are only marked as isotopic when there hasn't been a formula assigned before.)} + +\item{isolationWindow}{The isolation window in Da} + +\item{evalMode}{Currently no function yet, but planned} + +\item{plotSpectrum}{A boolean specifiying whether the spectrumshould be plotted} + +\item{settings}{Options to be used for processing. Defaults to the options loaded via +\code{\link{loadRmbSettings}} et al. Refer to there for specific settings.} +} +\value{ +The \code{msmsWorkspace} with . +} +\description{ +Checks for isotopes in a \code{msmsWorkspace} +} +\details{ +text describing parameter inputs in more detail. +\itemize{ + \item{\code{intensity_precision}}{This parameter determines how strict the intensity values should adhere to the calculated intensity in relation to the parent peak. + Options for this parameter are \code{"none"}, where the intensity is irrelevant, \code{"low"}, which has an error margin of 70\% and \code{"high"}, where the error margin + is set to 35\%. The recommended setting is \code{"low"}, but can be changed to adjust to the intensity precision of the mass spectrometer.} +} +} +\author{ +Michael Stravs, Eawag <michael.stravs@eawag.ch> + +Erik Mueller, UFZ +} +