diff --git a/DESCRIPTION b/DESCRIPTION index 5f46eca1efab0e58a02c93982a074acc7f381d49..d92f4806a506325fc860be2cbf6743f5039fe043 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 1.99.7 +Version: 1.99.11 Authors@R: c( person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", role=c("cre")), @@ -28,8 +28,8 @@ Depends: Rcpp Encoding: UTF-8 Imports: - XML,RCurl,rjson, - rcdk,yaml,mzR,methods,Biobase,MSnbase,S4Vectors + XML,RCurl,rjson,S4Vectors,digest, + rcdk,yaml,mzR,methods,Biobase,MSnbase Suggests: gplots,RMassBankData, xcms (>= 1.37.1), @@ -41,6 +41,7 @@ Collate: 'alternateAnalyze.R' 'createMassBank.R' 'formulaCalculator.R' + 'getSplash.R' 'leCsvAccess.R' 'leMsMs.r' 'leMsmsRaw.R' diff --git a/NAMESPACE b/NAMESPACE index e2b792db594fda8fe65ad0f7d5483ef8faf69471..a589a8bf59e847da04320388d567b325827428df 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -34,6 +34,7 @@ export(filterPeaksMultiplicity) export(findCAS) export(findEIC) export(findFormula) +export(findLevel) export(findMass) export(findMsMsHR) export(findMsMsHR.direct) @@ -52,6 +53,7 @@ export(formulastring.to.list) export(gatherCompound) export(gatherData) export(gatherDataBabel) +export(gatherDataUnknown) export(gatherPubChem) export(gatherSpectrum) export(getCSID) @@ -129,6 +131,7 @@ import(RCurl) import(Rcpp) import(S4Vectors) import(XML) +import(digest) import(methods) import(mzR) import(rcdk) diff --git a/R/Isotopic_Annotation.R b/R/Isotopic_Annotation.R index 11e21c539a6a3a23236878f6a3244d259b56afef..760f20875c303499a329ef97491729188736f5ee 100644 --- a/R/Isotopic_Annotation.R +++ b/R/Isotopic_Annotation.R @@ -4,6 +4,7 @@ #' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions #' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). #' @param intensity_cutoff The cutoff (as an absolute intensity value) under which isotopic peaks shouldn't be checked for or accepted as valid. +#' Please note: The cutoff is not hard in the sense that it interacts with the intensity_precision parameter. #' @param intensity_precision The difference that is accepted between the calculated and observed intensity of a possible isotopic peak. Further details down below. #' @param conflict Either "isotopic"(Peak formulas are always chosen if they fit the requirements for an isotopic peak) #' or "strict"(Peaks are only marked as isotopic when there hasn't been a formula assigned before.) @@ -25,8 +26,8 @@ #' @author Michael Stravs, Eawag <michael.stravs@@eawag.ch> #' @author Erik Mueller, UFZ #' @export -checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 1000, intensity_precision = "low", conflict = "dppm", - isolationWindow = 4, evalMode = "complete", plotSpectrum = TRUE, settings = getOption("RMassBank")){ +checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 0, intensity_precision = "none", conflict = "strict", + isolationWindow = 2, evalMode = "complete", plotSpectrum = TRUE, settings = getOption("RMassBank")){ # Load library and data requireNamespace("enviPat",quietly=TRUE) @@ -78,12 +79,6 @@ checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 1000, intensity_pre stop("mode = \"", mode, "\" not defined") } - if(!(evalMode %in% c("complete"))){ - stop('evalMode must currently be specified as "complete"') - } - - evalMode <- c("add","check") - # "default" isotopes (i.e. those with the highest abundance) defIsotopes <- c("107Ag", "27Al", "40Ar", "75As", "197Au", "11B", "138Ba", "9Be", "209Bi", @@ -98,7 +93,7 @@ checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 1000, intensity_pre # Get the ppm limit from the settings - ppmlimit <- 2.25 * filterSettings$ppmFine + ppmlimit <- filterSettings$ppmFine # Get the cutoff from the settings cut <- filterSettings$fineCut @@ -118,12 +113,10 @@ checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 1000, intensity_pre id <- as.numeric(spec@id) specNum <- 0 specEnv <- environment() - # lapply over all extracted MS2 spectra lapply(spec@children, function(msmsdata){ specEnv$specNum <- specEnv$specNum + 1 - # Extract currently relevant peaks currentMPeaks <- matchedPeaks[(matchedPeaks$cpdID == id) & (matchedPeaks$scan == msmsdata@acquisitionNum),,drop=FALSE] currentUPeaks <- unmatchedPeaks[(unmatchedPeaks$cpdID == id) & (unmatchedPeaks$scan == msmsdata@acquisitionNum),,drop=FALSE] @@ -145,8 +138,10 @@ checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 1000, intensity_pre # Generate isotopic patterns of the matched peaks # sort out possible isotopic peaks according to # isolationwindow and intensity - isoMPatterns <- apply(currentMPeaks, 1, .findPattern, defIsotopes = defIsotopes, intensity_cutoff = intensity_cutoff, + isoMPatterns <- lapply(1:nrow(currentMPeaks), function(index){ + .findPattern(currentMPeaks[index,,drop=FALSE], defIsotopes = defIsotopes, intensity_cutoff = intensity_cutoff, precisionVal = precisionVal, ppmlimit = ppmlimit, isolationWindow = isolationWindow) + }) # Name the isopatterns names(isoMPatterns) <- currentMPeaks$formula @@ -155,8 +150,9 @@ checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 1000, intensity_pre # Which isotope patterns still have theoretical intensities above the cutoff? peaksToCheck <- which(as.logical(sapply(isoMPatterns,nrow))) + # If there are no peaks left, then abort for this spectrum if(!length(peaksToCheck)){ - message(paste0("The peaks of compound ", id, " in spectrum #", specEnv$specNum," are not intense enough to search for isotopic peaks")) + message(paste0("The already annotated peaks of compound ", id, " in spectrum #", specEnv$specNum," are not intense enough to search for isotopic peaks")) if(plotSpectrum){ plot(currentMPeaks$mzFound, currentMPeaks$intensity,type="h", main=paste(id,findName(id)), col="black", xlab="m/z", ylab="intensity", lwd=3) } @@ -166,135 +162,60 @@ checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 1000, intensity_pre # Now, look for isotopic patterns in unmatched peaks with all specified parameters # Which peaks have no formula annotation within the dppm as of now? - peaksNoAnnotation <- currentUPeaks[which(is.na(currentUPeaks$dppm) | abs(currentUPeaks$dppmBest) > ppmlimit/2.25),] + peaksNoAnnotation <- currentUPeaks[which(is.na(currentUPeaks$dppm) | abs(currentUPeaks$dppmBest) > ppmlimit),] + # What is the mean of the dppm of the currently "OK" peaks? + # (Used for calculating the score parameter for intensities) + dppmMean <- mean(abs(currentMPeaks[currentMPeaks$filterOK,]$dppm)) + # If there are any peaks without annotation: if(nrow(peaksNoAnnotation)){ - UPList <- .findMatchingPeaks(peaksNoAnnotation,isoMPatterns[peaksToCheck]) + UPList <- .findMatchingPeaks(peaksNoAnnotation,isoMPatterns[peaksToCheck], dppmMean) } else{ # If there are no peaks, fake an empty dataset - UPList <- list(list(data.frame(dummy=character())),list(data.frame(dummy=character()))) - + UPList <- list(matrix(character(0),0,29)) } - - # Generate matrix of peaks that should be added - additionMatrix <- .peakReasoner(UPList, currentMPeaks) - - # If conflict is "strict", end here + + # If conflict is "strict", end here (And plot, maybe) if(conflict == "strict"){ - if(nrow(additionMatrix)){ - wEnv$w@aggregated[additionMatrix$index,] <- additionMatrix - } - + # Generate matrix of peaks that should be added + additionMatrix <- .peakReasoner(list(matrix(character(0),0,29)) , UPList, currentMPeaks) if(plotSpectrum){ plot(currentMPeaks$mzFound, currentMPeaks$intensity,type="h", main=paste(id,findName(id)), col="black", xlab="m/z", ylab="intensity", lwd=3) if(nrow(additionMatrix)){ points(additionMatrix$mzFound, additionMatrix$intensity,type="h", col="green", lwd=3) - # Add all the peaks that could be annotated as isotopic - wEnv$w@aggregated[additionMatrix$index,] <- additionMatrix } } + # If there is something in the matrix + if(nrow(additionMatrix)){ + # Add all the peaks that could be annotated as isotopic + wEnv$w@aggregated[additionMatrix$index,] <- additionMatrix + } return(0) } # Now check the matched peaks for the patterns and put all peaks in a matrix - MPList <- .findMatchingPeaks(currentMPeaks[currentMPeaks$filterOK,], isoMPatterns[peaksToCheck]) + MPList <- .findMatchingPeaks(currentMPeaks[currentMPeaks$filterOK,], isoMPatterns[peaksToCheck], dppmMean) # Generate matrix of peaks that should be corrected - correctionMatrix <- .peakReasoner(MPList, currentMPeaks) - - IsoPeaksMindex <- which(sapply(MPList, function(x) any(sapply(x,nrow)))) - noIsoPeaksMindex <- which(!sapply(MPList, function(x) any(sapply(x,nrow)))) - IsoPeaksUindex <- which(sapply(UPList, function(x) any(sapply(x,nrow)))) - noIsoPeaksUindex <- which(!sapply(UPList, function(x) any(sapply(x,nrow)))) - - if(conflict=="isotopic"){ - if(nrow(correctionMatrix)){ - - # Peaks that are changed but also seem to have isotopes themselves - confPeaksIndex <- which(correctionMatrix$index %in% currentMPeaks$index[peaksToCheck[IsoPeaksMindex]]) - conflictedMatrix <- correctionMatrix[confPeaksIndex,,drop=FALSE] - - if(length(confPeaksIndex)){ - correctionMatrix <- correctionMatrix[-confPeaksIndex,] - } - - - if(nrow(correctionMatrix)){ - wEnv$w@aggregated[correctionMatrix$index,] <- correctionMatrix - } - } - - if(!("check" %in% evalMode)){ - if(plotSpectrum){ - plot(currentMPeaks$mzFound, currentMPeaks$intensity,type="h", main=paste(id,findName(id)), col="black", xlab="m/z", ylab="intensity", lwd=3) - if(nrow(additionMatrix)){ - points(additionMatrix$mzFound, additionMatrix$intensity,type="h", col="green", lwd=3) - } - if(nrow(correctionMatrix)){ - points(correctionMatrix$mzFound, correctionMatrix$intensity,type="h", col="yellow", lwd=3) - } - if(nrow(conflictedMatrix)){ - points(conflictedMatrix$mzFound, conflictedMatrix$intensity,type="h", col="red", lwd=3) - } - } - return(0) - } - } - - if("check" %in% evalMode){ - - if(plotSpectrum){ - plot(currentMPeaks$mzFound, currentMPeaks$intensity,type="h", main=paste(id,findName(id)), xlim=c(min(currentMPeaks$mzFound), max(currentMPeaks$mzFound)), col="black", xlab="m/z", ylab="intensity", lwd=3) - } - - indexList <- lapply(unique(currentMPeaks$mzFound[peaksToCheck]),function(mz){ - which(currentMPeaks$mzFound[peaksToCheck] == mz) - }) - - # Matched Peaks where no isotopes have been found - hasNoIsoIndex <- unlist(sapply(indexList,function(indices){ - if(any(IsoPeaksMindex %in% indices) || any(IsoPeaksUindex %in% indices)){ - return(vector()) - } else{ - return(currentMPeaks$index[peaksToCheck[indices]]) - } - })) - - # Matched Peaks which should have isotopes and are no isotopes themselves - isNoIsoIndex <- currentMPeaks$index[peaksToCheck[ - which(!(currentMPeaks$mzFound[peaksToCheck] %in% additionMatrix$mzFound) | !(currentMPeaks$mzFound[peaksToCheck] %in% correctionMatrix$mzFound)) - ]] - - noIsoPeaksMatrix <- wEnv$w@aggregated[intersect(hasNoIsoIndex,isNoIsoIndex),] - - - - if(nrow(noIsoPeaksMatrix)){ - wEnv$w@aggregated[noIsoPeaksMatrix$index,]$good <- FALSE - wEnv$w@aggregated[noIsoPeaksMatrix$index,]$filterOK <- FALSE - wEnv$w@aggregated[noIsoPeaksMatrix$index,]$matchedReanalysis <- FALSE - if(plotSpectrum){ - points(noIsoPeaksMatrix$mzFound, noIsoPeaksMatrix$intensity, type="h", col="orange", lwd=3) - } - } - - if("add" %in% evalMode && conflict=="isotopic" && plotSpectrum){ - if(nrow(additionMatrix)){ - points(additionMatrix$mzFound, additionMatrix$intensity,type="h", col="green", lwd=3) - } - if(nrow(correctionMatrix)){ - points(correctionMatrix$mzFound, correctionMatrix$intensity,type="h", col="yellow", lwd=3) - if(nrow(conflictedMatrix)){ - points(conflictedMatrix$mzFound, conflictedMatrix$intensity,type="h", col="red", lwd=3) - } - } - - } - } - return(0) + correctionMatrix <- .peakReasoner(MPList, UPList, currentMPeaks) + + + # If there is something in the matrix + if(nrow(correctionMatrix)){ + # Add all the peaks that could be annotated as isotopic + wEnv$w@aggregated[correctionMatrix$index,] <- correctionMatrix + } + # If the newly annotated peaks should be plotted, plot them + if(plotSpectrum){ + plot(currentMPeaks$mzFound, currentMPeaks$intensity,type="h", main=paste(id,findName(id)), col="black", xlab="m/z", ylab="intensity", lwd=3) + if(nrow(correctionMatrix)){ + points(correctionMatrix$mzFound, correctionMatrix$intensity,type="h", col="green", lwd=3) + } + } + return(0) }) }) return(w) @@ -304,63 +225,18 @@ checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 1000, intensity_pre # So modularize this function .findPattern <- function(aggregateRow, defIsotopes, intensity_cutoff, precisionVal, ppmlimit, isolationWindow){ - # Find pattern - capture.output(pattern <- as.data.frame(enviPat::isopattern(aggregateRow["formula"], isotopes = isotopes)[[1]])) - mass <- findMz.formula(as.character(aggregateRow["formula"]),"")$mzCenter + # Find pattern and mass + outp <- capture.output(pattern <- as.data.frame(enviPat::isopattern(aggregateRow[,"formula"], isotopes = isotopes)[[1]])) + rm("outp") + mass <- as.numeric(aggregateRow[,"mzCalc"]) # Find index of monoisotopic molecule and # normalize abundance that monoisotopic molecule has always "1" mainIndex <- which.min(abs(pattern[,"m/z"]-mass)) pattern[,"abundance"] <- round(pattern[,"abundance"] * (100/pattern[mainIndex,"abundance"]),3) - # Find all isotope atom names (only in colnames of patterns, sadly) - isoCols <- colnames(pattern)[3:ncol(pattern)] - - # Order the formulas to be in Hill system: - # First the Cs, then the Hs, then lexicographical - # First: Split isotope names so that there only are atoms - # Find position of C and H - splitNames <- gsub('^([0-9]+)(.+)$', '\\2', isoCols) - CHPos <- c(which(splitNames == "C"),which(splitNames == "H")) - - # Account for special case: No Cs or Hs - if(length(CHPos)){ - # If there are Cs and Hs, overwrite the positions so they always get sorted to the front - splitNames[CHPos] <- "" - } - - # Default isotopes are always first in the colnames, so no need to order internally between isotopes - atomOrder <- order(splitNames) - - # If there are Cs and Hs, the order must be preserved - if(length(CHPos)){ - atomOrder[1:length(CHPos)] <- CHPos - } - # else it is already ordered - - # Mark new names for formula creation. - newnames <- unname(sapply(isoCols, function(currentCol){ - if(currentCol %in% defIsotopes){ - # "Default" isotope (no isotope mass in formula, e.g. "C") - return(gsub('^(.{0})([0-9]+)(.+)$', '\\3', currentCol)) - }else{ - # Other isotopes (isotope mass in formula, e.g. "[13]C") - return(gsub('^(.{0})([0-9]+)(.+)$', '\\1[\\2]\\3', currentCol)) - } - })) - - # Generate the formula for every pattern - pattern$formula <- apply(pattern,1,function(p){ - paste0(sapply(atomOrder+2,function(isoIndex){ - if(p[isoIndex] == 0){ - return("") - } - if(p[isoIndex] == 1){ - return(newnames[isoIndex-2]) - } - return(paste0(newnames[isoIndex-2],p[isoIndex])) - }),collapse="") - }) + # Add the formula of every isotope to every row in the pattern + pattern <- .annotateFormulaToEnviPatTable(pattern, defIsotopes) # Sort pattern by abundance pattern <- pattern[order(pattern[,"abundance"],decreasing = T),][-mainIndex,] @@ -378,18 +254,21 @@ checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 1000, intensity_pre # Find the absolute intensity ranges intensities <- apply(pattern, 1, function(patternRow){ - as.numeric(aggregateRow["intensity"]) * as.numeric(patternRow["abundance"])/100 + as.numeric(aggregateRow[,"intensity"]) * as.numeric(patternRow["abundance"])/100 }) roundedInt <- round(intensities,digits=-2) - keepIntIndex <- which(roundedInt >= intensity_cutoff) + keepIntIndex <- which((roundedInt + roundedInt * precisionVal) >= intensity_cutoff) pattern <- pattern[keepIntIndex,,drop=FALSE] if(nrow(pattern)){ pattern$minintensity <- roundedInt[keepIntIndex] - roundedInt[keepIntIndex] * precisionVal pattern$maxintensity <- roundedInt[keepIntIndex] + roundedInt[keepIntIndex] * precisionVal - + pattern$expectedintensity <- roundedInt[keepIntIndex] + pattern$monoisotopicFormula <- aggregateRow[,"formula"] + pattern$monoisotopicIndex <- aggregateRow[,"index"] + pattern$monoisotopicMz <- aggregateRow[,"mzFound"] # Calculate the expected mz range of the isotope peak - mzCols <- t(sapply(pattern[,"m/z"], ppm, dppm=ppmlimit/2.25,l=T)) + mzCols <- t(sapply(pattern[,"m/z"], ppm, dppm=ppmlimit,l=T)) colnames(mzCols) <- c("maxMZ","minMZ") pattern <- cbind(pattern,mzCols) } @@ -397,7 +276,7 @@ checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 1000, intensity_pre return(pattern) } -.findMatchingPeaks <- function(aggregatedPeakMatrix, isoPatterns){ +.findMatchingPeaks <- function(aggregatedPeakMatrix, isoPatterns, dppmMean){ # Only unique mzs (peaks can be annotated multiple times and will therefore be included multiple times) # It doesn't matter which one, so take the first aggregatedPeakMatrix <- aggregatedPeakMatrix[sapply(unique(aggregatedPeakMatrix$mzFound), function(mz){ @@ -406,90 +285,230 @@ checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 1000, intensity_pre # Iterate over all supplied patterns lapply(isoPatterns, function(pattern){ - x <- list() - for(r in 1:nrow(pattern)){} - - apply(pattern, 1, function(patternRow){ + + do.call(rbind,lapply(1:nrow(pattern), function(index){ + patternRow <- pattern[index,,drop=FALSE] # Find peaks that fit the specified intensities and m/z - pIndex <- which(aggregatedPeakMatrix[,"mzFound"] < as.numeric(patternRow["maxMZ"]) - & aggregatedPeakMatrix[,"mzFound"] > as.numeric(patternRow["minMZ"]) - & aggregatedPeakMatrix[,"intensity"] < as.numeric(patternRow["maxintensity"]) - & aggregatedPeakMatrix[,"intensity"] > as.numeric(patternRow["minintensity"]) + pIndex <- which(aggregatedPeakMatrix[,"mzFound"] < patternRow[,"maxMZ"] + & aggregatedPeakMatrix[,"mzFound"] > patternRow[,"minMZ"] + & aggregatedPeakMatrix[,"intensity"] < patternRow[,"maxintensity"] + & aggregatedPeakMatrix[,"intensity"] > patternRow[,"minintensity"] ) # Note these Peaks candidates <- aggregatedPeakMatrix[pIndex,] # If there are any: Change parameters in the aggregated matrix if(nrow(candidates)){ - candidates$dppm <- (candidates$mzFound / as.numeric(patternRow["m/z"]) - 1) * 1e6 - candidates$mzCalc <- as.numeric(patternRow["m/z"]) - candidates$formula <- patternRow["formula"] + # General parameters that need to be changed + candidates$dppm <- (candidates$mzFound / as.numeric(patternRow[,"m/z"]) - 1) * 1e6 + candidates$mzCalc <- patternRow[,"m/z"] + candidates$formula <- patternRow[,"formula"] candidates$matchedReanalysis <- NA candidates$filterOK <- TRUE candidates$good <- TRUE candidates$dppmBest <- candidates$dppm candidates$formulaCount <- 1 - } + # New parameters (are used to make peak reasoning easier, we would + # need to find them out later anyways, so do it now when it's easy) + + # 1) The monoisotopic formula of the now added peak + candidates$monoisotopicFormula <- patternRow[,"monoisotopicFormula"] + + # 2) The calculated expected intensity (added for debug reasons) + candidates$intCalc <- patternRow[,"expectedintensity"] + + # 3) Produce "scores" in the range of dppm and scales nicely + #candidates$dint <- (max(as.numeric(candidates$intensity),as.numeric(patternRow["expectedintensity"]))/min(as.numeric(candidates$intensity),as.numeric(patternRow["expectedintensity"])))^2 + maxInt <- max(as.numeric(candidates$intensity),as.numeric(patternRow[,"expectedintensity"])) + minInt <- min(as.numeric(candidates$intensity),as.numeric(patternRow[,"expectedintensity"])) + + candidates$dint <- (maxInt/minInt) * dppmMean + candidates$monoisotopicIndex <- patternRow[,"monoisotopicIndex"] + + candidates$monoisotopicMz <- as.numeric(patternRow[,"monoisotopicMz"]) + } else{ + candidates$monoisotopicFormula <- character(0) + candidates$intCalc <- numeric(0) + candidates$dint <- numeric(0) + candidates$monoisotopicIndex <- integer(0) + candidates$monoisotopicMz <- numeric(0) + } return(candidates) - }) + })) }) } -.peakReasoner <- function(adjustmentList, aggregatedPeakMatrix){ - - adjustmentMatrix <- do.call(rbind, unlist(adjustmentList,recursive=FALSE)) - - if(!as.logical(nrow(adjustmentMatrix))){ +.peakReasoner <- function(MPList, UPList, currentMPeaks){ + + # Unlist the possible isotope peak lists for the previously matched and previously unmatched peaks + adjustmentMatrix <- rbind(do.call(rbind, MPList),do.call(rbind, UPList)) + + # Remove peaks with a much too high dint + dintRemoval <- which(adjustmentMatrix$dint>50) + if(as.logical(length(dintRemoval))){ + adjustmentMatrix <- adjustmentMatrix[-which(adjustmentMatrix$dint>50),] + } + # No isotopes found, abort + if(!as.logical(nrow(adjustmentMatrix))){ return(adjustmentMatrix) } + + # Now we need to remove rows from this matrix, so that 4 conditions are fulfilled: + # a) Values in the mzFound can not occur in monoisotopicMz (because an isotope can't be monoisotopic) + # b) No monoisotopicMz can have more than one index assigned (because this would mean a peak has 2 formulas) + # c) All unique mzFound can only be included once + # d) The number of indices must be minimal (it is highly likely that the monoisotopic peak with the most + # isotope peaks is correctly annotated) + + uniqueMonoMz <- unique(adjustmentMatrix$monoisotopicMz) + uniqueMzFound <- unique(adjustmentMatrix$mzFound) + + checkAllMzFound <- function(){ + all(uniqueMzFound %in% adjustmentMatrix$mzFound) && all(table(adjustmentMatrix$mzFound) == 1) + } + + # Condition a) Go through all monoisotopicMz + # and remove those that happen to be in mzFound + removalIndex <- which(adjustmentMatrix$monoisotopicMz %in% uniqueMzFound) + if(length(removalIndex)){ + adjustmentMatrix <- adjustmentMatrix[-removalIndex,] + } + + + # Condition b) and d) + for(monoMz in uniqueMonoMz){ + monoMzRows <- which(adjustmentMatrix$monoisotopicMz == monoMz) + + # If there are different indices, use the one that appears most often, else there is no problem + if(!all(adjustmentMatrix$monoisotopicIndex[monoMzRows] == adjustmentMatrix$monoisotopicIndex[monoMzRows][1])){ + # Store all indices that occur equally as often and most often + maxIndex <- vector() + maxlength <- 0 + for(index in unique(adjustmentMatrix$monoisotopicIndex[monoMzRows])){ + currlength <- length(which(adjustmentMatrix$monoisotopicIndex[monoMzRows] == index)) + if(currlength > maxlength){ + maxlength <- currlength + maxIndex <- index + } + if(currlength == maxlength){ + maxlength <- currlength + if(!(index %in% maxIndex)){ + maxIndex <- c(maxIndex,index) + } + } + } + + # One index appears most often, so use that one + if(length(maxIndex) == 1){ + monoMzRows <- monoMzRows[-which(adjustmentMatrix$monoisotopicIndex[monoMzRows] == maxIndex)] + adjustmentMatrix <- adjustmentMatrix[-monoMzRows,] + } else { # Sometimes different indices appear equally as often and most often, use scoring metric + bestscore <- 1000 + bestIndex <- 0 + # Go through all possible indices + for(conflictIndex in maxIndex){ + # Calculate Score and note the best score + score <- mean(apply(abs(adjustmentMatrix[which(adjustmentMatrix$monoisotopicIndex == conflictIndex),c("dint","dppm")]),1,sum)) + if(score < bestscore){ + bestscore <- score + bestIndex <- conflictIndex + } + } + # Throw out all indices that don't have the best average score + monoMzRows <- monoMzRows[-which(adjustmentMatrix$monoisotopicIndex[monoMzRows] == bestIndex)] + adjustmentMatrix <- adjustmentMatrix[-monoMzRows,] + } + } + } + + # Everything should be fine, but if it is not + # some peaks have the same monoisotopicMz but not the same Formula, + # i.e. one monoisotopic peak has 2 isotope formulas that fit the + # same peak + if(length(unique(adjustmentMatrix$mzFound)) != length(adjustmentMatrix$mzFound)){ + occurrences <- as.data.frame(table(adjustmentMatrix$mzFound)) + for(value in occurrences[which(occurrences[,2] > 1),1]){ + rows <- which(adjustmentMatrix$mzFound == value) + scores <- apply(adjustmentMatrix[rows,c("dint","dppm")],1,sum) + notMinRows <- rows[-which.min(scores)] + adjustmentMatrix <- adjustmentMatrix[-notMinRows,] + } + } + + # Now: Check for every peak if the monoisotopic peak is already the peak that is passed + # else change the peak that gets passed + problemPeaks <- which(!sapply(adjustmentMatrix$monoisotopicIndex, function(index) currentMPeaks$filterOK[which(currentMPeaks$index == index)])) + problemMz <- adjustmentMatrix$monoisotopicMz[problemPeaks] + correctIndex <- adjustmentMatrix$monoisotopicIndex[problemPeaks] + + adjustmentMatrix <- adjustmentMatrix[,-(25:29)] + + if(as.logical(length(problemMz))){ + # Go through every mz and change the peak to the correct one + for(p in 1:length(problemMz)){ + addPeak <- currentMPeaks[which(currentMPeaks$index == correctIndex[p]),,drop=FALSE] + addPeak$filterOK <- TRUE + removePeak <- currentMPeaks[which((currentMPeaks$mzFound == problemMz[p]) & currentMPeaks$filterOK),,drop=FALSE] + removePeak$filterOK <- FALSE + adjustmentMatrix <- rbind(adjustmentMatrix,addPeak,removePeak) + } + } + + return(adjustmentMatrix) +} + + +# Further modularization of formula addition +# Is essentially one task, so this makes it more understandable +.annotateFormulaToEnviPatTable <- function(pattern, defIsotopes){ + # Find all isotope atom names (only in colnames of patterns, sadly) + isoCols <- colnames(pattern)[3:ncol(pattern)] - # Peaks can turn up multiple times, take the one that actually fits the "base" formula, - # i.e. that of the peak that is actually written into the record - - # For each possible parent peak: Find out which peaks have possible children - # and how many, group these peaks (by adding a column to the adjustmentMatrix) - # Note: Grouping happens here, because we also need to split the peaks by mz - summaryList <- lapply(adjustmentList, function(x) sapply(x,nrow)) - groupLengths <- unlist(lapply(summaryList, function(entry){ - if(all(entry == 0)) return(vector()) else return(length(which(entry > 0))) - })) - adjustmentMatrix$group <- unlist(lapply(1:length(groupLengths),function(i) rep(i,groupLengths[i]))) + # Order the formulas to be in Hill system: + # First the Cs, then the Hs, then lexicographical + # First: Split isotope names so that there only are atoms + # Find position of C and H + splitNames <- gsub('^([0-9]+)(.+)$', '\\2', isoCols) + CHPos <- c(which(splitNames == "C"),which(splitNames == "H")) - # Technically, you only need to look at one peak in each group and can adjust the rest alongside - # Since we only want to check if the parent formula is currently passed through - groupAdjustmentMatrix <- adjustmentMatrix[sapply(1:length(groupLengths),function(i) which(adjustmentMatrix$group == i)[1]),] + # Account for special case: No Cs or Hs + if(length(CHPos)){ + # If there are Cs and Hs, overwrite the positions so they always get sorted to the front + splitNames[CHPos] <- "" + } - # Extract the parent formulas of the isotopic formula groups - parentFormulas <- names(groupLengths) + # Default isotopes are always first in the colnames, so no need to order internally between isotopes + atomOrder <- order(splitNames) - # Find out which of the possible found isotopes formulas are competing (by checking which have the same mz) - indexList <- lapply(unique(groupAdjustmentMatrix$mzFound),function(mz){ - which(groupAdjustmentMatrix$mzFound == mz) - }) + # If there are Cs and Hs, the order must be preserved + if(length(CHPos)){ + atomOrder[1:length(CHPos)] <- CHPos + } + # else it is already ordered - # For every set of indices, find the parent formula in the aggregated matrix - return(do.call(rbind, lapply(indexList, function(indices){ - # All peaks which fit the parent formulas - cutAggregateMatrix <- aggregatedPeakMatrix[which(aggregatedPeakMatrix$formula %in% parentFormulas[indices]),,drop=FALSE] - correctParentIndex <- which(cutAggregateMatrix$filterOK == TRUE) - if(length(correctParentIndex)){ - # If there is a Peak that will be passed and fits one of the parent formulas, take that Peak and all belonging to that group - # Also, drop the group column - correctGroup <- groupAdjustmentMatrix$group[indices[correctParentIndex]] - returnMatrix <- adjustmentMatrix[which(adjustmentMatrix$group == correctGroup),-25,drop=FALSE] - return(returnMatrix) - } else{ - # Else calculate the Peak-isotope relationship with the lowest average dppm - # And correct the Peak that currently has filterOK to not have it - correctParentIndex <- which.min((abs(cutAggregateMatrix$dppm) + abs(adjustmentMatrix[indices,]$dppm))/2) - correctionLine <- aggregatedPeakMatrix[which(aggregatedPeakMatrix$mzFound == groupAdjustmentMatrix$mzFound[indices[1]] & aggregatedPeakMatrix$filterOK),,drop=FALSE] - correctionLine$filterOK <- FALSE - correctGroup <- groupAdjustmentMatrix$group[indices[correctParentIndex]] - returnMatrix <- adjustmentMatrix[which(adjustmentMatrix$group == correctGroup),-25,drop=FALSE] - return(rbind( - returnMatrix, - correctionLine - )) + # Mark new names for formula creation. + newnames <- unname(sapply(isoCols, function(currentCol){ + if(currentCol %in% defIsotopes){ + # "Default" isotope (no isotope mass in formula, e.g. "C") + return(gsub('^(.{0})([0-9]+)(.+)$', '\\3', currentCol)) + }else{ + # Other isotopes (isotope mass in formula, e.g. "[13]C") + return(gsub('^(.{0})([0-9]+)(.+)$', '\\1[\\2]\\3', currentCol)) } - }))) -} \ No newline at end of file + })) + + # Generate the formula for every pattern + pattern$formula <- apply(pattern,1,function(p){ + paste0(sapply(atomOrder+2,function(isoIndex){ + if(p[isoIndex] == 0){ + return("") + } + if(p[isoIndex] == 1){ + return(newnames[isoIndex-2]) + } + return(paste0(newnames[isoIndex-2],p[isoIndex])) + }),collapse="") + }) + + return(pattern) +} diff --git a/R/alternateAnalyze.R b/R/alternateAnalyze.R index c708634b66a65d8701ced4c6626d3d042e4b70dc..15f7d582044e853b2650de8ad67a4e47ec90e915 100644 --- a/R/alternateAnalyze.R +++ b/R/alternateAnalyze.R @@ -32,11 +32,7 @@ newStep2WorkFlow <- function(w, mode="pH", confirmMode=FALSE, progressbar = "progressBarHook", settings = getOption("RMassBank"), analyzeMethod="formula", fragdataFile = NA){ ##Load the fragment data (locally or from RMassBank package) - if(is.na(fragdataFile)){ - fragData <- nfragData - } else{ - fragData <- read.csv(fragdataFile,colClasses = c("character","numeric")) - } + fragData <- read.csv(fragdataFile,colClasses = c("character","numeric")) ##Progress bar nLen <- length(w@files) @@ -95,41 +91,41 @@ analyzeMsMs.formula.optimized <- function(msmsPeaks, mode="pH", detail=FALSE, ru filterSettings = getOption("RMassBank")$filterSettings, spectraList = getOption("RMassBank")$spectraList, fragData, fragDataIndex) { - cut <- 0 - cut_ratio <- 0 - if(run=="preliminary") - { - mzColname <- "mz" - filterMode <- "coarse" - cut <- filterSettings$prelimCut - if(is.na(cut)) - { - if(mode %in% c("pH", "pM", "pNa", "pNH4")) - cut <- 1e4 - else if(mode %in% c("mH", "mFA","mM")) - cut <- 0 - else stop(paste("The ionization mode", mode, "is unknown.")) - } - cutRatio <- filterSettings$prelimCutRatio - } else{ - mzColname <- "mzRecal" - filterMode <- "fine" - cut <- filterSettings$fineCut - cut_ratio <- filterSettings$fineCutRatio - if(is.na(cut)) cut <- 0 - } - - # find whole spectrum of parent peak, so we have reasonable data to feed into - # MolgenMsMs - parentSpectrum <- msmsPeaks$parentPeak + cut <- 0 + cut_ratio <- 0 + if(run=="preliminary") + { + mzColname <- "mz" + filterMode <- "coarse" + cut <- filterSettings$prelimCut + if(is.na(cut)) + { + if(mode %in% c("pH", "pM", "pNa", "pNH4")) + cut <- 1e4 + else if(mode %in% c("mH", "mFA","mM")) + cut <- 0 + else stop(paste("The ionization mode", mode, "is unknown.")) + } + cutRatio <- filterSettings$prelimCutRatio + } else{ + mzColname <- "mzRecal" + filterMode <- "fine" + cut <- filterSettings$fineCut + cut_ratio <- filterSettings$fineCutRatio + if(is.na(cut)) cut <- 0 + } - - # Check whether the spectra can be fitted to the spectra list correctly! - if(nrow(msmsPeaks$childHeaders) != length(spectraList)) - { - warning(paste0( - "The spectra count of the substance ", msmsPeaks$id, " (", nrow(msmsPeaks$childHeaders), " spectra) doesn't match the provided spectra list (", length(spectraList), " spectra)." - )) + # find whole spectrum of parent peak, so we have reasonable data to feed into + # MolgenMsMs + parentSpectrum <- msmsPeaks$parentPeak + + + # Check whether the spectra can be fitted to the spectra list correctly! + if(nrow(msmsPeaks$childHeaders) != length(spectraList)) + { + warning(paste0("The spectra count of the substance ", msmsPeaks$id, " (", nrow(msmsPeaks$childHeaders), " spectra) doesn't match the provided spectra list (", + length(spectraList), " spectra).") + ) return(list(specOK=FALSE)) } @@ -240,52 +236,52 @@ analyzeMsMs.formula.optimized <- function(msmsPeaks, mode="pH", detail=FALSE, ru if(usePrecalcData){ - # if yes, split the peaks into 2 sets: Those which can be identified using the - # fragment data, and those which can (by maximum fragment Data m/z - 0.5) - - precalcIndex <- which(shot[,mzColname] < (max(newfragData$mass)-0.5)) - smallShots <- shot[precalcIndex,] - bigShots <- shot[-precalcIndex,] - - # for smaller peaks use the precalculated fragment data - if(nrow(smallShots)){ - system.time(peakmatrixSmall <- lapply(smallShots[,mzColname],function(mass){ - mass.calc <- mass + mode.charge * .emass - maxminMass <- ppm(mass.calc, ppmlimit, l=TRUE) - # retrieve only possibly relevant rows of the fragment Data (use the index) - indices <- newfragDataIndex[floor(maxminMass[2]*100)]:newfragDataIndex[ceiling(maxminMass[1]*100)] - - fragmentedFragmentData <- newfragData[indices,] - - # narrow it down further using the ppm - fragmentedFragmentData <- fragmentedFragmentData[which(fragmentedFragmentData$mass > maxminMass[2] & fragmentedFragmentData $mass < maxminMass[1]),] - - # return nothing if the narrowed down data is empty at this point - if(!nrow(fragmentedFragmentData)){ - return(t(c(mzFound=as.numeric(as.character(mass)), - formula=NA, mzCalc=NA))) - } - - - # find out if the narrowed down fragments have a sub-formula of the parentformula - # return the indexes of relevant formulas - fragmentedFragmentData <- fragmentedFragmentData[which(sapply(fragmentedFragmentData$formula, function(currentFormula) is.sub.formula(currentFormula,parentAtomList))),] - - # return nothing if the narrowed down data is empty at this point - if(!nrow(fragmentedFragmentData)){ - return(t(c(mzFound=as.numeric(as.character(mass)), - formula=NA, mzCalc=NA))) - } - - return(t(sapply(1:nrow(fragmentedFragmentData), function(f) - { - mzCalc <- fragmentedFragmentData$mass[f] - mode.charge * .emass - c(mzFound=as.numeric(as.character(mass)), - formula=fragmentedFragmentData$formula[f], - mzCalc=mzCalc) - }))) - - })) + # if yes, split the peaks into 2 sets: Those which can be identified using the + # fragment data, and those which can't (by maximum fragment Data m/z - 0.5) + + precalcIndex <- which(shot[,mzColname] < (max(newfragData$mass)-0.5)) + smallShots <- shot[precalcIndex,] + bigShots <- shot[-precalcIndex,] + + # for smaller peaks use the precalculated fragment data + if(nrow(smallShots)){ + peakmatrixSmall <- lapply(smallShots[,mzColname],function(mass){ + mass.calc <- mass + mode.charge * .emass + maxminMass <- ppm(mass.calc, ppmlimit, l=TRUE) + # retrieve only possibly relevant rows of the fragment Data (use the index) + indices <- newfragDataIndex[floor(maxminMass[2]*100)]:newfragDataIndex[ceiling(maxminMass[1]*100)] + + fragmentedFragmentData <- newfragData[indices,] + + # narrow it down further using the ppm + fragmentedFragmentData <- fragmentedFragmentData[which(fragmentedFragmentData$mass > maxminMass[2] & fragmentedFragmentData $mass < maxminMass[1]),] + + # return nothing if the narrowed down data is empty at this point + if(!nrow(fragmentedFragmentData)){ + return(t(c(mzFound=as.numeric(as.character(mass)), + formula=NA, mzCalc=NA))) + } + + + # find out if the narrowed down fragments have a sub-formula of the parentformula + # return the indexes of relevant formulas + fragmentedFragmentData <- fragmentedFragmentData[which(sapply(fragmentedFragmentData$formula, function(currentFormula) is.sub.formula(currentFormula,parentAtomList))),] + + # return nothing if the narrowed down data is empty at this point + if(!nrow(fragmentedFragmentData)){ + return(t(c(mzFound=as.numeric(as.character(mass)), + formula=NA, mzCalc=NA))) + } + + return(t(sapply(1:nrow(fragmentedFragmentData), function(f) + { + mzCalc <- fragmentedFragmentData$mass[f] - mode.charge * .emass + c(mzFound=as.numeric(as.character(mass)), + formula=fragmentedFragmentData$formula[f], + mzCalc=mzCalc) + }))) + + }) } else{ peakmatrixSmall <- list() } @@ -323,31 +319,30 @@ analyzeMsMs.formula.optimized <- function(msmsPeaks, mode="pH", detail=FALSE, ru peakmatrix <- c(peakmatrixSmall,peakmatrixBig) } else{ - system.time(peakmatrix <- lapply(shot[,mzColname], function(mass){ - # Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae - # finally back-correct calculated masses for the charge - mass.calc <- mass + mode.charge * .emass - peakformula <- tryCatch(suppressWarnings(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), - limits, charge=0)), error=function(e) NA) - #peakformula <- tryCatch( - # generate.formula(mass, - # ppm(mass, ppmlimit, p=TRUE), - # limits, charge=1), - #error= function(e) list()) - if(!is.list(peakformula)){ - return(t(c(mzFound=as.numeric(as.character(mass)), - formula=NA, mzCalc=NA))) - }else - { - return(t(sapply(peakformula, function(f) - { - mzCalc <- f@mass - mode.charge * .emass - c(mzFound=mass, - formula=f@string, - mzCalc=mzCalc) - }))) - } - })) + peakmatrix <- lapply(shot[,mzColname], function(mass){ + # Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae + # finally back-correct calculated masses for the charge + mass.calc <- mass + mode.charge * .emass + peakformula <- tryCatch(suppressWarnings(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), + limits, charge=0)), error=function(e) NA) + #peakformula <- tryCatch( + # generate.formula(mass, + # ppm(mass, ppmlimit, p=TRUE), + # limits, charge=1), + #error= function(e) list()) + if(!is.list(peakformula)){ + return(t(c(mzFound=as.numeric(as.character(mass)), + formula=NA, mzCalc=NA))) + }else{ + return(t(sapply(peakformula, function(f) + { + mzCalc <- f@mass - mode.charge * .emass + c(mzFound=mass, + formula=f@string, + mzCalc=mzCalc) + }))) + } + }) } childPeaks <- as.data.frame(do.call(rbind, peakmatrix)) diff --git a/R/createMassBank.R b/R/createMassBank.R index a3e1c1f7ce282adc35535a8581a25baa85eed07d..cc95afd8850ac418d31a64643b0a4c581b1c5080 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -162,7 +162,7 @@ resetInfolists <- function(mb) #' @param mb The \code{mbWorkspace} to work in. #' @param gatherData A variable denoting whether to retrieve information using several online databases \code{gatherData= "online"} #' or to use the local babel installation \code{gatherData= "babel"}. Note that babel is used either way, if a directory is given -#' in the settings +#' in the settings. This setting will be ignored if retrieval is set to "standard" #' @return The processed \code{mbWorkspace}. #' @seealso \code{\link{mbWorkspace-class}} #' @author Michael A. Stravs, Eawag <michael.stravs@@eawag.ch> @@ -175,110 +175,109 @@ resetInfolists <- function(mb) #' @export mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.csv", gatherData = "online") { - # Step 1: Find which compounds don't have annotation information yet. For these - # compounds, pull information from CTS (using gatherData). - if(1 %in% steps) - { - mbdata_ids <- lapply(selectSpectra(mb@spectra, "found", "object"), function(spec) spec@id) - if(gatherData == "online"){ - message("mbWorkflow: Step 1. Gather info from several databases") - } - if(gatherData == "babel"){ - message("mbWorkflow: Step 1. Gather info using babel") - } - # Which IDs are not in mbdata_archive yet? - new_ids <- setdiff(as.numeric(unlist(mbdata_ids)), mb@mbdata_archive$id) - mb@mbdata <- lapply(new_ids, function(id) - { - #print(id) - if(gatherData == "online"){ - d <- gatherData(id) - } - if(gatherData == "babel"){ - d <- gatherDataBabel(id) - } - #print(d$dataused) - message(paste(id, ": ", d$dataused, sep='')) - return(d) - }) - } - # Step 2: If new compounds were found, then export the infolist.csv and stop the workflow. - # Otherwise, continue! - if(2 %in% steps) - { - message("mbWorkflow: Step 2. Export infolist (if required)") - if(length(mb@mbdata)>0) + # Step 1: Find which compounds don't have annotation information yet. For these + # compounds, pull information from CTS (using gatherData). + if(1 %in% steps) { - mbdata_mat <- flatten(mb@mbdata) - write.csv(as.data.frame(mbdata_mat),infolist_path, na="") - message(paste("The file", infolist_path, "was generated with new compound information. Please check and edit the table, and add it to your infolist folder.") - ) - return(mb) + mbdata_ids <- lapply(selectSpectra(mb@spectra, "found", "object"), function(spec) spec@id) + message("mbWorkflow: Step 1. Gather info from several databases") + # Which IDs are not in mbdata_archive yet? + new_ids <- setdiff(as.numeric(unlist(mbdata_ids)), mb@mbdata_archive$id) + mb@mbdata <- lapply(new_ids, function(id) + { + if(findLevel(id,TRUE) == "standard"){ + if(gatherData == "online"){ + + d <- gatherData(id) + } + if(gatherData == "babel"){ + # message("mbWorkflow: Step 1. Gather info using babel") + d <- gatherDataBabel(id) + } + } else{ + # message("mbWorkflow: Step 1. Gather no info - Unknown structure") + d <- gatherDataUnknown(id, mb@spectra[[1]]@mode, retrieval=findLevel(id,TRUE)) + } + message(paste(id, ": ", d$dataused, sep='')) + return(d) + }) } - else - message("No new data added.") - } - # Step 3: Take the archive data (in table format) and reformat it to MassBank tree format. - if(3 %in% steps) - { - message("mbWorkflow: Step 3. Data reformatting") - mb@mbdata_relisted <- apply(mb@mbdata_archive, 1, readMbdata) - } - # Step 4: Compile the spectra! Using the skeletons from the archive data, create - # MassBank records per compound and fill them with peak data for each spectrum. - # Also, assign accession numbers based on scan mode and relative scan no. - if(4 %in% steps) - { - message("mbWorkflow: Step 4. Spectra compilation") - mb@compiled <- lapply( - selectSpectra(mb@spectra, "found", "object"), - function(r) { - message(paste("Compiling: ", r@name, sep="")) - mbdata <- mb@mbdata_relisted[[which(mb@mbdata_archive$id == as.numeric(r@id))]] - if(nrow(mb@additionalPeaks) > 0) - res <-compileRecord(r, mbdata, mb@aggregated, mb@additionalPeaks) - else - res <-compileRecord(r, mbdata, mb@aggregated, NULL) - return(res) - }) - # check which compounds have useful spectra - mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) - mb@problems <- which(is.na(mb@compiled)) - mb@compiled_ok <- mb@compiled[mb@ok] - } - # Step 5: Convert the internal tree-like representation of the MassBank data into - # flat-text string arrays (basically, into text-file style, but still in memory) - if(5 %in% steps) - { - message("mbWorkflow: Step 5. Flattening records") - mb@mbfiles <- lapply(mb@compiled_ok, function(c) lapply(c, toMassbank)) - } - # Step 6: For all OK records, generate a corresponding molfile with the structure - # of the compound, based on the SMILES entry from the MassBank record. (This molfile - # is still in memory only, not yet a physical file) - if(6 %in% steps) - { - message("mbWorkflow: Step 6. Generate molfiles") - mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(c[[1]][["CH$SMILES"]])) - } - # Step 7: If necessary, generate the appropriate subdirectories, and actually write - # the files to disk. - if(7 %in% steps) - { - message("mbWorkflow: Step 7. Generate subdirs and export") - dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "moldata", sep='/'),recursive=TRUE) - dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "recdata", sep='/'),recursive=TRUE) - for(cnt in 1:length(mb@compiled_ok)) - exportMassbank(mb@compiled_ok[[cnt]], mb@mbfiles[[cnt]], mb@molfile[[cnt]]) - } - # Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank - # to attribute substances to their corresponding structure molfiles. - if(8 %in% steps) - { - message("mbWorkflow: Step 8. Create list.tsv") - makeMollist(mb@compiled_ok) - } - return(mb) + # Step 2: If new compounds were found, then export the infolist.csv and stop the workflow. + # Otherwise, continue! + if(2 %in% steps) + { + message("mbWorkflow: Step 2. Export infolist (if required)") + if(length(mb@mbdata)>0) + { + mbdata_mat <- flatten(mb@mbdata) + write.csv(as.data.frame(mbdata_mat),infolist_path, na="") + message(paste("The file", infolist_path, "was generated with new compound information. Please check and edit the table, and add it to your infolist folder.")) + return(mb) + } + else + message("No new data added.") + } + # Step 3: Take the archive data (in table format) and reformat it to MassBank tree format. + if(3 %in% steps) + { + message("mbWorkflow: Step 3. Data reformatting") + mb@mbdata_relisted <- apply(mb@mbdata_archive, 1, readMbdata) + } + # Step 4: Compile the spectra! Using the skeletons from the archive data, create + # MassBank records per compound and fill them with peak data for each spectrum. + # Also, assign accession numbers based on scan mode and relative scan no. + if(4 %in% steps) + { + message("mbWorkflow: Step 4. Spectra compilation") + mb@compiled <- lapply( + selectSpectra(mb@spectra, "found", "object"), + function(r) { + message(paste("Compiling: ", r@name, sep="")) + mbdata <- mb@mbdata_relisted[[which(mb@mbdata_archive$id == as.numeric(r@id))]] + if(nrow(mb@additionalPeaks) > 0) + res <-compileRecord(r, mbdata, mb@aggregated, mb@additionalPeaks) + else + res <-compileRecord(r, mbdata, mb@aggregated, NULL, retrieval=findLevel(r@id,TRUE)) + return(res) + }) + # check which compounds have useful spectra + mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) + mb@problems <- which(is.na(mb@compiled)) + mb@compiled_ok <- mb@compiled[mb@ok] + } + # Step 5: Convert the internal tree-like representation of the MassBank data into + # flat-text string arrays (basically, into text-file style, but still in memory) + if(5 %in% steps) + { + message("mbWorkflow: Step 5. Flattening records") + mb@mbfiles <- lapply(mb@compiled_ok, function(c) lapply(c, toMassbank)) + } + # Step 6: For all OK records, generate a corresponding molfile with the structure + # of the compound, based on the SMILES entry from the MassBank record. (This molfile + # is still in memory only, not yet a physical file) + if(6 %in% steps) + { + message("mbWorkflow: Step 6. Generate molfiles") + mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(as.numeric(c[[1]][['COMMENT']][[getOption("RMassBank")$annotations$internal_id_fieldname]]))) + } + # Step 7: If necessary, generate the appropriate subdirectories, and actually write + # the files to disk. + if(7 %in% steps) + { + message("mbWorkflow: Step 7. Generate subdirs and export") + dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "moldata", sep='/'),recursive=TRUE) + dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "recdata", sep='/'),recursive=TRUE) + for(cnt in 1:length(mb@compiled_ok)) + exportMassbank(mb@compiled_ok[[cnt]], mb@mbfiles[[cnt]], mb@molfile[[cnt]]) + } + # Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank + # to attribute substances to their corresponding structure molfiles. + if(8 %in% steps) + { + message("mbWorkflow: Step 8. Create list.tsv") + makeMollist(mb@compiled_ok) + } + return(mb) } @@ -318,11 +317,15 @@ createMolfile <- function(id_or_smiles, fileName = FALSE) { .checkMbSettings() babeldir <- getOption("RMassBank")$babeldir - - if(!is.numeric(id_or_smiles)) + + if(!is.numeric(id_or_smiles)){ smiles <- id_or_smiles - else - smiles <- findSmiles(id_or_smiles) + } else{ + if(findLevel(id_or_smiles,TRUE) != "standard"){ + return(c(" ","$$$$")) + } + smiles <- findSmiles(id_or_smiles) + } # if no babeldir was set, get the result from cactus. if(is.na(babeldir)) { @@ -348,7 +351,8 @@ createMolfile <- function(id_or_smiles, fileName = FALSE) # If we wrote to a file, read it back as return value. if(is.character(fileName)) res <- readLines(fileName) - } + } + return(c(" ","$$$$")) return(res) } @@ -574,8 +578,45 @@ gatherData <- function(id) # COMMENT: EAWAG_UCHEM_ID 1234 # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" mbdata[["COMMENT"]] <- list() - mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment - mbdata[["COMMENT"]][["ID"]] = id + if(findLevel(id) == "0"){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment + } else{ + level <- findLevel(id) + if(level %in% c("1","1a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" + } + if(level == c("2")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" + } + if(level == c("2a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" + } + if(level == c("2b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" + } + if(level == c("3")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" + } + if(level == c("3a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" + } + if(level == c("3b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" + } + if(level == c("3c")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" + } + if(level == c("3d")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" + } + if(level == c("4")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" + } + if(level == c("5")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" + } + } + mbdata[["COMMENT"]][["ID"]] = id # here compound info starts mbdata[['CH$NAME']] <- names # Currently we use a fixed value for Compound Class, since there is no useful @@ -756,8 +797,46 @@ gatherDataBabel <- function(id){ # COMMENT: EAWAG_UCHEM_ID 1234 # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" mbdata[["COMMENT"]] <- list() - mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment + if(findLevel(id) == "0"){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment + } else{ + level <- findLevel(id) + if(level %in% c("1","1a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" + } + if(level == c("2")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" + } + if(level == c("2a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" + } + if(level == c("2b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" + } + if(level == c("3")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" + } + if(level == c("3a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" + } + if(level == c("3b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" + } + if(level == c("3c")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" + } + if(level == c("3d")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" + } + if(level == c("4")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" + } + if(level == c("5")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" + } + } mbdata[["COMMENT"]][["ID"]] <- id + # here compound info starts mbdata[['CH$NAME']] <- as.list(dbname) @@ -781,6 +860,139 @@ gatherDataBabel <- function(id){ return(mbdata) } +# Retrieve annotation data for a compound, using only babel +#' Retrieve annotation data +#' +#' Retrieves annotation data for an unknown compound by using basic information present +#' +#' Composes the "upper part" of a MassBank record filled with chemical data +#' about the compound: name, exact mass, structure, CAS no.. +#' The instrument type is also written into this block (even +#' if not strictly part of the chemical information). Additionally, index +#' fields are added at the start of the record, which will be removed later: +#' \code{id, dbcas, dbname} from the compound list. +#' +#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are +#' inserted empty and will be filled later on. +#' +#' This function is used to generate the data in case a substance is unknown, +#' i.e. not enough information is present to derive anything about formulas or links +#' +#' @usage gatherDataUnknown(id, mode, retrieval) +#' @param id The compound ID. +#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z +#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., +#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... +#' @author Michael Stravs, Erik Mueller +#' @seealso \code{\link{mbWorkflow}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' +#' # Gather data for compound ID 131 +#' \dontrun{gatherDataUnknown(131,"pH")} +#' +#' @export +gatherDataUnknown <- function(id, mode, retrieval){ + .checkMbSettings() + + ##Read from Compoundlist + smiles <- "" + if(retrieval == "unknown"){ + mass <- findMass(id, "unknown", mode) + formula <- "" + } + if(retrieval == "tentative"){ + mass <- findMass(id, "tentative", mode) + formula <- findFormula(id, "tentative") + } + dbcas <- NA + dbname <- findName(id) + if(is.na(dbname)) dbname <- paste("Unknown ID:",id) + if(is.na(dbcas)) dbcas <- "" + + + + ##Create + mbdata <- list() + mbdata[['id']] <- id + mbdata[['dbcas']] <- dbcas + mbdata[['dbname']] <- dbname + mbdata[['dataused']] <- "none" + mbdata[['ACCESSION']] <- "" + mbdata[['RECORD_TITLE']] <- "" + mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") + mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors + mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license + mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright + # Confidence annotation and internal ID annotation. + # The ID of the compound will be written like: + # COMMENT: EAWAG_UCHEM_ID 1234 + # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" + mbdata[["COMMENT"]] <- list() + if(findLevel(id) == "0"){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment + } else{ + level <- findLevel(id) + if(level %in% c("1","1a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" + } + if(level == c("2")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" + } + if(level == c("2a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" + } + if(level == c("2b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" + } + if(level == c("3")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" + } + if(level == c("3a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" + } + if(level == c("3b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" + } + if(level == c("3c")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" + } + if(level == c("3d")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" + } + if(level == c("4")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" + } + if(level == c("5")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" + } + } + mbdata[["COMMENT"]][["ID"]] <- id + + # here compound info starts + mbdata[['CH$NAME']] <- as.list(dbname) + + # Currently we use a fixed value for Compound Class, since there is no useful + # convention of what should go there and what shouldn't, and the field is not used + # in search queries. + mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class + mbdata[['CH$FORMULA']] <- formula + mbdata[['CH$EXACT_MASS']] <- mass + mbdata[['CH$SMILES']] <- "" + mbdata[['CH$IUPAC']] <- "" + + link <- list() + mbdata[['CH$LINK']] <- link + mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument + mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type + + return(mbdata) +} # Flatten the internal tree-like representation of MassBank data to a flat table. # Note that this limits us, in that the fields should be constant over all records! @@ -975,10 +1187,10 @@ readMbdata <- function(row) #' list('FRAGMENTATION_MODE' = 'CID', ...), ...)} etc. #' #' @aliases gatherCompound gatherSpectrum -#' @usage gatherCompound(spec, aggregated, additionalPeaks = NULL) +#' @usage gatherCompound(spec, aggregated, additionalPeaks = NULL, retrieval="standard") #' #' gatherSpectrum(spec, msmsdata, ac_ms, ac_lc, aggregated, -#' additionalPeaks = NULL) +#' additionalPeaks = NULL, retrieval="standard") #' @param spec A \code{RmbSpectraSet} object, representing a compound with multiple spectra. #' @param aggregated An aggregate peak table where the peaks are extracted from. #' @param msmsdata A \code{RmbSpectrum2} object from the \code{spec} spectra set, representing a single spectrum to give a record. @@ -987,6 +1199,9 @@ readMbdata <- function(row) #' \code{gatherCompound} and then fed into \code{gatherSpectrum}. #' @param additionalPeaks If present, a table with additional peaks to add into the spectra. #' As loaded with \code{\link{addPeaks}}. +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z #' @return \code{gatherCompound} returns a list of tree-like MassBank data #' blocks. \code{gatherSpectrum} returns one single MassBank data block or #' \code{NA} if no useful peak is in the spectrum. @@ -1009,47 +1224,47 @@ readMbdata <- function(row) #' #' #' @export -gatherCompound <- function(spec, aggregated, additionalPeaks = NULL) +gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval="standard") { - # compound ID - id <- spec@id - # processing mode - imode <- spec@mode - - # define positive or negative, based on processing mode. - ion_modes <- list( - "pH" = "POSITIVE", - "pNa" = "POSITIVE", - "mH" = "NEGATIVE", - "mFA" = "NEGATIVE", - "pM" = "POSITIVE", - "mM" = "NEGATIVE", - "pNH4" = "POSITIVE") - mode <- ion_modes[[imode]] - - # for format 2.01 - ac_ms <- list(); - ac_ms[['MS_TYPE']] <- getOption("RMassBank")$annotations$ms_type - ac_ms[['IONIZATION']] <- getOption("RMassBank")$annotations$ionization - ac_ms[['ION_MODE']] <- mode - - # This list could be made customizable. - ac_lc <- list(); - rt <- spec@parent@rt / 60 - ac_lc[['COLUMN_NAME']] <- getOption("RMassBank")$annotations$lc_column - ac_lc[['FLOW_GRADIENT']] <- getOption("RMassBank")$annotations$lc_gradient - ac_lc[['FLOW_RATE']] <- getOption("RMassBank")$annotations$lc_flow - ac_lc[['RETENTION_TIME']] <- sprintf("%.1f min", rt) - ac_lc[['SOLVENT A']] <- getOption("RMassBank")$annotations$lc_solvent_a - ac_lc[['SOLVENT B']] <- getOption("RMassBank")$annotations$lc_solvent_b + # compound ID + id <- spec@id + # processing mode + imode <- spec@mode + + # define positive or negative, based on processing mode. + ion_modes <- list( + "pH" = "POSITIVE", + "pNa" = "POSITIVE", + "mH" = "NEGATIVE", + "mFA" = "NEGATIVE", + "pM" = "POSITIVE", + "mM" = "NEGATIVE", + "pNH4" = "POSITIVE") + mode <- ion_modes[[imode]] + + # for format 2.01 + ac_ms <- list(); + ac_ms[['MS_TYPE']] <- getOption("RMassBank")$annotations$ms_type + ac_ms[['IONIZATION']] <- getOption("RMassBank")$annotations$ionization + ac_ms[['ION_MODE']] <- mode - # Go through all child spectra, and fill our skeleton with scan data! - # Pass them the AC_LC and AC_MS data, which are added at the right place - # directly in there. - allSpectra <- lapply(spec@children, function(m) - gatherSpectrum(spec, m, ac_ms, ac_lc, aggregated, additionalPeaks)) - allSpectra <- allSpectra[which(!is.na(allSpectra))] - return(allSpectra) + # This list could be made customizable. + ac_lc <- list(); + rt <- spec@parent@rt / 60 + ac_lc[['COLUMN_NAME']] <- getOption("RMassBank")$annotations$lc_column + ac_lc[['FLOW_GRADIENT']] <- getOption("RMassBank")$annotations$lc_gradient + ac_lc[['FLOW_RATE']] <- getOption("RMassBank")$annotations$lc_flow + ac_lc[['RETENTION_TIME']] <- sprintf("%.3f min", rt) + ac_lc[['SOLVENT A']] <- getOption("RMassBank")$annotations$lc_solvent_a + ac_lc[['SOLVENT B']] <- getOption("RMassBank")$annotations$lc_solvent_b + + # Go through all child spectra, and fill our skeleton with scan data! + # Pass them the AC_LC and AC_MS data, which are added at the right place + # directly in there. + allSpectra <- lapply(spec@children, function(m) + gatherSpectrum(spec, m, ac_ms, ac_lc, aggregated, additionalPeaks, retrieval=retrieval)) + allSpectra <- allSpectra[which(!is.na(allSpectra))] + return(allSpectra) } @@ -1069,17 +1284,17 @@ gatherCompound <- function(spec, aggregated, additionalPeaks = NULL) # peaksProblematic. Currently we use peaksOK and peaksReanOK to create the files. # (Also, the global additionalPeaks table is used.) #' @export -gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalPeaks = NULL) +gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalPeaks = NULL, retrieval = "standard") { - # If the spectrum is not filled, return right now. All "NA" spectra will - # not be treated further. - if(msmsdata@ok == FALSE) - return(NA) - # get data - scan <- msmsdata@acquisitionNum - id <- spec@id - # Further fill the ac_ms datasets, and add the ms$focused_ion with spectrum-specific data: - precursor_types <- list( + # If the spectrum is not filled, return right now. All "NA" spectra will + # not be treated further. + if(msmsdata@ok == FALSE) + return(NA) + # get data + scan <- msmsdata@acquisitionNum + id <- spec@id + # Further fill the ac_ms datasets, and add the ms$focused_ion with spectrum-specific data: + precursor_types <- list( "pH" = "[M+H]+", "pNa" = "[M+Na]+", "mH" = "[M-H]-", @@ -1087,187 +1302,194 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP "pM" = "[M]+", "mM" = "[M]-", "pNH4" = "[M+NH4]+") - ac_ms[['FRAGMENTATION_MODE']] <- msmsdata@info$mode - #ac_ms['PRECURSOR_TYPE'] <- precursor_types[spec$mode] - ac_ms[['COLLISION_ENERGY']] <- msmsdata@info$ce - ac_ms[['RESOLUTION']] <- msmsdata@info$res - - # Calculate exact precursor mass with Rcdk, and find the base peak from the parent - # spectrum. (Yes, that's what belongs here, I think.) - precursorMz <- findMz(spec@id, spec@mode) - ms_fi <- list() - ms_fi[['BASE_PEAK']] <- round(mz(spec@parent)[which.max(intensity(spec@parent))],4) - ms_fi[['PRECURSOR_M/Z']] <- round(precursorMz$mzCenter,4) - ms_fi[['PRECURSOR_TYPE']] <- precursor_types[spec@mode] - - # Select all peaks which belong to this spectrum (correct cpdID and scan no.) - # from peaksOK - # Note: Here and below it would be easy to customize the source of the peaks. - # Originally the peaks came from msmsdata$childFilt, and the subset - # was used where dppm == dppmBest (because childFilt still contains multiple formulas) - # per peak. - peaks <- aggregated[aggregated$filterOK,,drop=FALSE] - peaks <- peaks[(peaks$cpdID == id) & (peaks$scan == msmsdata@acquisitionNum),,drop=FALSE] + ac_ms[['FRAGMENTATION_MODE']] <- msmsdata@info$mode + #ac_ms['PRECURSOR_TYPE'] <- precursor_types[spec$mode] + ac_ms[['COLLISION_ENERGY']] <- msmsdata@info$ce + ac_ms[['RESOLUTION']] <- msmsdata@info$res + + # Calculate exact precursor mass with Rcdk, and find the base peak from the parent + # spectrum. (Yes, that's what belongs here, I think.) + precursorMz <- findMz(spec@id, spec@mode, retrieval=retrieval) + ms_fi <- list() + ms_fi[['BASE_PEAK']] <- round(mz(spec@parent)[which.max(intensity(spec@parent))],4) + ms_fi[['PRECURSOR_M/Z']] <- round(precursorMz$mzCenter,4) + ms_fi[['PRECURSOR_TYPE']] <- precursor_types[spec@mode] + + # Select all peaks which belong to this spectrum (correct cpdID and scan no.) + # from peaksOK + # Note: Here and below it would be easy to customize the source of the peaks. + # Originally the peaks came from msmsdata$childFilt, and the subset + # was used where dppm == dppmBest (because childFilt still contains multiple formulas) + # per peak. + peaks <- aggregated[aggregated$filterOK,,drop=FALSE] + peaks <- peaks[(peaks$cpdID == id) & (peaks$scan == msmsdata@acquisitionNum),,drop=FALSE] - # No peaks? Aha, bye + # No peaks? Aha, bye if(nrow(peaks) == 0) - return(NA) + return(NA) - # If we don't include the reanalyzed peaks: - if(!getOption("RMassBank")$use_rean_peaks) - peaks <- peaks[is.na(peaks$matchedReanalysis),,drop=FALSE] - # but if we include them: - else - { - # for info, the following data will be used in the default annotator: - # annotation <- annotation[,c("mzSpec","formula", "formulaCount", "mzCalc", "dppm")] - # and in the peaklist itself: - # c("mzSpec", "int", "intrel") - peaks[!is.na(peaks$matchedReanalysis),"formula"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.formula"] - peaks[!is.na(peaks$matchedReanalysis),"mzCalc"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.mzCalc"] - peaks[!is.na(peaks$matchedReanalysis),"dppm"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.dppm"] - peaks[!is.na(peaks$matchedReanalysis),"dbe"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.dbe"] - peaks[!is.na(peaks$matchedReanalysis),"formulaCount"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.formulaCount"] - } + # If we don't include the reanalyzed peaks: + if(!getOption("RMassBank")$use_rean_peaks) + peaks <- peaks[is.na(peaks$matchedReanalysis),,drop=FALSE] + # but if we include them: + else + { + # for info, the following data will be used in the default annotator: + # annotation <- annotation[,c("mzSpec","formula", "formulaCount", "mzCalc", "dppm")] + # and in the peaklist itself: + # c("mzSpec", "int", "intrel") + peaks[!is.na(peaks$matchedReanalysis),"formula"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.formula"] + peaks[!is.na(peaks$matchedReanalysis),"mzCalc"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.mzCalc"] + peaks[!is.na(peaks$matchedReanalysis),"dppm"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.dppm"] + peaks[!is.na(peaks$matchedReanalysis),"dbe"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.dbe"] + peaks[!is.na(peaks$matchedReanalysis),"formulaCount"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.formulaCount"] + } - # Calculate relative intensity and make a formatted m/z to use in the output - # (mzSpec, for "spectrum") - peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999) - peaks$mzSpec <- round(peaks$mzFound, 4) - # reorder peaks after addition of the reanalyzed ones - peaks <- peaks[order(peaks$mzSpec),] - - # Also format the other values, which are used in the annotation - peaks$dppm <- round(peaks$dppm, 2) - peaks$mzCalc <- round(peaks$mzCalc, 4) - peaks$intensity <- round(peaks$intensity, 1) - # copy the peak table to the annotation table. (The peak table will then be extended - # with peaks from the global "additional_peaks" table, which can be used to add peaks - # to the spectra by hand. - annotation <- peaks - # Keep only peaks with relative intensity >= 1 o/oo, since the MassBank record - # makes no sense otherwise. Also, keep only the columns needed in the output. - peaks <- peaks[ peaks$intrel >= 1, c("mzSpec", "intensity", "intrel")] + # Calculate relative intensity and make a formatted m/z to use in the output + # (mzSpec, for "spectrum") + peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999) + peaks$mzSpec <- round(peaks$mzFound, 4) + # reorder peaks after addition of the reanalyzed ones + peaks <- peaks[order(peaks$mzSpec),] - # Here add the additional peaks if there are any for this compound! - # They are added without any annotation. - if(!is.null(additionalPeaks)) - { - # select the peaks from the corresponding spectrum which were marked with "OK=1" in the table. - spec_add_peaks <- additionalPeaks[ - (additionalPeaks$OK == 1) & - (additionalPeaks$cpdID == spec@id) & - (additionalPeaks$scan == msmsdata@acquisitionNum), - c("mzFound", "intensity")] - # If there are peaks to add: - if(nrow(spec_add_peaks)>0) + # Also format the other values, which are used in the annotation + peaks$dppm <- round(peaks$dppm, 2) + peaks$mzCalc <- round(peaks$mzCalc, 4) + peaks$intensity <- round(peaks$intensity, 1) + # copy the peak table to the annotation table. (The peak table will then be extended + # with peaks from the global "additional_peaks" table, which can be used to add peaks + # to the spectra by hand. + annotation <- peaks + # Keep only peaks with relative intensity >= 1 o/oo, since the MassBank record + # makes no sense otherwise. Also, keep only the columns needed in the output. + peaks <- peaks[ peaks$intrel >= 1, c("mzSpec", "intensity", "intrel")] + + # Here add the additional peaks if there are any for this compound! + # They are added without any annotation. + if(!is.null(additionalPeaks)) { - # add the column for rel. int. - spec_add_peaks$intrel <- 0 - # format m/z value - spec_add_peaks$mzSpec <- round(spec_add_peaks$mzFound, 4) - # bind tables together - peaks <- rbind(peaks, spec_add_peaks[,c("mzSpec", "intensity", "intrel")]) - # recalculate rel.int. and reorder list - peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999) - # Again, select the correct columns, and drop values with rel.int. <1 o/oo - # NOTE: If the highest additional peak is > than the previous highest peak, - # this can lead to the situation that a peak is in "annotation" but not in "peaks"! - # See below. - peaks <- peaks[ peaks$intrel >= 1, c("mzSpec", "intensity", "intrel")] - # Reorder again. - peaks <- peaks[order(peaks$mzSpec),] + # select the peaks from the corresponding spectrum which were marked with "OK=1" in the table. + spec_add_peaks <- additionalPeaks[ + (additionalPeaks$OK == 1) & + (additionalPeaks$cpdID == spec@id) & + (additionalPeaks$scan == msmsdata@acquisitionNum), + c("mzFound", "intensity")] + # If there are peaks to add: + if(nrow(spec_add_peaks)>0) + { + # add the column for rel. int. + spec_add_peaks$intrel <- 0 + # format m/z value + spec_add_peaks$mzSpec <- round(spec_add_peaks$mzFound, 4) + # bind tables together + peaks <- rbind(peaks, spec_add_peaks[,c("mzSpec", "intensity", "intrel")]) + # recalculate rel.int. and reorder list + peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999) + # Again, select the correct columns, and drop values with rel.int. <1 o/oo + # NOTE: If the highest additional peak is > than the previous highest peak, + # this can lead to the situation that a peak is in "annotation" but not in "peaks"! + # See below. + peaks <- peaks[ peaks$intrel >= 1, c("mzSpec", "intensity", "intrel")] + # Reorder again. + peaks <- peaks[order(peaks$mzSpec),] + } } - } - # add + or - to fragment formulas - formula_tag <- list( - "pH" = "+", - "pNa" = "+", - "mH" = "-", - "mFA" = "-", - "pM" = "+", - "mM" = "-", - "pNH4" = "+") - type <- formula_tag[[spec@mode]] - - annotator <- getOption("RMassBank")$annotator - if(is.null(annotator)) - annotator <- "annotator.default" - + # add + or - to fragment formulas + formula_tag <- list( + "pH" = "+", + "pNa" = "+", + "mH" = "-", + "mFA" = "-", + "pM" = "+", + "mM" = "-", + "pNH4" = "+") + type <- formula_tag[[spec@mode]] + annotator <- getOption("RMassBank")$annotator + if(is.null(annotator)) + annotator <- "annotator.default" - # Here, the relative intensity is recalculated using the newly added additional - # peaks from the peak list. Therefore, we throw superfluous peaks out again. - # NOTE: It is a valid question whether or not we should kick peaks out at this stage. - # The alternative would be to leave the survivors at 1 o/oo, but keep them in the spectrum. - annotation$intrel <- floor(annotation$intensity / max(peaks$intensity) * 999) - annotation <- annotation[annotation$intrel >= 1,] - annotation <- do.call(annotator, list(annotation= annotation, type=type)) - - # Name the columns correctly. - colnames(peaks) <- c("m/z", "int.", "rel.int.") - peaknum <- nrow(peaks) - - # Create the "lower part" of the record. - mbdata <- list() - # Add the AC$MS, AC$LC info. - if(getOption("RMassBank")$use_version == 2) - { - mbdata[["AC$MASS_SPECTROMETRY"]] <- ac_ms - mbdata[["AC$CHROMATOGRAPHY"]] <- ac_lc - } - else - { - # Fix for MassBank data format 1, where ION_MODE must be renamed to MODE - mbdata[["AC$ANALYTICAL_CONDITION"]] <- c(ac_ms, ac_lc) - names(mbdata[["AC$ANALYTICAL_CONDITION"]])[[3]] <- "MODE" - } - # Add the MS$FOCUSED_ION info. - mbdata[["MS$FOCUSED_ION"]] <- ms_fi + # Here, the relative intensity is recalculated using the newly added additional + # peaks from the peak list. Therefore, we throw superfluous peaks out again. + # NOTE: It is a valid question whether or not we should kick peaks out at this stage. + # The alternative would be to leave the survivors at 1 o/oo, but keep them in the spectrum. + annotation$intrel <- floor(annotation$intensity / max(peaks$intensity) * 999) + annotation <- annotation[annotation$intrel >= 1,] + + annotation <- do.call(annotator, list(annotation= annotation, type=type)) + + + # Name the columns correctly. + colnames(peaks) <- c("m/z", "int.", "rel.int.") + peaknum <- nrow(peaks) - # the data processing tag :) - # Change by Tobias: - # I suggest to add here the current version number of the clone due to better distinction between different makes of MB records - # Could be automatised from DESCRIPTION file? - if(getOption("RMassBank")$use_rean_peaks) - processingComment <- list("REANALYZE" = "Peaks with additional N2/O included") - else - processingComment <- list() - mbdata[["MS$DATA_PROCESSING"]] <- c( - getOption("RMassBank")$annotations$ms_dataprocessing, - processingComment, - list("WHOLE" = paste("RMassBank", packageVersion("RMassBank"))) + # Create the "lower part" of the record. + mbdata <- list() + # Add the AC$MS, AC$LC info. + if(getOption("RMassBank")$use_version == 2) + { + mbdata[["AC$MASS_SPECTROMETRY"]] <- ac_ms + mbdata[["AC$CHROMATOGRAPHY"]] <- ac_lc + } + else + { + # Fix for MassBank data format 1, where ION_MODE must be renamed to MODE + mbdata[["AC$ANALYTICAL_CONDITION"]] <- c(ac_ms, ac_lc) + names(mbdata[["AC$ANALYTICAL_CONDITION"]])[[3]] <- "MODE" + } + # Add the MS$FOCUSED_ION info. + mbdata[["MS$FOCUSED_ION"]] <- ms_fi + + ## The SPLASH is a hash value calculated across all peaks + ## http://splash.fiehnlab.ucdavis.edu/ + ## Has to be temporarily added as "PK$SPLASH" in the "lower" part + ## of the record, but will later be moved "up" when merging parts in compileRecord() + + # the data processing tag :) + # Change by Tobias: + # I suggest to add here the current version number of the clone due to better distinction between different makes of MB records + # Could be automatised from DESCRIPTION file? + if(getOption("RMassBank")$use_rean_peaks) + processingComment <- list("REANALYZE" = "Peaks with additional N2/O included") + else + processingComment <- list() + mbdata[["MS$DATA_PROCESSING"]] <- c( + getOption("RMassBank")$annotations$ms_dataprocessing, + processingComment, + list("WHOLE" = paste("RMassBank", packageVersion("RMassBank"))) ) - # Annotation: - if(getOption("RMassBank")$add_annotation==TRUE) - mbdata[["PK$ANNOTATION"]] <- annotation - # Peak table - mbdata[["PK$NUM_PEAK"]] <- peaknum - mbdata[["PK$PEAK"]] <- peaks - # These two entries will be thrown out later, but they are necessary to build the - # record title and the accession number. - mbdata[["RECORD_TITLE_CE"]] <- msmsdata@info$ces #formatted collision energy - # Mode of relative scan calculation: by default it is calculated relative to the - # parent scan. If a corresponding option is set, it will be calculated from the first - # present child scan in the list. - relativeScan <- "fromParent" - if(!is.null(getOption("RMassBank")$recomputeRelativeScan)) - if(getOption("RMassBank")$recomputeRelativeScan == "fromFirstChild") - relativeScan <- "fromFirstChild" - if(relativeScan == "fromParent") - mbdata[["SUBSCAN"]] <- msmsdata@acquisitionNum - spec@parent@acquisitionNum #relative scan - else if(relativeScan == "fromFirstChild") - { - firstChild <- min(unlist(lapply(spec@children,function(d) d@acquisitionNum))) - mbdata[["SUBSCAN"]] <- msmsdata@acquisitionNum - firstChild + 1 - } - return(mbdata) + mbdata[["PK$SPLASH"]] <- list(SPLASH = getSplash(peaks[,c("m/z", "int.")])) + + # Annotation: + if(getOption("RMassBank")$add_annotation && (findLevel(id,TRUE)!="unknown")) + mbdata[["PK$ANNOTATION"]] <- annotation + + # Peak table + mbdata[["PK$NUM_PEAK"]] <- peaknum + mbdata[["PK$PEAK"]] <- peaks + # These two entries will be thrown out later, but they are necessary to build the + # record title and the accession number. + mbdata[["RECORD_TITLE_CE"]] <- msmsdata@info$ces #formatted collision energy + # Mode of relative scan calculation: by default it is calculated relative to the + # parent scan. If a corresponding option is set, it will be calculated from the first + # present child scan in the list. + relativeScan <- "fromParent" + if(!is.null(getOption("RMassBank")$recomputeRelativeScan)) + if(getOption("RMassBank")$recomputeRelativeScan == "fromFirstChild") + relativeScan <- "fromFirstChild" + if(relativeScan == "fromParent") + mbdata[["SUBSCAN"]] <- msmsdata@acquisitionNum - spec@parent@acquisitionNum #relative scan + else if(relativeScan == "fromFirstChild"){ + firstChild <- min(unlist(lapply(spec@children,function(d) d@acquisitionNum))) + mbdata[["SUBSCAN"]] <- msmsdata@acquisitionNum - firstChild + 1 + } + return(mbdata) } @@ -1291,7 +1513,7 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP #' spectrum data, and finally fills in the record title and accession number, #' renames the "internal ID" comment field and removes dummy fields. #' -#' @usage compileRecord(spec, mbdata, aggregated, additionalPeaks = NULL) +#' @usage compileRecord(spec, mbdata, aggregated, additionalPeaks = NULL, retrieval="standard") #' @param spec A \code{RmbSpectraSet} for a compound, after analysis (\code{\link{analyzeMsMs}}). #' Note that \bold{peaks are not read from this #' object anymore}: Peaks come from the \code{aggregated} dataframe (and from @@ -1302,6 +1524,9 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP #' @param aggregated An aggregated peak data table containing information about refiltered spectra etc. #' @param additionalPeaks If present, a table with additional peaks to add into the spectra. #' As loaded with \code{\link{addPeaks}}. +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z #' @return Returns a MassBank record in list format: e.g. #' \code{list("ACCESSION" = "XX123456", "RECORD_TITLE" = "Cubane", ..., #' "CH\$LINK" = list( "CAS" = "12-345-6", "CHEMSPIDER" = 1111, ...))} @@ -1319,43 +1544,44 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP #' \dontrun{compiled <- compileRecord(myspec, mbdata, w@@aggregated)} #' #' @export -compileRecord <- function(spec, mbdata, aggregated, additionalPeaks = NULL) +compileRecord <- function(spec, mbdata, aggregated, additionalPeaks = NULL, retrieval="standard") { - # gather the individual spectra data - mblist <- gatherCompound(spec, aggregated, additionalPeaks) - # this returns a n-member list of "lower parts" of spectra (one for each subscan). - # (n being the number of child scans per parent scan.) - # Now we put the two parts together. - # (lapply on all n subscans, returns a list.) - mblist_c <- lapply(mblist, function(l) + # gather the individual spectra data + mblist <- gatherCompound(spec, aggregated, additionalPeaks, retrieval=retrieval) + # this returns a n-member list of "lower parts" of spectra (one for each subscan). + # (n being the number of child scans per parent scan.) + # Now we put the two parts together. + # (lapply on all n subscans, returns a list.) + mblist_c <- lapply(mblist, function(l) { - # This is the step which sticks together the upper and the lower part of the - # record (the upper being compound-specific and the lower being scan-specific.) - # Note that the accession number and record title (in the upper part) must of course - # be filled in with scan-specific info. - mbrecord <- c(mbdata, l) - # Here is the right place to fix the name of the INTERNAL ID field. - names(mbrecord[["COMMENT"]])[[which(names(mbrecord[["COMMENT"]]) == "ID")]] <- + # This is the step which sticks together the upper and the lower part of the + # record (the upper being compound-specific and the lower being scan-specific.) + # Note that the accession number and record title (in the upper part) must of course + # be filled in with scan-specific info. + mbrecord <- c(mbdata, l) + + # Here is the right place to fix the name of the INTERNAL ID field. + names(mbrecord[["COMMENT"]])[[which(names(mbrecord[["COMMENT"]]) == "ID")]] <- getOption("RMassBank")$annotations$internal_id_fieldname - # get mode parameter (for accession number generation) depending on version - # of record definition - # Change by Tobias: - # I suggest to include fragmentation mode here for information - if(getOption("RMassBank")$use_version == 2) + # get mode parameter (for accession number generation) depending on version + # of record definition + # Change by Tobias: + # I suggest to include fragmentation mode here for information + if(getOption("RMassBank")$use_version == 2) mode <- mbrecord[["AC$MASS_SPECTROMETRY"]][["ION_MODE"]] - else + else mode <- mbrecord[["AC$ANALYTICAL_CONDITION"]][["MODE"]] - # Generate the title and then delete the temprary RECORD_TITLE_CE field used before - mbrecord[["RECORD_TITLE"]] <- .parseTitleString(mbrecord) - mbrecord[["RECORD_TITLE_CE"]] <- NULL - # Calculate the accession number from the options. - shift <- getOption("RMassBank")$accessionNumberShifts[[spec@mode]] - mbrecord[["ACCESSION"]] <- sprintf("%s%04d%02d", getOption("RMassBank")$annotations$entry_prefix, as.numeric(spec@id), as.numeric(mbrecord[["SUBSCAN"]])+shift) - # Clear the "SUBSCAN" field. - mbrecord[["SUBSCAN"]] <- NULL - # return the record. - return(mbrecord) - }) + # Generate the title and then delete the temprary RECORD_TITLE_CE field used before + mbrecord[["RECORD_TITLE"]] <- .parseTitleString(mbrecord) + mbrecord[["RECORD_TITLE_CE"]] <- NULL + # Calculate the accession number from the options. + shift <- getOption("RMassBank")$accessionNumberShifts[[spec@mode]] + mbrecord[["ACCESSION"]] <- sprintf("%s%04d%02d", getOption("RMassBank")$annotations$entry_prefix, as.numeric(spec@id), as.numeric(mbrecord[["SUBSCAN"]])+shift) + # Clear the "SUBSCAN" field. + mbrecord[["SUBSCAN"]] <- NULL + # return the record. + return(mbrecord) + }) } @@ -1385,11 +1611,11 @@ compileRecord <- function(spec, mbdata, aggregated, additionalPeaks = NULL) annotator.default <- function(annotation, type) { - annotation$formula <- paste(annotation$formula, type, sep='') - # Select the right columns and name them correctly for output. - annotation <- annotation[,c("mzSpec","formula", "formulaCount", "mzCalc", "dppm")] - colnames(annotation) <- c("m/z", "tentative_formula", "formula_count", "mass", "error(ppm)") - return(annotation) + annotation$formula <- paste(annotation$formula, type, sep='') + # Select the right columns and name them correctly for output. + annotation <- annotation[,c("mzSpec","formula", "formulaCount", "mzCalc", "dppm")] + colnames(annotation) <- c("m/z", "tentative_formula", "formula_count", "mass", "error(ppm)") + return(annotation) } #' Parse record title @@ -1462,7 +1688,6 @@ annotator.default <- function(annotation, type) # I.e. from a string like "R={BLA: BLUB}" return "BLA: BLUB" args <- regexec("\\{(.*)\\}", var) arg <- regmatches(var, args)[[1]][[2]] - # Split the parameter by colon if necessary splitVar <- strsplit(arg, ": ")[[1]] # Read the parameter value from the record @@ -1480,6 +1705,12 @@ annotator.default <- function(annotation, type) # Fix problems: Names will have >= 1 match. Take the first if(length(replaceVar) > 1) replaceVar <- replaceVar[[1]] + + # Fix problems: Unknowns might have no name + if(!length(replaceVar)){ + replaceVar <- "" + } + # Substitute the parameter value into the string parsedVar <- sub("\\{(.*)\\}", replaceVar, var) return(parsedVar) @@ -1594,8 +1825,11 @@ toMassbank <- function (mbdata) lapply(names(mbdata[[entry]]), function(subentry) { - # todo - mbf[[count]] <<- paste(entry,": ",subentry, " ", mbdata[[entry]][[subentry]], sep='') + if(subentry != "SPLASH"){ + mbf[[count]] <<- paste(entry,": ",subentry, " ", mbdata[[entry]][[subentry]], sep='') + } else { + mbf[[count]] <<- paste(entry,": ", mbdata[[entry]][[subentry]], sep='') + } #print(mbf) count <<- count + 1 }) @@ -1669,24 +1903,26 @@ toMassbank <- function (mbdata) #' @export exportMassbank <- function(compiled, files, molfile) { - molnames <- c() - for(file in 1:length(compiled)) - { - # Read the accession no. from the corresponding "compiled" entry - filename <- compiled[[file]]["ACCESSION"] - # use this accession no. as filename - filename <- paste(filename, ".txt", sep="") - write(files[[file]], - file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata",filename) - ) - } - # Use internal ID for naming the molfiles - molname <- sprintf("%04d", as.numeric( - compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) - molname <- paste(molname, ".mol", sep="") - write(molfile, - file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata",molname) - ) + molnames <- c() + for(file in 1:length(compiled)) + { + # Read the accession no. from the corresponding "compiled" entry + filename <- compiled[[file]]["ACCESSION"] + # use this accession no. as filename + filename <- paste(filename, ".txt", sep="") + write(files[[file]], + file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata",filename) + ) + } + # Use internal ID for naming the molfiles + if(findLevel(compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]],TRUE)=="standard"){ + molname <- sprintf("%04d", as.numeric( + compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) + molname <- paste(molname, ".mol", sep="") + write(molfile, + file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata",molname) + ) + } } # Makes a list.tsv with molfile -> massbank ch$name attribution. @@ -1711,24 +1947,29 @@ exportMassbank <- function(compiled, files, molfile) #' @export makeMollist <- function(compiled) { - # For every "compiled" entry (here, compiled is not one "compiled" entry but the total - # list of all compiled spectra), extract the uppermost CH$NAME and the ID (from the - # first spectrum.) Make the ID into 0000 format. - tsvlist <- t(sapply(compiled, function(entry) + # For every "compiled" entry (here, compiled is not one "compiled" entry but the total + # list of all compiled spectra), extract the uppermost CH$NAME and the ID (from the + # first spectrum.) Make the ID into 0000 format. + + tsvlist <- t(sapply(compiled, function(entry) { - name <- entry[[1]][["CH$NAME"]][[1]] - id <- sprintf("%04d", as.numeric(entry[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) - molfilename <- paste(id,".mol",sep='') - return(c(name,molfilename)) - })) - # Write the file with the - write.table(tsvlist, - paste(getOption("RMassBank")$annotations$entry_prefix,"/moldata/list.tsv", sep=''), - quote = FALSE, - sep="\t", - row.names=FALSE, - col.names=FALSE - ) + name <- entry[[1]][["CH$NAME"]][[1]] + id <- sprintf("%04d", as.numeric(entry[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) + molfilename <- paste(id,".mol",sep='') + return(c(name,molfilename)) + })) + + IDs <- sapply(compiled, function(entry) return( sprintf("%04d", as.numeric(entry[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])))) + level <- sapply(IDs, findLevel, compact=TRUE) + validentries <- which(level == "standard") + # Write the file with the + write.table(tsvlist[validentries,], + paste(getOption("RMassBank")$annotations$entry_prefix,"/moldata/list.tsv", sep=''), + quote = FALSE, + sep="\t", + row.names=FALSE, + col.names=FALSE + ) } diff --git a/R/formulaCalculator.R b/R/formulaCalculator.R index 26cb54d685680aec191042ecfc26c2ba69d4f3d9..5b9257475ff0b90c116da040d69063bd73043623 100755 --- a/R/formulaCalculator.R +++ b/R/formulaCalculator.R @@ -52,12 +52,12 @@ NULL #' @export to.limits.rcdk <- function(formula) { - if(!is.list(formula)) - formula <- formulastring.to.list(formula) - elelist <- lapply(names(formula), function(element) { - return(c(element, 0, formula[[element]])) - }) - return(elelist) + if(!is.list(formula)) + formula <- formulastring.to.list(formula) + elelist <- lapply(names(formula), function(element) { + return(c(element, 0, formula[[element]])) + }) + return(elelist) } @@ -342,3 +342,15 @@ ppm <- function(mass, dppm, l=FALSE, p=FALSE) .emass <- 0.0005485799 ## pmass <- 1.007276565 ## hmass <- 1.007825 + + +split.formula.posneg <- function(f, as.formula = TRUE, as.list=FALSE) +{ + if(!is.list(f)) f <- formulastring.to.list(f) + pos <- f[which(f > 0)] + neg <- f[which(f < 0)] + if(as.formula & !as.list) + return(list(pos=list.to.formula(pos), neg=list.to.formula(neg))) + else + return(list(pos=pos, neg=neg)) +} \ No newline at end of file diff --git a/R/getSplash.R b/R/getSplash.R new file mode 100644 index 0000000000000000000000000000000000000000..a3299c8201c9bc4452125c70d67aebfbe51dcfb9 --- /dev/null +++ b/R/getSplash.R @@ -0,0 +1,66 @@ + +## Some constants +BINS <- 10; +BIN_SIZE <- 100; +EPS_CORRECTION = 1.0e-7 + +## FINAL_SCALE_FACTOR <- 9; ## Base 10 +FINAL_SCALE_FACTOR <- 35; ## Base 36 + +decimalavoidance <- function(x) { + format(floor (x * 1000000), scientific=FALSE) +} + +getBlockHash <- function(peaks, + RELATIVE_INTENSITY_SCALE=100, + MAX_HASH_CHARATERS_ENCODED_SPECTRUM=20) { + max_intensity = max(peaks[,2]) + + ## Scale to maximum intensity + peaks[,2] <- as.integer(peaks[,2] / max(peaks[,2]) * RELATIVE_INTENSITY_SCALE + EPS_CORRECTION) + + ## Sorted by ascending m/z and ties broken by descending intensity + o <- order(peaks[,1], -1*peaks[,2], decreasing=FALSE) + + peakString <- paste(apply(peaks[o,,drop=FALSE], MARGIN=1, + FUN=function(x) {paste(c(decimalavoidance(x[1]+EPS_CORRECTION), x[2]), + collapse=":")}), collapse=" ") + + ## cat("Prehash: ", peakString, sep="", file="/tmp/prehash-R") ## Debugging of spectrum-string + block2 <- substr(digest(peakString, algo="sha256", serialize=FALSE), + 1, MAX_HASH_CHARATERS_ENCODED_SPECTRUM) +} + +integer2base36 <- function(i) { + integer2base36code <- sapply(c(48:57, 97:122), function(i) rawToChar(as.raw(i))) + paste(integer2base36code[i+1], collapse="") +} + +getBlockHist <- function(peaks) { + ## Initialise output + wrappedhist <- integer(BINS) + + binindex <- as.integer(peaks[,1] / BIN_SIZE) + + summedintensities <- tapply(peaks[,2], binindex, sum) + wrappedbinindex <- unique(binindex) %% BINS + wrappedintensities <- tapply(summedintensities, wrappedbinindex, sum) + normalisedintensities <- as.integer(wrappedintensities/max(wrappedintensities)*FINAL_SCALE_FACTOR) + + wrappedhist[sort(unique(wrappedbinindex))+1] <- normalisedintensities + paste(integer2base36(wrappedhist), collapse="") + +} + +getSplash <- function(peaks) { + + block2 <- getBlockHist(peaks) + block3 <- getBlockHash(peaks) + + splash <- paste("splash10", + block2, + block3, + sep="-") + + return(splash) +} diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index bbe5fa5e25217c9a571cd1a4b1e078d8e2936966..7af0405d34992f3e273ad7f6ba9af540ed54c9a8 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -37,46 +37,190 @@ loadList <- function(path, listEnv = NULL, check = TRUE) listEnv <- .listEnvEnv if(!file.exists(path)) stop("The supplied file does not exist, please supply a correct path") - compoundList <- read.csv(path, stringsAsFactors=FALSE) - # check whether the necessary columns are present + + # Try out if the file is comma- or semicolon-separated + compoundList <- read.csv(path, stringsAsFactors=FALSE, check.names=FALSE) n <- colnames(compoundList) - cols <- c('ID', 'Name', 'SMILES', 'RT', 'CAS') - d <- setdiff(cols, n) - if(length(d)>0){ - compoundList <- read.csv2(path, stringsAsFactors=FALSE) - n <- colnames(compoundList) - d2 <- setdiff(cols, n) - if(length(d2) > 0){ - stop("Some columns are missing in the compound list. Needs at least ID, Name, SMILES, RT, CAS.") - } - } + if(!("ID" %in% n)){ # If no ID column, it must be semicolon separated + compoundList <- read.csv2(path, stringsAsFactors=FALSE, check.names=FALSE) + n <- colnames(compoundList) + if(!("ID" %in% n)){ # ...or there must be something wrong with the column names + stop("There is no 'ID' column in the compound list") + } + } + + # Now everything should be fine, at least regarding csv and ssv + + # Are there duplicate compound IDs? + if(any(duplicated(compoundList$ID))){ + stop("Duplicate compound IDs are present. Please check the compound list.") + } + + # Do all Compoundlist IDs have 4 characters or less? + if(any(nchar(compoundList$ID) > 4)){ + stop("The maximum number of digits for compound IDs is 4") + } + + # Evaluate if all strictly necessary columns are in the list + cols <- c('ID', 'Name', 'SMILES', 'RT', 'CAS') + d <- setdiff(cols, n) + if(length(d)>0){ + stop(paste("Some columns are missing in the compound list. It needs at least", paste(cols,collapse=", "))) + } + + # Is there a "Level" column? + if("Level" %in% n){ + newList <- TRUE + } else { + newList <- FALSE + } + + # Assign for now... + assign("listEnv", listEnv, envir=.listEnvEnv) + .listEnvEnv$listEnv$compoundList <- compoundList + # If "level" is in the compound list we have to check several things: + + if(newList){ + + # a) Are the levels part of the defined levels? + # b) Are the values ok for every level? (i.e. all necessary values supplied for each line in the compound list?) + + # Check a) (and translate to number levels because handling numbers and text would make the if-statements here even more confusing) + + compoundList$Level <- sapply(as.character(compoundList$Level),.translateLevel) + + # Need this variable for levels 3b, 3c, 3d, 4 and potentially 3 and 3a + formulaColPresent <- "Formula" %in% n + + if(any(c("3b","3c","3d","4") %in% compoundList$Level)){ + + if(!(formulaColPresent)){ + .listEnvEnv$listEnv$compoundList <- NULL + stop('The compound list must contain the column "Formula" if a tentative or formula compound (levels 3b, 3c, 3d, 4) is present') + } + } + # Need this variable for level 5 + if("5" %in% compoundList$Level){ + mzCol <- which(c("mz","m/z","mass","exactMass") %in% n) + if(length(mzCol) > 1){ + .listEnvEnv$listEnv$compoundList <- NULL + stop("The compound list can only contain one column with one of the following column names: 'mz','m/z','mass','exactMass' if an unknown compound (level 5) is present") + } + + if(mzCol == 2){ + n[which(n == "m/z")] <- "mz" + } + + if(mzCol == 4){ + n[which(n == "exactMass")] <- "mass" + } + .listEnvEnv$listEnv$mzCol <- c("mz","m/z","mass","exactMass")[mzCol] + colnames(compoundList) <- n + } + # Check b) + for(i in 1:length(compoundList$Level)){ + level <- compoundList$Level[i] + # ID must have numbers as values + if(!is.numeric(compoundList[i,"ID"])){ + .listEnvEnv$listEnv$compoundList <- NULL + stop(paste("Value for 'ID' in line", i, "of the compound list is not a valid compound ID")) + } + + # If level is "1x" or "2x", the SMILES must be supplied and must be correct + if(level %in% c("0","1","1a","1b","1c","2","2a","2b")){ + currEnvir <- environment() + + tryCatch( + findMz(compoundList[i,"ID"]), + error = function(e){ + .listEnvEnv$listEnv$compoundList <- NULL + stop(paste("'SMILES' value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid SMILES")) + } + ) + } + + # If level is "3" or "3a", a valid smiles or formula must be supplied + if(level %in% c("3","3a")){ + if(!is.na(findSmiles(compoundList[i,"ID"]))){ + tryCatch( + findMz(compoundList[i,"ID"]), + error = function(e){ + .listEnvEnv$listEnv$compoundList <- NULL + stop(paste("'SMILES' value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid SMILES")) + } + ) + } else{ + if(!formulaColPresent){ + .listEnvEnv$listEnv$compoundList <- NULL + stop(paste("Compound", compoundList[i,"ID"], "in line", i, "of the compound list is marked as level 3, but there is no valid SMILES + value nor a 'Formula' column present")) + } + + tryCatch( + rcdk::get.formula(findFormula(compoundList[i,"ID"], retrieval="tentative")), + error = function(e){ + .listEnvEnv$listEnv$compoundList <- NULL + stop(paste("'Formula' value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid Formula")) + } + ) + } + } + + # If level is "3b","3c","3d" or "4", a valid formula must be supplied + if(level %in% c("3b","3c","3d","4")){ + + tryCatch( + rcdk::get.formula(findFormula(compoundList[i,"ID"], retrieval="tentative")), + error = function(e){ + .listEnvEnv$listEnv$compoundList <- NULL + stop(paste("'Formula' value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid Formula")) + } + ) + } + # If level is "5", m/z must be supplied + + if(level == "5"){ + if(!is.numeric(findMz(compoundList[i,"ID"], retrieval="unknown")$mzCenter)){ + .listEnvEnv$listEnv$compoundList <- NULL + stop(paste(c("mz","m/z","mass","exactMass")[mzCol],"value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid number")) + } + } + } + } + + # If "Level" is not in the compound list it MUST be a standard list, so process just as before: + if(!newList){ + cols <- c('ID', 'Name', 'SMILES', 'RT', 'CAS') + d <- setdiff(cols, n) + if(length(d)>0){ + stop("Some columns are missing in the compound list. Needs at least ID, Name, SMILES, RT, CAS.") + } - if(length(duplicated(compoundList$cpdID)) > 0) - stop("Duplicate compound IDs are present. Please check your list.") - assign("listEnv", listEnv, envir=.listEnvEnv) - .listEnvEnv$listEnv$compoundList <- compoundList - - - ### - ###Test if all the IDs work - ### - - if(check){ - wrongID <- vector() - currEnvir <- environment() - sapply(compoundList[,"ID"], function(x){ - tryCatch( - findMz(x), - error = function(e){ - currEnvir$wrongID <- c(currEnvir$wrongID, x) - } - ) - }) - if(length(wrongID)){ - stop(paste("Unable to interpret the SMILES-strings for ID(s)", paste(wrongID, collapse= " "), "\nPlease check these and load the list again.")) - } - } + ### + ###Test if all the IDs work + ### + + if(check){ + wrongID <- vector() + currEnvir <- environment() + sapply(compoundList[,"ID"], function(x){ + tryCatch( + findMz(x), + error = function(e){ + currEnvir$wrongID <- c(currEnvir$wrongID, x) + } + ) + }) + if(length(wrongID)){ + .listEnvEnv$listEnv$compoundList <- NULL + stop(paste("Unable to interpret the SMILES-strings for ID(s)", paste(wrongID, collapse= " "), "\nPlease check these and load the list again.")) + } + } + Level <- rep("0",nrow(compoundList)) + .listEnvEnv$listEnv$compoundList <- cbind(compoundList,Level) + } + message("Loaded compoundlist successfully") } #' @export @@ -90,6 +234,61 @@ resetList <- function() assign("listEnv", NULL, envir=.listEnvEnv) } +# Function that translates one entry of the level column of the compound list into the number system +.translateLevel <- function(level){ + if(!(level %in% c("0","1","1a","1b","1c","2","2a","2b","3","3a","3b","3c","3d","4","5"))){ + switch(level, + standard={ + return("1a") + }, + parent={ + return("1b") + }, + confirmed={ + return("1c") + }, + probable={ + return("2") + }, + probableLibrary={ + return("2a") + }, + probableDiagnostic={ + return("2b") + }, + tentative={ + return("3") + }, + tentativeStructure={ + return("3a") + }, + tentativeIsomer={ + return("3b") + }, + tentativeTPClass={ + return("3c") + }, + tentativeBestMatch={ + return("3d") + }, + formula={ + return("4") + }, + unknown={ + return("5") + }, + exactMass={ + return("5") + }, + { + stop(paste(level, "is not a valid level")) + } + ) + } else{ + return(level) + } +} + #' Create Rcdk molecule from SMILES #' #' Generates a Rcdk molecule object from SMILES code, which is fully typed and @@ -125,8 +324,6 @@ getMolecule <- function(smiles) return(mol) } - - #' Find the exact mass +/- a given margin for a given formula or its ions and adducts. #' #' @param formula The molecular formula in text or list format (see \code{\link{formulastring.to.list}} @@ -139,25 +336,50 @@ getMolecule <- function(smiles) #' @examples findMz.formula("C6H6") #' @seealso \code{\link{findMz}} #' @export -findMz.formula <- function(formula, mode="pH", ppm=10, deltaMz=0) +findMz.formula <- function(formula, mode="pH", ppm=10, deltaMz=0) { - if(!any(mode %in% c("pH","pNa","pM","pNH4","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) - mzopt <- list(addition="", charge=0) - if(mode=="pH") mzopt <- list(addition="H", charge=1) - if(mode=="pNa") mzopt <- list(addition="Na", charge=1) - if(mode=="pM") mzopt <- list(addition="", charge=1) - if(mode=="pNH4") mzopt <- list(addition="NH4", charge=-1) - if(mode=="mH") mzopt <- list(addition="H-1", charge=-1) - if(mode=="mFA") mzopt <- list(addition="C1O2", charge=-1) - if(mode=="mM") mzopt <- list(addition="", charge=-1) - if(mode=="") mzopt <- list(addition="", charge=0) - - + if (!any(mode %in% c("pH", "pNa", "pM", "pNH4", "mH", "mFA", + "mM", ""))) + stop(paste("The ionization mode", mode, "is unknown.")) + mzopt <- list(addition = "", charge = 0) + if (mode == "pH") + mzopt <- list(addition = "H", charge = 1) + if (mode == "pNa") + mzopt <- list(addition = "Na", charge = 1) + if (mode == "pM") + mzopt <- list(addition = "", charge = 1) + if (mode == "pNH4") + mzopt <- list(addition = "NH4", charge = -1) + if (mode == "mH") + mzopt <- list(addition = "H-1", charge = -1) + if (mode == "mFA") + mzopt <- list(addition = "C1O2", charge = -1) + if (mode == "mM") + mzopt <- list(addition = "", charge = -1) + if (mode == "") + mzopt <- list(addition = "", charge = 0) formula <- add.formula(formula, mzopt$addition) - formula <- get.formula(formula, charge=mzopt$charge) - m <- formula@mass - delta <- ppm(m, ppm, l=TRUE) - return(list(mzMin=delta[[2]] - deltaMz, mzMax=delta[[1]] + deltaMz, mzCenter=m)) + # Since in special cases we want to use this with negative and zero number of atoms, we account for this case + # by splitting up the formula into positive and negative atom counts (this eliminates the zeroes.) + formula.split <- split.formula.posneg(formula) + m <- 0 + if(formula.split$pos != "") + { + formula.pos <- get.formula(formula.split$pos, charge = mzopt$charge) + m = m + formula.pos@mass + } + if(formula.split$neg != "") + { + formula.neg <- get.formula(formula.split$neg, charge = -mzopt$charge) + m = m - formula.neg@mass + } + if((nchar(formula.split$pos)==0) & (nchar(formula.split$neg)==0)) + { + m <- get.formula("H", charge = mzopt$charge)@mass - get.formula("H", charge = 0)@mass + } + delta <- ppm(m, ppm, l = TRUE) + return(list(mzMin = delta[[2]] - deltaMz, mzMax = delta[[1]] + + deltaMz, mzCenter = m)) } #' Find compound information @@ -165,18 +387,20 @@ findMz.formula <- function(formula, mode="pH", ppm=10, deltaMz=0) #' Retrieves compound information from the loaded compound list or calculates #' it from the SMILES code in the list. #' -#' @aliases findMz findSmiles findFormula findRt findCAS findName -#' @usage findMz(cpdID, mode = "pH", ppm = 10, deltaMz = 0) +#' @aliases findMz findSmiles findFormula findRt findCAS findName findLevel +#' @usage findMz(cpdID, mode = "pH", ppm = 10, deltaMz = 0, retrieval="standard") #' #' findRt(cpdID) #' #' findSmiles(cpdID) #' -#' findFormula(cpdID) +#' findFormula(cpdID, retrieval="standard") #' #' findCAS(cpdID) #' #' findName(cpdID) +#' +#' findLevel(cpdID, compact=FALSE) #' @param cpdID The compound ID in the compound list. #' @param mode Specifies the species of the molecule: An empty string specifies #' uncharged monoisotopic mass, \code{\var{pH}} (positive H) specifies [M+H]+, @@ -189,6 +413,11 @@ findMz.formula <- function(formula, mode="pH", ppm=10, deltaMz=0) #' @param deltaMz Specifies additional m/z window to add to the range (deltaMz #' = 0.02 will return the range of the molecular mass +- 0.02 (and additionally #' +- the set ppm value). +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z +#' @param compact Only for \code{findLevel}, returns the "retrieval" parameter used for many functions +#' within RMassBank if TRUE #' @return \code{findMz} will return a \code{list(mzCenter=, mzMin=, mzMax=)} #' with the molecular weight of the given ion, as calculated from the SMILES #' code and Rcdk. @@ -207,16 +436,31 @@ findMz.formula <- function(formula, mode="pH", ppm=10, deltaMz=0) #' } #' #' @export -findMz <- function(cpdID, mode="pH", ppm=10, deltaMz=0) +findMz <- function(cpdID, mode="pH", ppm=10, deltaMz=0, retrieval="standard") { - s <- findSmiles(cpdID) - #if(s=="-") s <- NA - if(is.na(s)){ - stop("There was no matching SMILES-entry to the supplied cpdID(s) \n Please check the cpdIDs and the compoundlist.") - return(list(mzMin=NA,mzMax=NA,mzCenter=NA)) - } - formula <- .get.mol2formula(getMolecule(s)) - return(findMz.formula(formula@string, mode, ppm, deltaMz)) + # In case of unknown: m/z is in table + if(retrieval == "unknown"){ + mz <- .listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),.listEnvEnv$listEnv$mzCol] + delta <- ppm(mz, ppm, l=TRUE) + return(list(mzMin=delta[2] - deltaMz, mzMax=delta[1] + deltaMz, mzCenter=mz)) + } + + # In case of tentative: calculate mass from formula + if(retrieval == "tentative"){ + return(findMz.formula(findFormula(cpdID, "tentative"),mode=mode)) + } + + # All other cases: Use smiles to calculate mass + if(retrieval == "standard"){ + s <- findSmiles(cpdID) + #if(s=="-") s <- NA + if(is.na(s)){ + stop("There was no matching SMILES-entry to the supplied cpdID(s) \n Please check the cpdIDs and the compoundlist.") + return(list(mzMin=NA,mzMax=NA,mzCenter=NA)) + } + formula <- .get.mol2formula(getMolecule(s)) + return(findMz.formula(formula@string, mode, ppm, deltaMz)) + } } #findMz <- function(...) findMz.rcdk(...) @@ -228,7 +472,7 @@ findRt <- function(cpdID) { stop("Compound list must be loaded first.") if(is.character(cpdID)) cpdID <- as.numeric(cpdID) - rt <- .listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"RT"] + rt <- as.numeric(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"RT"]) if(!is.null(getOption("RMassBank")$rtShift)) rt <- rt + getOption("RMassBank")$rtShift if(is.na(rt)) return(list(RT=NA)) @@ -243,15 +487,26 @@ findSmiles <- function(cpdID) { stop("Compound list must be loaded first.") if(!exists("compoundList", where=.listEnvEnv$listEnv)) stop("Compound list must be loaded first.") + if(.listEnvEnv$listEnv$compoundList[match(cpdID, .listEnvEnv$listEnv$compoundList$ID),"SMILES"] == "") + return(NA) return(.listEnvEnv$listEnv$compoundList[match(cpdID, .listEnvEnv$listEnv$compoundList$ID),"SMILES"]) } #' @export -findFormula <- function(cpdID) { - smiles <- findSmiles(cpdID) - mol <- getMolecule(smiles) - f <- .get.mol2formula(mol) - return(f@string) +findFormula <- function(cpdID, retrieval = "standard") { + + # In case of tentative: read formula from table + if(retrieval=="tentative"){ + return(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"Formula"]) + } + + # Otherwise: Convert smiles to formula + if(retrieval=="standard"){ + smiles <- findSmiles(cpdID) + mol <- getMolecule(smiles) + f <- .get.mol2formula(mol) + return(f@string) + } } #' @export @@ -276,6 +531,31 @@ findName <- function(cpdID) { return(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"Name"]) } +#' @export +findLevel <- function(cpdID, compact=FALSE) { + if(is.character(cpdID)) + cpdID <- as.numeric(cpdID) + if(is.null(.listEnvEnv$listEnv)) + stop("Compound list must be loaded first.") + if(!exists("compoundList", where=.listEnvEnv$listEnv)) + stop("Compound list must be loaded first.") + if(compact){ + level <- .listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"Level"] + if(level %in% c("0","1","1a","1b","1c","2","2a","2b","3","3a")){ + return("standard") + } + if(level %in% c("3b","3c","3d","4")){ + return("tentative") + } + if(level == "5"){ + return("unknown") + } + + return("tentative") + } + return(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"Level"]) +} + #' Calculate exact mass #' #' Retrieves the exact mass of the uncharged molecule. It works directly from @@ -284,9 +564,14 @@ findName <- function(cpdID) { #' SMILES code retrieved from the database. (Alternatively, takes also the #' compound ID as parameter and looks it up.) Calculation relies on Rcdk. #' -#' @usage findMass(cpdID_or_smiles) #' @param cpdID_or_smiles SMILES code or compound ID of the molecule. (Numerics #' are treated as compound ID). +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z +#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +#' Only needed for retrieval="unknown" #' @return Returns the exact mass of the uncharged molecule. #' @author Michael Stravs #' @seealso \code{\link{findMz}} @@ -296,12 +581,49 @@ findName <- function(cpdID) { #' findMass("OC[C@@H]1OC(O)[C@@H](O)[C@@@@H](O)[C@@@@H]1O") #' #' @export -findMass <- function(cpdID_or_smiles) +findMass <- function(cpdID_or_smiles, retrieval="standard", mode = "pH") { - if(!is.numeric(cpdID_or_smiles)) - s <- cpdID_or_smiles - else - s <- findSmiles(cpdID_or_smiles) - mol <- getMolecule(s) - return(get.exact.mass(mol)) + # Must calculate mass manually if no formula is given + if(retrieval == "unknown"){ + if(mode == "pH") { + mass <- 1.00784 + mode.charge <- 1 + } else if(mode == "pNa") { + mass <- 22.989769 + mode.charge <- 1 + } else if(mode == "pM") { + mass <- 0 + mode.charge <- 1 + } else if(mode == "mM") { + mass <- 0 + mode.charge <- -1 + } else if(mode == "mH") { + mass <- -1.00784 + mode.charge <- -1 + } else if(mode == "mFA") { + mass <- 59.0440 + mode.charge <- -1 + } else if(mode == "pNH4") { + mass <- 18.03846 + mode.charge <- 1 + } else{ + stop("mode = \"", mode, "\" not defined") + } + return(findMz(cpdID_or_smiles, mode=mode, retrieval=retrieval)$mzCenter - mass + mode.charge * .emass) + } + + # Calculate mass from formula + if(retrieval == "tentative"){ + return(get.formula(findFormula(cpdID_or_smiles, "tentative"))@mass) + } + + # Calculate mass from SMILES + if(retrieval == "standard"){ + if(!is.numeric(cpdID_or_smiles)) + s <- cpdID_or_smiles + else + s <- findSmiles(cpdID_or_smiles) + mol <- getMolecule(s) + return(get.exact.mass(mol)) + } } \ No newline at end of file diff --git a/R/leMsMs.r b/R/leMsMs.r index 9e85bccf5629b42c2a5e488135aac7bda79d53cc..3590b7db354244f6c5728e704e6170398e2e6218 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -79,164 +79,205 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec progressbar = "progressBarHook", MSe = FALSE) { .checkMbSettings() - if(!any(mode %in% c("pH","pNa","pNH4","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) - - if(!is.na(archivename)) - w@archivename <- archivename - - # Make a progress bar: - nProg <- 0 - nLen <- length(w@files) + if(!any(mode %in% c("pH","pNa","pNH4","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) + + if(!is.na(archivename)) + w@archivename <- archivename - if(readMethod == "minimal"){ - ##Edit options - opt <- getOption("RMassBank") - opt$recalibrator$MS1 <- "recalibrate.identity" - opt$recalibrator$MS2 <- "recalibrate.identity" - options(RMassBank=opt) - ##Edit analyzemethod - analyzeMethod <- "intensity" - } + # Make a progress bar: + nProg <- 0 + nLen <- length(w@files) + + allUnknown <- FALSE + + # If all compounds are unknown some specific conditions apply + if(all(.listEnvEnv$listEnv$compoundList$Level == "5")){ + allUnknown <- TRUE + message("All compounds are unknown, the workflow will be adjusted accordingly") + } + + if(readMethod == "minimal"){ + ##Edit options + opt <- getOption("RMassBank") + opt$recalibrator$MS1 <- "recalibrate.identity" + opt$recalibrator$MS2 <- "recalibrate.identity" + opt$add_annotation <- FALSE + opt$multiplicityFilter <- 1 + options(RMassBank=opt) + settings <- getOption("RMassBank") + ##Edit analyzemethod + analyzeMethod <- "intensity" + } - # clean rerun functionality: - # if any step after 3 has been run, rerunning steps 4 or below needs moving back to the parent workspace. - # However, the recalibration must be preserved, because: - # if someone runs - # w <- msmsWorkflow(w, steps=c(1:4)), - # then substitutes the recalibration - # w@rc <- myrecal - # then runs step 4 again - # w <- msmsWorkflow(w, steps=c(4), newRecalibration=FALSE) - # the rc and rc.ms1 must be preserved and not taken from the parent workspace - if(!all(steps > 4) & !is.null(w@parent)) - { - rc <- w@rc - rc.ms1 <- w@rc.ms1 - w <- w@parent - w@rc <- rc - w@rc.ms1 <- rc.ms1 - } + # clean rerun functionality: + # if any step after 3 has been run, rerunning steps 4 or below needs moving back to the parent workspace. + # However, the recalibration must be preserved, because: + # if someone runs + # w <- msmsWorkflow(w, steps=c(1:4)), + # then substitutes the recalibration + # w@rc <- myrecal + # then runs step 4 again + # w <- msmsWorkflow(w, steps=c(4), newRecalibration=FALSE) + # the rc and rc.ms1 must be preserved and not taken from the parent workspace + if(!all(steps > 4) & !is.null(w@parent)) + { + rc <- w@rc + rc.ms1 <- w@rc.ms1 + w <- w@parent + w@rc <- rc + w@rc.ms1 <- rc.ms1 + } - # Step 1: acquire all MSMS spectra from files - if(1 %in% steps) - { - message("msmsWorkflow: Step 1. Acquire all MSMS spectra from files") - w <- msmsRead(w = w, files = w@files, readMethod=readMethod, mode=mode, confirmMode = confirmMode, useRtLimit = useRtLimit, - Args = findPeaksArgs, settings = settings, progressbar = progressbar, MSe = MSe) - } - # Step 2: first run analysis before recalibration - if(2 %in% steps) - { - nProg <- 0 - message("msmsWorkflow: Step 2. First analysis pre recalibration") - pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) - w@spectra <- as(lapply(w@spectra, function(spec) { - #print(spec$id) - s <- analyzeMsMs(spec, mode=mode, detail=TRUE, run="preliminary", - filterSettings = settings$filterSettings, - spectraList = settings$spectraList, method = analyzeMethod) - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(s) - }), "SimpleList") - ## for(f in w@files) - ## w@spectra[[basename(as.character(f))]]@name <- basename(as.character(f)) - suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE))) - } - # Step 3: aggregate all spectra - if(3 %in% steps) - { - message("msmsWorkflow: Step 3. Aggregate all spectra") - w@aggregated <- aggregateSpectra(w@spectra, addIncomplete=TRUE) - } - # Step 4: recalibrate all m/z values in raw spectra - if(4 %in% steps) - { - message("msmsWorkflow: Step 4. Recalibrate m/z values in raw spectra") - if(newRecalibration) - { - # note: makeRecalibration takes w as argument now, because it needs to get the MS1 spectra from @spectra - recal <- makeRecalibration(w, mode, - recalibrateBy = settings$recalibrateBy, - recalibrateMS1 = settings$recalibrateMS1, - recalibrator = settings$recalibrator, - recalibrateMS1Window = settings$recalibrateMS1Window) - w@rc <- recal$rc - w@rc.ms1 <- recal$rc.ms1 - } - w@parent <- w - w@aggregated <- data.frame() - spectra <- recalibrateSpectra(mode, w@spectra, w = w, - recalibrateBy = settings$recalibrateBy, - recalibrateMS1 = settings$recalibrateMS1) - w@spectra <- spectra - } - # Step 5: re-analysis on recalibrated spectra - if(5 %in% steps) - { - nProg <- 0 - message("msmsWorkflow: Step 5. Reanalyze recalibrated spectra") - pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) - - w@spectra <- as(lapply(w@spectra, function(spec) { - #print(spec$id) - s <- analyzeMsMs(spec, mode=mode, detail=TRUE, run="recalibrated", - filterSettings = settings$filterSettings, - spectraList = settings$spectraList, method = analyzeMethod) - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(s) - }), "SimpleList") - ## for(f in w@files) - ## w@spectra[[basename(as.character(f))]]@name <- basename(as.character(f)) - suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE))) - - do.call(progressbar, list(object=pb, close=TRUE)) - } - # Step 6: aggregate recalibrated results - if(6 %in% steps) - { - message("msmsWorkflow: Step 6. Aggregate recalibrated results") - w@aggregated <- aggregateSpectra(w@spectra, addIncomplete=TRUE) - if(!is.na(archivename)) - archiveResults(w, paste(archivename, ".RData", sep=''), settings) - w@aggregated <- cleanElnoise(w@aggregated, - settings$electronicNoise, settings$electronicNoiseWidth) - } - # Step 7: reanalyze failpeaks for (mono)oxidation and N2 adduct peaks - if(7 %in% steps) - { - message("msmsWorkflow: Step 7. Reanalyze fail peaks for N2 + O") - w@aggregated <- reanalyzeFailpeaks( - w@aggregated, custom_additions="N2O", mode=mode, - filterSettings=settings$filterSettings, - progressbar=progressbar) - if(!is.na(archivename)) - archiveResults(w, paste(archivename, "_RA.RData", sep=''), settings) - } - # Step 8: heuristic filtering based on peak multiplicity; - # creation of failpeak list - if(8 %in% steps) - { - message("msmsWorkflow: Step 8. Peak multiplicity filtering") - if (is.null(settings$multiplicityFilter)) { - message("msmsWorkflow: Step 8. Peak multiplicity filtering skipped because multiplicityFilter parameter is not set.") - } else { - # apply heuristic filter - w@aggregated <- filterMultiplicity( - w, archivename, mode, settings$multiplicityFilter ) - w@aggregated <- processProblematicPeaks(w, mode, archivename) - - if(!is.na(archivename)) - archiveResults(w, paste(archivename, "_RF.RData", sep=''), settings) + # Step 1: acquire all MSMS spectra from files + if(1 %in% steps) + { + message("msmsWorkflow: Step 1. Acquire all MSMS spectra from files") + w <- msmsRead(w = w, files = w@files, readMethod=readMethod, mode=mode, confirmMode = confirmMode, useRtLimit = useRtLimit, + Args = findPeaksArgs, settings = settings, progressbar = progressbar, MSe = MSe) } - } - message("msmsWorkflow: Done.") - return(w) + # Step 2: first run analysis before recalibration + if(2 %in% steps) + { + nProg <- 0 + message("msmsWorkflow: Step 2. First analysis pre recalibration") + if(allUnknown){ + analyzeMethod <- "intensity" + } + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) + w@spectra <- as(lapply(w@spectra, function(spec) { + #print(spec$id) + # if(findLevel(spec@id,TRUE) == "unknown"){ + # analyzeMethod <- "intensity" + # } else { + # analyzeMethod <- "formula" + # } + s <- analyzeMsMs(spec, mode=mode, detail=TRUE, run="preliminary", + filterSettings = settings$filterSettings, + spectraList = settings$spectraList, method = analyzeMethod) + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(s) + }), "SimpleList") + ## for(f in w@files) + ## w@spectra[[basename(as.character(f))]]@name <- basename(as.character(f)) + suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE))) + } + # Step 3: aggregate all spectra + if(3 %in% steps) + { + message("msmsWorkflow: Step 3. Aggregate all spectra") + w@aggregated <- aggregateSpectra(w@spectra, addIncomplete=TRUE) + } + + if(allUnknown){ + w@aggregated$noise <- FALSE + w@aggregated$noise <- FALSE + w@aggregated$reanalyzed.formula <- NA + w@aggregated$reanalyzed.mzCalc <- NA + w@aggregated$reanalyzed.dppm <- NA + w@aggregated$reanalyzed.formulaCount <- NA + w@aggregated$reanalyzed.dbe <- NA + w@aggregated$matchedReanalysis <- NA + w@aggregated$filterOK <- TRUE + w@aggregated$problematicPeak <- FALSE + w@aggregated$formulaMultiplicity <- unlist(sapply(table(w@aggregated$cpdID),function(x) rep(x,x))) + return(w) + } + + + # Step 4: recalibrate all m/z values in raw spectra + if(4 %in% steps) + { + message("msmsWorkflow: Step 4. Recalibrate m/z values in raw spectra") + if(newRecalibration) + { + # note: makeRecalibration takes w as argument now, because it needs to get the MS1 spectra from @spectra + recal <- makeRecalibration(w, mode, + recalibrateBy = settings$recalibrateBy, + recalibrateMS1 = settings$recalibrateMS1, + recalibrator = settings$recalibrator, + recalibrateMS1Window = settings$recalibrateMS1Window) + w@rc <- recal$rc + w@rc.ms1 <- recal$rc.ms1 + } + w@parent <- w + w@aggregated <- data.frame() + spectra <- recalibrateSpectra(mode, w@spectra, w = w, + recalibrateBy = settings$recalibrateBy, + recalibrateMS1 = settings$recalibrateMS1) + w@spectra <- spectra + } + # Step 5: re-analysis on recalibrated spectra + if(5 %in% steps) + { + nProg <- 0 + message("msmsWorkflow: Step 5. Reanalyze recalibrated spectra") + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) + + w@spectra <- as(lapply(w@spectra, function(spec) { + #print(spec$id) + if(findLevel(spec@id,TRUE) == "unknown"){ + analyzeMethod <- "intensity" + } else { + analyzeMethod <- "formula" + } + s <- analyzeMsMs(spec, mode=mode, detail=TRUE, run="recalibrated", + filterSettings = settings$filterSettings, + spectraList = settings$spectraList, method = analyzeMethod) + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(s) + }), "SimpleList") + ## for(f in w@files) + ## w@spectra[[basename(as.character(f))]]@name <- basename(as.character(f)) + suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE))) + + do.call(progressbar, list(object=pb, close=TRUE)) + } + # Step 6: aggregate recalibrated results + if(6 %in% steps) + { + message("msmsWorkflow: Step 6. Aggregate recalibrated results") + w@aggregated <- aggregateSpectra(w@spectra, addIncomplete=TRUE) + if(!is.na(archivename)) + archiveResults(w, paste(archivename, ".RData", sep=''), settings) + w@aggregated <- cleanElnoise(w@aggregated, + settings$electronicNoise, settings$electronicNoiseWidth) + } + # Step 7: reanalyze failpeaks for (mono)oxidation and N2 adduct peaks + if(7 %in% steps) + { + message("msmsWorkflow: Step 7. Reanalyze fail peaks for N2 + O") + w@aggregated <- reanalyzeFailpeaks( + w@aggregated, custom_additions="N2O", mode=mode, + filterSettings=settings$filterSettings, + progressbar=progressbar) + if(!is.na(archivename)) + archiveResults(w, paste(archivename, "_RA.RData", sep=''), settings) + } + # Step 8: heuristic filtering based on peak multiplicity; + # creation of failpeak list + if(8 %in% steps) + { + message("msmsWorkflow: Step 8. Peak multiplicity filtering") + if (is.null(settings$multiplicityFilter)) { + message("msmsWorkflow: Step 8. Peak multiplicity filtering skipped because multiplicityFilter parameter is not set.") + } else { + # apply heuristic filter + w@aggregated <- filterMultiplicity( + w, archivename, mode, settings$multiplicityFilter) + w@aggregated <- processProblematicPeaks(w, mode, archivename) + + if(!is.na(archivename)) + archiveResults(w, paste(archivename, "_RF.RData", sep=''), settings) + } + } + message("msmsWorkflow: Done.") + return(w) } #' Analyze MSMS spectra @@ -564,7 +605,6 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi limits <- to.limits.rcdk(parent_formula) - peakmatrix <- lapply( split(shot,shot$row) , function(shot.row) { @@ -641,14 +681,13 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi #iff_rcdk_pM_eln$maxvalence <- unlist(lapply(diff_rcdk_pM_eln$formula.rcdk, maxvalence)) temp.child.ok <- (childPeaks$dbe >= filterSettings$dbeMinLimit) # & dbe < dbe_parent + 3) - # check if a peak was recognized if(length(which(temp.child.ok)) == 0) { child@ok <- FALSE return(child) } -#browser() + #browser() # find the best ppm value bestPpm <- aggregate(as.data.frame(childPeaks[!is.na(childPeaks$dppm),"dppm"]), list(childPeaks[!is.na(childPeaks$dppm),"row"]), @@ -668,14 +707,14 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi childPeaksFilt <- filterLowaccResults(childPeaks, filterMode, filterSettings) childPeaksGood <- childPeaksFilt[["TRUE"]] childPeaksBad <- childPeaksFilt[["FALSE"]] - if(is.null(childPeaksGood)) + if(is.null(childPeaksGood)){ childPeaksGood <- childPeaks[c(),,drop=FALSE] + childPeaksGood$good <- logical(0) + } if(is.null(childPeaksBad)) childPeaksBad <- childPeaks[c(),,drop=FALSE] childPeaksUnassigned <- childPeaks[is.na(childPeaks$dppm),,drop=FALSE] childPeaksUnassigned$good <- rep(FALSE, nrow(childPeaksUnassigned)) - - # count formulas within new limits # (the results of the "old" count stay in childPeaksInt and are returned # in $childPeaks) @@ -686,9 +725,7 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi childPeaksUnassigned$formulaCount <- rep(NA, nrow(childPeaksUnassigned)) childPeaksBad$formulaCount <- rep(NA, nrow(childPeaksBad)) childPeaksBad$good <- rep(FALSE, nrow(childPeaksBad)) - - - + # Now: childPeaksGood (containing the new, recounted peaks with good = TRUE), and childPeaksBad (containing the # peaks with good=FALSE, i.e. outside filter criteria, with the old formula count even though it is worthless) # are bound together. @@ -698,6 +735,7 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi # Now let's cross fingers. Add a good=NA column to the unmatched peaks and reorder the columns # to match order in childPeaks. After that, setData to the child slot. + childPeaksOmitted <- getData(child) childPeaksOmitted <- childPeaksOmitted[child@low | child@satellite,,drop=FALSE] childPeaksOmitted$rawOK <- rep(FALSE, nrow(childPeaksOmitted)) @@ -708,11 +746,10 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi childPeaksOmitted$dbe <- rep(NA, nrow(childPeaksOmitted)) childPeaksOmitted$dppmBest <- rep(NA, nrow(childPeaksOmitted)) childPeaksOmitted$formulaCount <- rep(0, nrow(childPeaksOmitted)) - childPeaks$satellite <- rep(FALSE, nrow(childPeaks)) childPeaks$low <- rep(FALSE, nrow(childPeaks)) childPeaks$rawOK <- rep(TRUE, nrow(childPeaks)) - + childPeaks <- childPeaks[,colnames(childPeaksOmitted), drop=FALSE] childPeaksTotal <- rbind(childPeaks, childPeaksOmitted) @@ -795,8 +832,9 @@ analyzeMsMs.intensity <- function(msmsPeaks, mode="pH", detail=FALSE, run="preli # Filter out low intensity peaks: child@low <- (shot$intensity < cut) | (shot$intensity < max(shot$intensity)*cut_ratio) - shot <- shot[!child@low,,drop=FALSE] shot_full <- shot + shot <- shot[!child@low,,drop=FALSE] + # Is there still anything left? if(length(which(!child@low))==0) @@ -831,9 +869,11 @@ analyzeMsMs.intensity <- function(msmsPeaks, mode="pH", detail=FALSE, run="preli formula=msmsPeaks@formula, dppm=0, x1=0,x2=0,x3=0) - - - childPeaks <- addProperty(shot, "good", "logical", FALSE) + + childPeaks <- addProperty(shot_full, "rawOK", "logical", FALSE) + childPeaks[!(child@low | child@satellite),"rawOK"] <- TRUE + + childPeaks <- addProperty(childPeaks, "good", "logical", FALSE) childPeaks[childPeaks$rawOK,"good"] <- TRUE childPeaks <- addProperty(childPeaks, "mzCalc", "numeric") @@ -853,8 +893,8 @@ analyzeMsMs.intensity <- function(msmsPeaks, mode="pH", detail=FALSE, run="preli childPeaks <- addProperty(childPeaks, "dppmBest", "numeric") childPeaks[childPeaks$rawOK,"dppmBest"] <- 0 - - child <- setData(child, childPeaks) + + child <- setData(child, childPeaks) child@ok <- TRUE return(child) } @@ -1119,7 +1159,7 @@ problematicPeaks <- function(peaks_unmatched, peaks_matched, mode="pH") # find parent m/z to exclude co-isolated peaks #p_control$mzCenter <- numeric(nrow(p_control)) p_control$mzCenter <- as.numeric( - unlist(lapply(p_control$cpdID, function(id) findMz(id, mode)$mzCenter)) ) + unlist(lapply(p_control$cpdID, function(id) findMz(id, mode, retrieval=findLevel(id,TRUE))$mzCenter)) ) p_control_noMH <- p_control[ (p_control$mzFound < p_control$mzCenter - 1) | (p_control$mzFound > p_control$mzCenter + 1),] @@ -1133,7 +1173,9 @@ problematicPeaks <- function(peaks_unmatched, peaks_matched, mode="pH") #' #' @param w \code{msmsWorkspace} to analyze. #' @param mode Processing mode (pH etc) -#' @param archivename Base name of the archive to write to (for "abc" the exported failpeaks list will be "abc_Failpeaks.csv"). +#' @param archivename Base name of the archive to write to (for "abc" the exported failpeaks list will be "abc_Failpeaks.csv"). +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z #' @return Returns the aggregate data.frame with added column "\code{problematic}" (logical) which marks peaks which match the problematic criteria #' #' @author stravsmi @@ -1707,8 +1749,8 @@ reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, # generate.formula starts from empirical mass, but dppm cal- # culation of the evaluation starts from theoretical mass. # So we don't miss the points on 'the border'. - - db_formula <- findFormula(cpdID) + + db_formula <- findFormula(cpdID, retrieval=findLevel(cpdID,TRUE)) ppmlimit <- 2.25 * filterSettings$ppmFine parent_formula <- add.formula(db_formula, allowed_additions) @@ -1824,8 +1866,11 @@ filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE) { peaks <- cbind(peaks, data.frame(formulaMultiplicity=numeric())) if(recalcBest){ - if(formulacol == formula){ - warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point.") + if(formulacol == "formula"){ + warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") + } + if(formulacol == "reanalyzed.formula"){ + warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") } peaks$fM_factor <- as.factor(peaks$formulaMultiplicity) return(peaks) @@ -2020,7 +2065,7 @@ filterMultiplicity <- function(w, archivename=NA, mode="pH", recalcBest = TRUE, # Kick the M+H+ satellites out of peaksReanOK: peaksReanOK$mzCenter <- as.numeric( - unlist(lapply(peaksReanOK$cpdID, function(id) findMz(id)$mzCenter)) ) + unlist(lapply(peaksReanOK$cpdID, function(id) findMz(id, retrieval=findLevel(id,TRUE))$mzCenter)) ) peaksReanBad <- peaksReanOK[ !((peaksReanOK$mzFound < peaksReanOK$mzCenter - 1) | (peaksReanOK$mzFound > peaksReanOK$mzCenter + 1)),] @@ -2084,8 +2129,8 @@ recalibrate.addMS1data <- function(spec,mode="pH", recalibrateMS1Window = mzL <- findMz.formula(cpd@formula,mode,recalibrateMS1Window,0) mzCalc <- mzL$mzCenter ms1 <- mz(cpd@parent) - mzFound <- ms1[(ms1 >= mzL$mzMin) & (ms1 <= mzL$mzMax)] - + + mzFound <- ms1[which.min(abs(ms1 - mzL$mzCenter))] if(!length(mzFound)){ return(c( mzFound = NA, @@ -2097,7 +2142,8 @@ recalibrate.addMS1data <- function(spec,mode="pH", recalibrateMS1Window = return(c( mzFound = mzFound, mzCalc = mzCalc, - dppm = dppmRc + dppm = dppmRc, + id=cpd@id )) } }) diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index 41a75fb0817f74818f46d38d89467ed744c25cfb..f69458c26d60f3baaec12cf4af14799b281e73bc 100755 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -10,6 +10,7 @@ #' @import rcdk #' @import rjson #' @import yaml +#' @import digest NULL # This is required so that roxygen knows where the first manpage starts @@ -84,6 +85,9 @@ NULL # This is required so that roxygen knows where the first manpage starts #' @param rtMargin The retention time tolerance to use. #' @param deprofile Whether deprofiling should take place, and what method should be #' used (cf. \code{\link{deprofile}}) +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z #' @return An \code{RmbSpectraSet} (for \code{findMsMsHR}). Contains parent MS1 spectrum (\code{@@parent}), a block of dependent MS2 spectra ((\code{@@children}) #' and some metadata (\code{id},\code{mz},\code{name},\code{mode} in which the spectrum was acquired. #' @@ -112,7 +116,8 @@ findMsMsHR <- function(fileName = NULL, msRaw = NULL, cpdID, mode="pH",confirmMo rtMargin = getOption("RMassBank")$rtMargin, deprofile = getOption("RMassBank")$deprofile, headerCache = NULL, - peaksCache = NULL) + peaksCache = NULL, + retrieval="standard") { # access data directly for finding the MS/MS data. This is done using @@ -122,7 +127,7 @@ findMsMsHR <- function(fileName = NULL, msRaw = NULL, cpdID, mode="pH",confirmMo if(!is.null(fileName)) msRaw <- openMSfile(fileName) - mzLimits <- findMz(cpdID, mode) + mzLimits <- findMz(cpdID, mode, retrieval=retrieval) mz <- mzLimits$mzCenter limit.fine <- ppm(mz, ppmFine, p=TRUE) if(!useRtLimit) @@ -143,7 +148,12 @@ findMsMsHR <- function(fileName = NULL, msRaw = NULL, cpdID, mode="pH",confirmMo #sp@mz <- mzLimits sp@id <- as.character(as.integer(cpdID)) sp@name <- findName(cpdID) - sp@formula <- findFormula(cpdID) + ENV <- environment() + if(retrieval == "unknown"){ + sp@formula <- "" + } else{ + sp@formula <- findFormula(cpdID, retrieval=retrieval) + } sp@mode <- mode # If we had to open the file, we have to close it again @@ -268,7 +278,7 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, tic = line["totIonCurrent"], peaksCount = line["peaksCount"], rt = line["retentionTime"], - acquisitionNum = as.integer(line["acquisitionNum"]), + acquisitionNum = as.integer(line["seqNum"]), centroided = TRUE ) }) @@ -283,7 +293,7 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, polarity = as.integer(masterHeader$polarity), peaksCount = as.integer(masterHeader$peaksCount), rt = masterHeader$retentionTime, - acquisitionNum = as.integer(masterHeader$acquisitionNum), + acquisitionNum = as.integer(masterHeader$seqNum), tic = masterHeader$totIonCurrent, centroided = TRUE ) diff --git a/R/msmsRead.R b/R/msmsRead.R index 4a315fa87ebe0dc376b92ca3361bc9ce0bc14b81..408d8dcfd69dca0f0c2ac1bfda0560e1c2636b76 100644 --- a/R/msmsRead.R +++ b/R/msmsRead.R @@ -42,12 +42,13 @@ #' @export msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, readMethod, mode, confirmMode = FALSE, useRtLimit = TRUE, - Args = NULL, settings = getOption("RMassBank"), progressbar = "progressBarHook", MSe = FALSE, plots = FALSE){ + Args = NULL, settings = getOption("RMassBank"), + progressbar = "progressBarHook", MSe = FALSE, plots = FALSE){ .checkMbSettings() ##Read the files and cpdids according to the definition ##All cases are silently accepted, as long as they can be handled according to one definition if(!any(mode %in% c("pH","pNa","pM","pNH4","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) - + if(is.null(filetable)){ ##If no filetable is supplied, filenames must be named explicitly if(is.null(files)) @@ -83,23 +84,24 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, if(!all(file.exists(w@files))){ stop("The supplied files ", paste(w@files[!file.exists(w@files)]), " don't exist") } - - na.ids <- which(is.na(sapply(cpdids, findSmiles))) - - if(length(na.ids)){ - stop("The supplied compound ids ", paste(cpdids[na.ids], collapse=" "), " don't have a corresponding smiles entry. Maybe they are missing from the compound list") - } + + # na.ids <- which(is.na(sapply(cpdids, findSmiles))) + + # if(length(na.ids)){ + # stop("The supplied compound ids ", paste(cpdids[na.ids], collapse=" "), " don't have a corresponding smiles entry. Maybe they are missing from the compound list") + # } ##This should work - if(readMethod == "minimal"){ - ##Edit options - opt <- getOption("RMassBank") - opt$recalibrator$MS1 <- "recalibrate.identity" - opt$recalibrator$MS2 <- "recalibrate.identity" - options(RMassBank=opt) - ##Edit analyzemethod - analyzeMethod <- "intensity" - } + if(readMethod == "minimal"){ + ##Edit options + opt <- getOption("RMassBank") + opt$recalibrator$MS1 <- "recalibrate.identity" + opt$recalibrator$MS2 <- "recalibrate.identity" + opt$add_annotation==FALSE + options(RMassBank=opt) + ##Edit analyzemethod + analyzeMethod <- "intensity" + } if(readMethod == "mzR"){ ##Progressbar @@ -113,7 +115,7 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, # Find compound ID cpdID <- cpdids[count] - + retrieval <- findLevel(cpdID,TRUE) # Set counter up envir$count <- envir$count + 1 @@ -124,7 +126,7 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, mzCoarse = settings$findMsMsRawSettings$mzCoarse, fillPrecursorScan = settings$findMsMsRawSettings$fillPrecursorScan, rtMargin = settings$rtMargin, - deprofile = settings$deprofile) + deprofile = settings$deprofile, retrieval=retrieval) gc() # Progress: @@ -310,7 +312,7 @@ msmsRead.RAW <- function(w, xRAW = NULL, cpdids = NULL, mode, findPeaksArgs = NU } else{ psp[[i]] <- which.min( abs(getRT(anmsms[[i]]) - RT) ) } - ## Now find the pspec for compound + ## Now find the pspec for compound ## 2nd best: Spectrum closest to MS1 ##psp <- which.min( abs(getRT(anmsms) - actualRT)) @@ -345,7 +347,7 @@ msmsRead.RAW <- function(w, xRAW = NULL, cpdids = NULL, mode, findPeaksArgs = NU spectraNum <- length(w@spectra[[which(IDindex)]]@children) w@spectra[[which(IDindex)]]@children[[spectraNum+1]] <- sp@children[[1]] } else { - w@spectra[[length(www@spectra)+1]] <- sp + w@spectra[[length(w@spectra)+1]] <- sp } } else{ w@spectra[[1]] <- sp diff --git a/R/parseMassBank.R b/R/parseMassBank.R index c881b312cbce3b3c497c7675c963e627e31f6d73..32efc6f9369fce425b5a272cdb3427715e177eb1 100644 --- a/R/parseMassBank.R +++ b/R/parseMassBank.R @@ -167,18 +167,18 @@ parseMassBank <- function(Files){ } ##Extract the peaks and write the data into a data.frame PKStart <- grep('PK$PEAK:',record, fixed = TRUE) + 1 - endslash <- grep('//',record, fixed = TRUE) - if(PKStart < endslash){ - splitted <- strsplit(record[PKStart:(endslash-1)]," ") - PKPeak <- matrix(nrow = endslash - PKStart, ncol = 3) - for(k in 1:length(splitted)){ - splitted[[k]] <- splitted[[k]][which(splitted[[k]] != "")] - PKPeak[k,] <- splitted[[k]] - } - PKPeak <- as.data.frame(PKPeak, stringsAsFactors = FALSE) - PKPeak[] <- lapply(PKPeak, type.convert) - colnames(PKPeak) <- c("m/z", "int", "rel.int.") - } + endslash <- tail(grep('//',record, fixed = TRUE),1) + if(PKStart < endslash){ + splitted <- strsplit(record[PKStart:(endslash-1)]," ") + PKPeak <- matrix(nrow = endslash - PKStart, ncol = 3) + for(k in 1:length(splitted)){ + splitted[[k]] <- splitted[[k]][which(splitted[[k]] != "")] + PKPeak[k,] <- splitted[[k]] + } + PKPeak <- as.data.frame(PKPeak, stringsAsFactors = FALSE) + PKPeak[] <- lapply(PKPeak, type.convert) + colnames(PKPeak) <- c("m/z", "int", "rel.int.") + } mb@compiled_ok[[i]][['PK$PEAK']] <- PKPeak diff --git a/R/settings_example.R b/R/settings_example.R index fe7d81e0ce0e394d42f0b41d8b79478f46ccf840..b8fb266f7db516a7818382561dea238305ee4fa4 100755 --- a/R/settings_example.R +++ b/R/settings_example.R @@ -187,7 +187,7 @@ NULL authors = "Nomen Nescio, The Unseen University", copyright = "Copyright (C) XXX", publication = "", - license = "CC BY-SA", + license = "CC BY", instrument = "LTQ Orbitrap XL Thermo Scientific", instrument_type = "LC-ESI-ITFT", confidence_comment = "standard compound", diff --git a/R/sysData.rda b/R/sysData.rda deleted file mode 100644 index 7b22ce3cc76d4eb12ad0aadb20665a4f755212a4..0000000000000000000000000000000000000000 Binary files a/R/sysData.rda and /dev/null differ diff --git a/R/tools.R b/R/tools.R index cba9f109c208778851db33ba198e86f4ce51381f..030a9a943b1f7a3b03aeeb61a1ad4b6de41f9e4a 100644 --- a/R/tools.R +++ b/R/tools.R @@ -116,15 +116,15 @@ findProgress <- function(workspace) #' updateSettings <- function(settings, warn=TRUE) { - settings.new <- .settingsList - settings.old <- settings - renew <- setdiff(names(settings.new), names(settings.old)) - if(length(renew) > 0 && warn==TRUE){ + settings.new <- .settingsList + settings.old <- settings + renew <- setdiff(names(settings.new), names(settings.old)) + if(length(renew) > 0 && warn==TRUE){ warning(paste0("Your settings are outdated! The following fields were taken from default values: ", paste(renew,collapse=", "))) if("filterSettings" %in% renew) warning("The default values of filterSettings could change the processing behaviour if you have negative-mode spectra. Check ?updateSettings for details.") - } - settings.old[renew] <- settings.new[renew] - return(settings.old) + } + settings.old[renew] <- settings.new[renew] + return(settings.old) } diff --git a/R/validateMassBank.R b/R/validateMassBank.R index a9eeb4f42f0c3ca59806ac5584a068680e0b1ecc..2590b3d0af43fd142f1d5ce9d5a50a19668b9f98 100644 --- a/R/validateMassBank.R +++ b/R/validateMassBank.R @@ -175,7 +175,7 @@ smiles2mass <- function(SMILES){ return(mass) } -.unitTestRMB <- function(WD=getwd()){ +.unitTestmzR <- function(WD=getwd()){ requireNamespace("RUnit",quietly=TRUE) requireNamespace("RMassBankData",quietly=TRUE) oldwd <- getwd() @@ -217,19 +217,12 @@ smiles2mass <- function(SMILES){ #testFuncRegexp = "^test.+", rngKind = "Marsaglia-Multicarry", rngNormalKind = "Kinderman-Ramage") - testSuite3 <- RUnit::defineTestSuite("Evaluation of correct handling if no peaks are found", dirs = system.file("unitTests", - package="RMassBank"), testFileRegexp = "runit.NOPEAKS.R", - #testFuncRegexp = "^test.+", - rngKind = "Marsaglia-Multicarry", - rngNormalKind = "Kinderman-Ramage") testData <- suppressWarnings(RUnit::runTestSuite(testSuite)) testData2 <- suppressWarnings(RUnit::runTestSuite(testSuite2)) - testData3 <- suppressWarnings(RUnit::runTestSuite(testSuite3)) file.remove(c("pH_narcotics_Failpeaks.csv","pH_narcotics.RData","pH_narcotics_RA.RData","pH_narcotics_RF.RData")) RUnit::printTextProtocol(testData) RUnit::printTextProtocol(testData2) - RUnit::printTextProtocol(testData3) setwd(oldwd) return(testData) } diff --git a/R/webAccess.R b/R/webAccess.R index 1f9147b574b1366b5f7c86619be151785b99bf8b..204b34504ed6ad2866ed1212ac4efc94b50fcba0 100755 --- a/R/webAccess.R +++ b/R/webAccess.R @@ -41,7 +41,7 @@ getCactus <- function(identifier, representation) ret <- tryCatch( getURLContent(paste( - "http://cactus.nci.nih.gov/chemical/structure/", + "https://cactus.nci.nih.gov/chemical/structure/", URLencode(identifier), "/", representation, sep='')), error = function(e) NA) if(is.na(ret)) diff --git a/inst/RMB_options.ini b/inst/RMB_options.ini index 2b2aa5bb11205ec22bd428fe2f3db4b9c05a57c2..10574b185029cb954cad133964428b7334330bfd 100755 --- a/inst/RMB_options.ini +++ b/inst/RMB_options.ini @@ -13,7 +13,7 @@ deprofile: # Deviation (in minutes) allowed the for retention time rtMargin: 0.4 # Systematic retention time shift -rtShift: -0.3 +rtShift: 0.0 # Directory to OpenBabel. Required for creating molfiles for MassBank export. # If no OpenBabel directory is given, RMassBank will attempt to use the CACTUS webservice @@ -39,7 +39,7 @@ annotations: authors: Nomen Nescio, The Unseen University copyright: Copyright (C) XXX publication: - license: CC BY-SA + license: CC BY instrument: LTQ Orbitrap XL Thermo Scientific instrument_type: LC-ESI-ITFT confidence_comment: standard compound diff --git a/inst/unitTests/runTests.R b/inst/unitTests/runTests.R new file mode 100644 index 0000000000000000000000000000000000000000..03dd13fd56b28370b8f319b9c648e2200f817446 --- /dev/null +++ b/inst/unitTests/runTests.R @@ -0,0 +1,37 @@ +w <- newMsmsWorkspace() +RmbDefaultSettings() +files <- list.files(system.file("spectra", package="RMassBankData"), ".mzML", full.names = TRUE) + +w@files <- files +loadList(system.file("list/NarcoticsDataset.csv", package="RMassBankData")) +w <- msmsWorkflow(w, mode="pH", steps=c(1:4), archivename="pH_narcotics") +w <- msmsWorkflow(w, mode="pH", steps=c(5:8), archivename="pH_narcotics") +mb <- newMbWorkspace(w) +mb <- loadInfolists(mb, system.file("infolists", package="RMassBankData")) +mb <- mbWorkflow(mb) + + +### Error message code structure borrowed from rcppgls: +### https://github.com/eddelbuettel/rcppgsl/blob/master/inst/unitTests/runTests.R + +testElec <- RUnit::defineTestSuite("Electronic noise and formula calculation Test", dirs = system.file("unitTests", + package="RMassBank"), testFileRegexp = "test_El.*") +testAcqu <- RUnit::defineTestSuite("Evaluation of data acquisition process", dirs = system.file("unitTests", + package="RMassBank"), testFileRegexp = "test_mzR.R") +testNope <- RUnit::defineTestSuite("Evaluation of correct handling if no peaks are found", dirs = system.file("unitTests", + package="RMassBank"), testFileRegexp = "test_NOPEAKS.R") + +errEval <- function(testSuite){ + Res <- RUnit::runTestSuite(testSuite) + err <- getErrors(Res) + if((err$nFail + err$nErr) > 0){ + stop(sprintf('unit test problems: %d failures, %d errors in testfunction: "%s"', err$nFail, err$nErr, testSuite$name) ) + } else{ + success <- err$nTestFunc - err$nFail - err$nErr - err$nDeactivated + cat( sprintf( "%d / %d\n", success, err$nTestFunc ) ) + } +} + +errEval(testElec) +errEval(testAcqu) +errEval(testNope) \ No newline at end of file diff --git a/inst/unitTests/runit.NOPEAKS.R b/inst/unitTests/runit.NOPEAKS.R deleted file mode 100644 index 8c9b014980ff92bd9b3478041fb300c804fc98d3..0000000000000000000000000000000000000000 --- a/inst/unitTests/runit.NOPEAKS.R +++ /dev/null @@ -1,9 +0,0 @@ -test.nopeaks <- function(){ - w <- newMsmsWorkspace() - w@aggregated <- data.frame(mzFound = numeric(0), intensity = numeric(0), good = logical(0), mzCalc = numeric(0), formula = character(0), dbe = numeric(0), - formulaCount = integer(0), dppm = numeric(0), dppmBest = numeric(0), scan = integer(0), cpdID = character(0), parentScan = integer(0), dppmRc = numeric(0), - index = integer(0), noise = logical(0), reanalyzed.formula = character(0), reanalyzed.mzCalc = numeric(0), reanalyzed.dppm = numeric(0),reanalyzed.formulaCount = numeric(0), - reanalyzed.dbe = numeric(0),matchedReanalysis = logical(0)) - filterMultiplicity(w, mode="pH") - RUnit::checkTrue(TRUE) -} \ No newline at end of file diff --git a/inst/unitTests/runit.EN_FC.R b/inst/unitTests/test_ElecNoise_FormulaCalc.R similarity index 97% rename from inst/unitTests/runit.EN_FC.R rename to inst/unitTests/test_ElecNoise_FormulaCalc.R index fec0c3f8064523a10d6c927621f9d6f8955837e0..4f08fa210671e201c40391115d423041bf9eeb53 100644 --- a/inst/unitTests/runit.EN_FC.R +++ b/inst/unitTests/test_ElecNoise_FormulaCalc.R @@ -5,6 +5,5 @@ test.eletronicnoise <- function(){ test.formulacalculation <- function(){ failpeaks <- read.csv("pH_narcotics_Failpeaks.csv") - RUnit::checkTrue(!any(apply(failpeaks,1,function(x) all(x[2:5] == c(70,2758,321,56.04933))))) } \ No newline at end of file diff --git a/inst/unitTests/test_NOPEAKS.R b/inst/unitTests/test_NOPEAKS.R new file mode 100644 index 0000000000000000000000000000000000000000..1f5ec72d6f27d0fcd4b2d22f150dacb10ff8a11a --- /dev/null +++ b/inst/unitTests/test_NOPEAKS.R @@ -0,0 +1,9 @@ +test.nopeaks <- function(){ + w <- newMsmsWorkspace() + w@aggregated <- data.frame(mzFound = numeric(0), intensity = numeric(0), good = logical(0), mzCalc = numeric(0), formula = character(0), dbe = numeric(0), + formulaCount = integer(0), dppm = numeric(0), dppmBest = numeric(0), scan = integer(0), cpdID = character(0), parentScan = integer(0), dppmRc = numeric(0), + index = integer(0), noise = logical(0), reanalyzed.formula = character(0), reanalyzed.mzCalc = numeric(0), reanalyzed.dppm = numeric(0),reanalyzed.formulaCount = numeric(0), + reanalyzed.dbe = numeric(0),matchedReanalysis = logical(0)) + filterMultiplicity(w, mode="pH") + checkTrue(TRUE) +} \ No newline at end of file diff --git a/inst/unitTests/runit.DA.R b/inst/unitTests/test_mzR.R similarity index 97% rename from inst/unitTests/runit.DA.R rename to inst/unitTests/test_mzR.R index 4d25af6a1b29fdbdc9103985ad9938ccddac708e..9fa790a8daa7c63e5780ed921cf729c769758517 100644 --- a/inst/unitTests/runit.DA.R +++ b/inst/unitTests/test_mzR.R @@ -2,7 +2,6 @@ test.mzRRead <- function(){ allOK <- TRUE records <- list.files("XX/recdata",full.names=TRUE) rightrecords <- list.files(system.file("records/XX/recdata", package="RMassBankData"), full.names=TRUE) - for(i in 1:length(records)){ gen <- file(description = records[i], open = "r", blocking = TRUE, encoding = getOption("encoding"), raw = FALSE) diff --git a/man/checkIsotopes.Rd b/man/checkIsotopes.Rd index 3e6bc8f9c71bfcec749f3db90052377d70301aad..e88e38366be042c7b6cfd738a6ea80220981962c 100644 --- a/man/checkIsotopes.Rd +++ b/man/checkIsotopes.Rd @@ -4,8 +4,8 @@ \alias{checkIsotopes} \title{Checks for isotopes in a \code{msmsWorkspace}} \usage{ -checkIsotopes(w, mode = "pH", intensity_cutoff = 1000, - intensity_precision = "low", conflict = "dppm", isolationWindow = 4, +checkIsotopes(w, mode = "pH", intensity_cutoff = 0, + intensity_precision = "none", conflict = "strict", isolationWindow = 2, evalMode = "complete", plotSpectrum = TRUE, settings = getOption("RMassBank")) } @@ -15,7 +15,8 @@ checkIsotopes(w, mode = "pH", intensity_cutoff = 1000, \item{mode}{\code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-).} -\item{intensity_cutoff}{The cutoff (as an absolute intensity value) under which isotopic peaks shouldn't be checked for or accepted as valid.} +\item{intensity_cutoff}{The cutoff (as an absolute intensity value) under which isotopic peaks shouldn't be checked for or accepted as valid. +Please note: The cutoff is not hard in the sense that it interacts with the intensity_precision parameter.} \item{intensity_precision}{The difference that is accepted between the calculated and observed intensity of a possible isotopic peak. Further details down below.} diff --git a/man/compileRecord.Rd b/man/compileRecord.Rd index d954f74eecee566834b6ebc79277d2bf4ec4567a..c80c137526d8e2b7ebf56542a2a17de2c6571ca7 100755 --- a/man/compileRecord.Rd +++ b/man/compileRecord.Rd @@ -4,7 +4,7 @@ \alias{compileRecord} \title{Compile MassBank records} \usage{ -compileRecord(spec, mbdata, aggregated, additionalPeaks = NULL) +compileRecord(spec, mbdata, aggregated, additionalPeaks = NULL, retrieval="standard") } \arguments{ \item{spec}{A \code{RmbSpectraSet} for a compound, after analysis (\code{\link{analyzeMsMs}}). @@ -20,6 +20,10 @@ usage information.)} \item{additionalPeaks}{If present, a table with additional peaks to add into the spectra. As loaded with \code{\link{addPeaks}}.} + +\item{retrieval}{A value that determines whether the files should be handled either as "standard", +if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +if the only know thing is the m/z} } \value{ Returns a MassBank record in list format: e.g. diff --git a/man/findMass.Rd b/man/findMass.Rd index 79cd02c0419330f06b0b7f21a21f03ffed48f43c..01236caee5ba723e4f12eb70179fd60f6461f936 100755 --- a/man/findMass.Rd +++ b/man/findMass.Rd @@ -4,11 +4,19 @@ \alias{findMass} \title{Calculate exact mass} \usage{ -findMass(cpdID_or_smiles) +findMass(cpdID_or_smiles, retrieval = "standard", mode = "pH") } \arguments{ \item{cpdID_or_smiles}{SMILES code or compound ID of the molecule. (Numerics are treated as compound ID).} + +\item{retrieval}{A value that determines whether the files should be handled either as "standard", +if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +if the only know thing is the m/z} + +\item{mode}{\code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions + ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +Only needed for retrieval="unknown"} } \value{ Returns the exact mass of the uncharged molecule. diff --git a/man/findMsMsHR.Rd b/man/findMsMsHR.Rd index e8c05df2e9e893bd750225f58746dc7bb7ed1328..03b34146884f6df8c44471d460880d8c57c08c53 100755 --- a/man/findMsMsHR.Rd +++ b/man/findMsMsHR.Rd @@ -12,7 +12,7 @@ findMsMsHR(fileName = NULL, msRaw = NULL, cpdID, mode = "pH", fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan, rtMargin = getOption("RMassBank")$rtMargin, deprofile = getOption("RMassBank")$deprofile, headerCache = NULL, - peaksCache = NULL) + peaksCache = NULL, retrieval = "standard") findMsMsHR.mass(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA, headerCache = NULL, fillPrecursorScan = FALSE, @@ -60,6 +60,10 @@ freshly from \code{msRaw} for every compound.} \item{peaksCache}{If present, the complete output of \code{mzR::peaks(msRaw)}. This speeds up the lookup if multiple compounds should be searched in the same file.} +\item{retrieval}{A value that determines whether the files should be handled either as "standard", +if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +if the only know thing is the m/z} + \item{mz}{The mass to use for spectrum search.} \item{limit.coarse}{Parameter in \code{findMsMsHR.mass} corresponding to \code{mzCoarse}. diff --git a/man/findMz.Rd b/man/findMz.Rd index 8e971d5ee7127715fde147b36901ba936156d9da..3c0e675ae1b6e7cb5018d64535784b1f58ecc3ca 100755 --- a/man/findMz.Rd +++ b/man/findMz.Rd @@ -3,23 +3,26 @@ \name{findMz} \alias{findCAS} \alias{findFormula} +\alias{findLevel} \alias{findMz} \alias{findName} \alias{findRt} \alias{findSmiles} \title{Find compound information} \usage{ -findMz(cpdID, mode = "pH", ppm = 10, deltaMz = 0) +findMz(cpdID, mode = "pH", ppm = 10, deltaMz = 0, retrieval="standard") findRt(cpdID) findSmiles(cpdID) -findFormula(cpdID) +findFormula(cpdID, retrieval="standard") findCAS(cpdID) findName(cpdID) + +findLevel(cpdID, compact=FALSE) } \arguments{ \item{cpdID}{The compound ID in the compound list.} @@ -37,6 +40,13 @@ molecular mass + and - 10 ppm).} \item{deltaMz}{Specifies additional m/z window to add to the range (deltaMz = 0.02 will return the range of the molecular mass +- 0.02 (and additionally +- the set ppm value).} + +\item{retrieval}{A value that determines whether the files should be handled either as "standard", +if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +if the only know thing is the m/z} + +\item{compact}{Only for \code{findLevel}, returns the "retrieval" parameter used for many functions +within RMassBank if TRUE} } \value{ \code{findMz} will return a \code{list(mzCenter=, mzMin=, mzMax=)} diff --git a/man/gatherCompound.Rd b/man/gatherCompound.Rd index fc91507446dfe7977a329b0588b94996b654bd63..91631d0a8ceb5fa6e1eedc810382e52a89781d53 100755 --- a/man/gatherCompound.Rd +++ b/man/gatherCompound.Rd @@ -5,10 +5,10 @@ \alias{gatherSpectrum} \title{Compose data block of MassBank record} \usage{ -gatherCompound(spec, aggregated, additionalPeaks = NULL) +gatherCompound(spec, aggregated, additionalPeaks = NULL, retrieval="standard") gatherSpectrum(spec, msmsdata, ac_ms, ac_lc, aggregated, - additionalPeaks = NULL) + additionalPeaks = NULL, retrieval="standard") } \arguments{ \item{spec}{A \code{RmbSpectraSet} object, representing a compound with multiple spectra.} @@ -18,6 +18,10 @@ gatherCompound(spec, aggregated, additionalPeaks = NULL) \item{additionalPeaks}{If present, a table with additional peaks to add into the spectra. As loaded with \code{\link{addPeaks}}.} +\item{retrieval}{A value that determines whether the files should be handled either as "standard", +if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +if the only know thing is the m/z} + \item{msmsdata}{A \code{RmbSpectrum2} object from the \code{spec} spectra set, representing a single spectrum to give a record.} \item{ac_ms, ac_lc}{Information for the AC\$MASS_SPECTROMETRY and diff --git a/man/gatherDataUnknown.Rd b/man/gatherDataUnknown.Rd new file mode 100644 index 0000000000000000000000000000000000000000..cef099cc9ffa05ae8808f3b62253863975aa70b2 --- /dev/null +++ b/man/gatherDataUnknown.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/createMassBank.R +\name{gatherDataUnknown} +\alias{gatherDataUnknown} +\title{Retrieve annotation data} +\usage{ +gatherDataUnknown(id, mode, retrieval) +} +\arguments{ +\item{id}{The compound ID.} + +\item{mode}{\code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-).} + +\item{retrieval}{A value that determines whether the files should be handled either as "standard", +if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +if the only know thing is the m/z} +} +\value{ +Returns a list of type \code{list(id= \var{compoundID}, ..., +'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... +} +\description{ +Retrieves annotation data for an unknown compound by using basic information present +} +\details{ +Composes the "upper part" of a MassBank record filled with chemical data +about the compound: name, exact mass, structure, CAS no.. +The instrument type is also written into this block (even +if not strictly part of the chemical information). Additionally, index +fields are added at the start of the record, which will be removed later: +\code{id, dbcas, dbname} from the compound list. + +Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are +inserted empty and will be filled later on. + +This function is used to generate the data in case a substance is unknown, +i.e. not enough information is present to derive anything about formulas or links +} +\examples{ + +# Gather data for compound ID 131 +\dontrun{gatherDataUnknown(131,"pH")} + +} +\author{ +Michael Stravs, Erik Mueller +} +\references{ +MassBank record format: +\url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +} +\seealso{ +\code{\link{mbWorkflow}} +} + diff --git a/man/mbWorkflow.Rd b/man/mbWorkflow.Rd index 2e75e80b6bf03a609048b99ec61ad107f222b1c0..b2e1421bbfa39893743ee361b431ca2f88113cba 100755 --- a/man/mbWorkflow.Rd +++ b/man/mbWorkflow.Rd @@ -17,7 +17,7 @@ which should then be manually inspected.} \item{gatherData}{A variable denoting whether to retrieve information using several online databases \code{gatherData= "online"} or to use the local babel installation \code{gatherData= "babel"}. Note that babel is used either way, if a directory is given -in the settings} +in the settings. This setting will be ignored if retrieval is set to "standard"} } \value{ The processed \code{mbWorkspace}. diff --git a/man/processProblematicPeaks.Rd b/man/processProblematicPeaks.Rd index 7326cde3c530c29662709ec5a0024e8e99963904..99b0f82f9fff74ee7fb3470572ef53481c7ad00b 100644 --- a/man/processProblematicPeaks.Rd +++ b/man/processProblematicPeaks.Rd @@ -11,7 +11,9 @@ processProblematicPeaks(w, mode, archivename = NA) \item{mode}{Processing mode (pH etc)} -\item{archivename}{Base name of the archive to write to (for "abc" the exported failpeaks list will be "abc_Failpeaks.csv").} +\item{archivename}{Base name of the archive to write to (for "abc" the exported failpeaks list will be "abc_Failpeaks.csv"). +if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +if the only know thing is the m/z} } \value{ Returns the aggregate data.frame with added column "\code{problematic}" (logical) which marks peaks which match the problematic criteria diff --git a/tests/doRUnit.R b/tests/doRUnit.R new file mode 100644 index 0000000000000000000000000000000000000000..d481b74b95480dd931e0a56c3e7a0d548bf44993 --- /dev/null +++ b/tests/doRUnit.R @@ -0,0 +1,24 @@ +#### doRUnit.R --- Run RUnit tests +####------------------------------------------------------------------------ + +### Structure borrowed from rcppgls: +### https://github.com/eddelbuettel/rcppgsl/blob/master/tests/doRUnit.R + +if(require("RUnit", quietly = TRUE)) { + if(require("RMassBankData", quietly = TRUE) && !(compareVersion(installed.packages()["RMassBankData","Version"],"1.99.0") == -1)) { + pkg <- "RMassBank" + print("Starting tests") + require(pkg, character.only=TRUE) + + path <- system.file("unitTests", package = pkg) + + stopifnot(file.exists(path), file.info(path.expand(path))$isdir) + + source(file.path(path, "runTests.R"), echo = TRUE) + } else { + ## Taking this message out until the new RMassBankData is on bioc, just to avoid confusion. + # message("Package RMassBankData with version > 1.99 not available, cannot run unit tests") + } +} else { + message("Package RUnit not available, cannot run unit tests") +}