From f1f70dda9884f5e5bbc16206f5bcdc36be3ed4bc Mon Sep 17 00:00:00 2001 From: Emma Schymanski <massbank@eawag.ch> Date: Fri, 8 Mar 2013 10:48:57 +0000 Subject: [PATCH] Added option to switch multiplicity filtering on/off; more informative progress bars git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/RMassBank@74064 bc3139a8-67e5-0310-9ffc-ced21a209358 --- DESCRIPTION | 2 +- R/createMassBank.R | 17 +++++++++-------- R/leMsMs.r | 39 ++++++++++++++++++++++++++------------- R/settings_example.R | 7 ++++++- inst/RMB_options.ini | 5 +++++ 5 files changed, 47 insertions(+), 23 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 61228a4..63a11f8 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 1.1.0 +Version: 1.1.1 Author: Michael Stravs, Emma Schymanski Maintainer: Michael Stravs, Emma Schymanski <massbank@eawag.ch> Description: Workflow to process tandem MS files and build MassBank records. diff --git a/R/createMassBank.R b/R/createMassBank.R index bb85c1b..44f8e95 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -1,5 +1,6 @@ # Script for writing MassBank files +#testtest change #' Load MassBank compound information lists #' #' Loads MassBank compound information lists (i.e. the lists which were created @@ -164,7 +165,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c { mbdata_ids <- lapply(mb@aggregatedRcSpecs$specFound, function(spec) spec$id) - message("mbWorkflow: Step 1") + message("mbWorkflow: Step 1. Gather info from CTS") # Which IDs are not in mbdata_archive yet? new_ids <- setdiff(as.numeric(unlist(mbdata_ids)), mb@mbdata_archive$id) @@ -181,7 +182,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c # Otherwise, continue! if(2 %in% steps) { - message("mbWorkflow: Step 2") + message("mbWorkflow: Step 2. Export infolist (if required)") if(length(mb@mbdata)>0) { mbdata_mat <- flatten(mb@mbdata) @@ -196,7 +197,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c # Step 3: Take the archive data (in table format) and reformat it to MassBank tree format. if(3 %in% steps) { - message("mbWorkflow: Step 3") + message("mbWorkflow: Step 3. Data reformatting") mb@mbdata_relisted <- apply(mb@mbdata_archive, 1, readMbdata) } # Step 4: Compile the spectra! Using the skeletons from the archive data, create @@ -204,7 +205,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c # Also, assign accession numbers based on scan mode and relative scan no. if(4 %in% steps) { - message("mbWorkflow: Step 4") + message("mbWorkflow: Step 4. Spectra compilation") mb@compiled <- mapply( function(r, refiltered) { message(paste("Compiling: ", r$name, sep="")) @@ -228,7 +229,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c # flat-text string arrays (basically, into text-file style, but still in memory) if(5 %in% steps) { - message("mbWorkflow: Step 5") + message("mbWorkflow: Step 5. Flattening records") mb@mbfiles <- lapply(mb@compiled_ok, function(c) lapply(c, toMassbank)) } # Step 6: For all OK records, generate a corresponding molfile with the structure @@ -236,14 +237,14 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c # is still in memory only, not yet a physical file) if(6 %in% steps) { - message("mbWorkflow: Step 6") + message("mbWorkflow: Step 6. Generate molfiles") mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(c[[1]][["CH$SMILES"]])) } # Step 7: If necessary, generate the appropriate subdirectories, and actually write # the files to disk. if(7 %in% steps) { - message("mbWorkflow: Step 7") + message("mbWorkflow: Step 7. Generate subdirs and export") dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "moldata", sep='/'),recursive=TRUE) dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "recdata", sep='/'),recursive=TRUE) for(cnt in 1:length(mb@compiled_ok)) @@ -253,7 +254,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c # to attribute substances to their corresponding structure molfiles. if(8 %in% steps) { - message("mbWorkflow: Step 8") + message("mbWorkflow: Step 8. Create list.tsv") makeMollist(mb@compiled_ok) } return(mb) diff --git a/R/leMsMs.r b/R/leMsMs.r index ae518ab..b482618 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -69,7 +69,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca if(1 %in% steps) { nProg <- 0 - message("msmsWorkflow: Step 1") + message("msmsWorkflow: Step 1. Acquire all MSMS spectra from files") pb <- txtProgressBar(0,nLen,0, style=3, file=stderr()) w@specs <- lapply(w@files, function(fileName) { @@ -97,7 +97,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca if(2 %in% steps) { nProg <- 0 - message("msmsWorkflow: Step 2") + message("msmsWorkflow: Step 2. First analysis pre recalibration") pb <- txtProgressBar(0,nLen,0, style=3, file=stderr()) w@analyzedSpecs <- lapply(w@specs, function(spec) { #print(spec$id) @@ -115,13 +115,13 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca # Step 3: aggregate all spectra if(3 %in% steps) { - message("msmsWorkflow: Step 3") + message("msmsWorkflow: Step 3. Aggregate all spectra") w@aggregatedSpecs <- aggregateSpectra(w@analyzedSpecs) } # Step 4: recalibrate all m/z values in raw spectra if(4 %in% steps) { - message("msmsWorkflow: Step 4") + message("msmsWorkflow: Step 4. Recalibrate m/z values in raw spectra") if(newRecalibration) { recal <- makeRecalibration(w@aggregatedSpecs, mode) @@ -134,7 +134,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca if(5 %in% steps) { nProg <- 0 - message("msmsWorkflow: Step 5") + message("msmsWorkflow: Step 5. Reanalyze recalibrated spectra") pb <- txtProgressBar(0,nLen,0, style=3, file=stderr()) w@analyzedRcSpecs <- lapply(w@recalibratedSpecs, function(spec) { s <- analyzeMsMs(spec, mode=mode, detail=TRUE, run="recalibrated", cut=0, cut_ratio=0 ) @@ -152,7 +152,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca # Step 6: aggregate recalibrated results if(6 %in% steps) { - message("msmsWorkflow: Step 6") + message("msmsWorkflow: Step 6. Aggregate recalibrated results") w@aggregatedRcSpecs <- aggregateSpectra(w@analyzedRcSpecs, addIncomplete=TRUE) if(!is.na(archivename)) archiveResults(w, paste(archivename, ".RData", sep='')) @@ -162,7 +162,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca # Step 7: reanalyze failpeaks for (mono)oxidation and N2 adduct peaks if(7 %in% steps) { - message("msmsWorkflow: Step 7") + message("msmsWorkflow: Step 7. Reanalyze fail peaks for N2 + O") w@reanalyzedRcSpecs <- reanalyzeFailpeaks( w@aggregatedRcSpecs, custom_additions="N2O", mode=mode) if(!is.na(archivename)) @@ -172,7 +172,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8),confirmMode = FALSE, newReca # creation of failpeak list if(8 %in% steps) { - message("msmsWorkflow: Step 8") + message("msmsWorkflow: Step 8. Peak multiplicity filtering") # apply heuristic filter w@refilteredRcSpecs <- filterMultiplicity( w@reanalyzedRcSpecs, archivename, mode) @@ -1428,6 +1428,13 @@ filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE) #' @export filterMultiplicity <- function(specs, archivename=NA, mode="pH", recalcBest = TRUE) { + # Read multiplicity filter setting + # For backwards compatibility: If the option is not set, define as 2 + # (which was the behaviour before we introduced the option) + multiplicityFilter <- getOption("RMassBank")$multiplicityFilter + if(is.null(multiplicityFilter)) + multiplicityFilter <- 2 + peaksFiltered <- filterPeaksMultiplicity(specs$peaksMatched, "formula", recalcBest) peaksFilteredReanalysis <- @@ -1437,18 +1444,24 @@ filterMultiplicity <- function(specs, archivename=NA, mode="pH", recalcBest = TR peaksNoformula$formulaMultiplicity <- 0 peaksNoformula$fM_factor <- as.factor(0) - + # Reorder the columns of peaksNoformula such that they match the columns + # of peaksFilteredReanalysis; such that rbind gives an identical result + # when peaksFilteredReanalysis is empty. (Otherwise peaksFilterReanalysis + # would be dropped as 0x0, and rbind's output column order would be the one originally + # in peaksNoformula. See ?cbind) + peaksNoformula <- peaksNoformula[,colnames(peaksFilteredReanalysis)] + # export the peaks which drop through reanalysis or filter criteria fp_rean <- problematicPeaks( rbind( peaksFilteredReanalysis[ - peaksFilteredReanalysis$formulaMultiplicity < 2,], + peaksFilteredReanalysis$formulaMultiplicity < multiplicityFilter,], peaksNoformula), specs$peaksMatched, mode) fp_mult <- problematicPeaks( peaksFiltered[ - peaksFiltered$formulaMultiplicity < 2,], + peaksFiltered$formulaMultiplicity < multiplicityFilter,], specs$peaksMatched, mode) fp_mult$good <- NULL @@ -1479,9 +1492,9 @@ filterMultiplicity <- function(specs, archivename=NA, mode="pH", recalcBest = TR "formulaCount", "parentScan", "aMax", "mzCenter")] peaksOK <- peaksFiltered[ - peaksFiltered$formulaMultiplicity > 1,] + peaksFiltered$formulaMultiplicity > (multiplicityFilter - 1),] peaksReanOK <- peaksFilteredReanalysis[ - (peaksFilteredReanalysis$formulaMultiplicity > 1) & + (peaksFilteredReanalysis$formulaMultiplicity > (multiplicityFilter - 1)) & !is.na(peaksFilteredReanalysis$reanalyzed.formula),] # Kick the M+H+ satellites out of peaksReanOK: peaksReanOK$mzCenter <- as.numeric( diff --git a/R/settings_example.R b/R/settings_example.R index 49c51b2..a709439 100755 --- a/R/settings_example.R +++ b/R/settings_example.R @@ -182,7 +182,12 @@ NULL # The settings define which recal function is used recalibrator = list( MS1 = "recalibrate.loess", - MS2 = "recalibrate.loess") + MS2 = "recalibrate.loess"), + # Define the multiplicity filtering level + # Default is 2 (peak occurs at least twice) + # Set this to 1 if you want to turn this option off. + # Set this to anything > 2 if you want harder filtering + multiplicityFilter = 2 ) # Writes a file with sample settings which the user can adjust with his values. diff --git a/inst/RMB_options.ini b/inst/RMB_options.ini index a29f6e3..e19f638 100755 --- a/inst/RMB_options.ini +++ b/inst/RMB_options.ini @@ -168,3 +168,8 @@ recalibrator: MS1: recalibrate.loess MS2: recalibrate.loess +# Define the multiplicity filtering level +# Default is 2 (peak occurs at least twice) +# Set this to 1 if you want to turn this option off. +# Set this to anything > 2 if you want harder filtering +multiplicityFilter: 2 -- GitLab